# Frequent vocabulary in Hyginus, *Fabulae*, not covered in L3 textbook

First run **Step 1: load everything**

Then set values for the vocabulary unit to include (cumulative vocabulary through this unit) and the percentage of Hyginus to compare against.

E.g.:  setting `vocabUnit` to 4 and `pctOfHyginus` to 80 computes all the vocabulary items needed to cover 80% of the tokens in Hyginus that are **not** covered in units 1-4 of the L3 vocabulary lists, and shows, for each vocabulary item, how many tokens in Hyginus are identified as possibly belonging to it.

**NB**:  lexically ambiguous forms are counted under *all* possible lexemes, e.g., identical forms of the relative and interrogative pronouns are counted under *both*.  As a result, the total number of *possible occurrences* is greater than the total number of tokens in Hyginus.

# Step 2: display missing vocabulary

In [20]:
// Percentage of Hyginus to compare against.
val pctOfHyginus: Int = 80
// Highest textbook vocabulary unit to include (cumulative through this unit).
val vocabUnit: Int = 5
// Last expression of the cell: its value is what the notebook displays.
missingVocab(pct = pctOfHyginus, vocabUnit = vocabUnit)

ID:lemma,Occurrences
ls.n46498:sui,3768
ls.n25029:is,1935
ls.n47174:suus,786
ls.n49975:ut,218
ls.n15868:eo1,208
ls.n21343:idem,184
ls.n30423:nascor,139
ls.n41387:respondeo,138
ls.n4633:autem,136
ls.n42157:sacer,136


[36mpctOfHyginus[39m: [32mInt[39m = [32m80[39m
[36mvocabUnit[39m: [32mInt[39m = [32m5[39m

# Step 1: load everything


In [None]:
// URLs of the textbook vocabulary lists, keyed by 1-based unit number.
// The Vector is in unit order, so position + 1 is the unit number.
val vocabFiles : Map[Int, String] = Vector(
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/01-nouns-adjs-pron.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/02-verbs.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/03-place-and-time.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/04-verbal-nouns-and-adjectives.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/05-questions.cex"
).zipWithIndex.map { case (url, idx) => (idx + 1) -> url }.toMap

In [None]:
// Set up the notebook to find the Maven repository that the `$ivy` imports
// in the next cell are resolved against.
// NOTE(review): this points at a Bintray host (dl.bintray.com); confirm the
// URL still resolves, since dependency resolution fails if it does not.
val personalRepo = coursierapi.MavenRepository.of("https://dl.bintray.com/neelsmith/maven")
// Append the repository to the interpreter's resolver list.
interp.repositories() ++= Seq(personalRepo)

In [None]:
// ivy imports
import $ivy.`edu.holycross.shot::latincorpus:7.0.0-pr5`
import $ivy.`edu.holycross.shot::histoutils:2.3.0`
import $ivy.`edu.holycross.shot::tabulae:7.0.5`

In [None]:
import scala.io.Source
import edu.holycross.shot.latincorpus._
import edu.holycross.shot.histoutils._
import edu.holycross.shot.tabulae._

In [None]:
// Location of the Hyginus corpus file (CEX format, judging by the extension).
val hyginusUrl = "https://raw.githubusercontent.com/LinguaLatina/analysis/master/data/hyginus/hyginus-latc.cex"
// Load the corpus over the network; later cells read `hyginus.tokens`.
val hyginus = LatinCorpus.fromUrl(hyginusUrl)

In [None]:
// Keep only tokens whose text begins with a lower-case character.
// `headOption.exists` (rather than `head`) means a token with empty text is
// simply excluded instead of throwing NoSuchElementException.
val lcCorpus = LatinCorpus(hyginus.tokens.filter(_.text.headOption.exists(_.isLower)))
// Histogram of lexeme identifiers over the filtered corpus.
val lexemesHist = lcCorpus.lexemesHistogram
// Drop entries whose identifier starts with "composites", leaving the rest
// (the `ls.`-prefixed Lewis-Short identifiers seen in the report output).
val lsLexemesHist = Histogram(lexemesHist.frequencies.filterNot(_.item.startsWith("composites")))



In [None]:
/** Collect the lexeme identifiers listed in the textbook vocabulary files
  * for units 1 through `vocabUnit` (cumulative).
  *
  * Each file is read from the URL registered in `vocabFiles`. The first line
  * is skipped as a header, empty lines are ignored, and from every remaining
  * line the text before the first `#`, then before the first `:`, is taken
  * as the identifier.
  *
  * @param vocabUnit Highest unit number to include. A value below 1 yields
  *   an empty Vector; a value with no entry in `vocabFiles` throws
  *   `NoSuchElementException` from the map lookup.
  * @return Identifiers from all included units, in file order, duplicates
  *   retained.
  */
def coverageForUnit(vocabUnit: Int): Vector[String] = {
  (1 to vocabUnit).toVector.flatMap { unit =>
    val source = Source.fromURL(vocabFiles(unit))
    // Materialize the lines before closing: getLines is a lazy iterator over
    // the underlying stream, and the stream must always be released.
    val lines = try source.getLines().toVector finally source.close()
    // drop(1) skips the header row without throwing on an empty file
    // (unlike `.tail`).
    lines.drop(1).filter(_.nonEmpty).map { ln =>
      val columns = ln.split("#")
      val idParts = columns.head.split(":")
      idParts.head
    }
  }
}





In [19]:
// Base URL for links into the Lewis-Short lexicon; an entry identifier is
// appended after the trailing colon to form the full link.
val lewisShortBase = "http://folio2.furman.edu/lewis-short/index.html?urn=urn:cite2:hmt:ls.markdown:"



/** Build an HTML report of frequent Hyginus vocabulary that is not covered
  * by the textbook vocabulary lists.
  *
  * @param pct Percentage handed to `lsLexemesHist.takePercent` to select the
  *   most frequent lexemes; also echoed in the report text.
  * @param vocabUnit Highest vocabulary unit to treat as covered (cumulative,
  *   units 1 through this one, via `coverageForUnit`).
  * @return An `Html` value containing a heading, a summary paragraph, a count
  *   of missing items, and a table of linked labels with occurrence counts.
  */
def missingVocab(pct: Int, vocabUnit: Int) = {
  val vocabList = coverageForUnit(vocabUnit)
  // Set gives constant-time membership tests in the filter below; the
  // original Vector is kept because its size is what the report displays.
  val covered: Set[String] = vocabList.toSet
  val topPct = lsLexemesHist.takePercent(pct)
  val notCovered = topPct.filterNot(freq => covered.contains(freq.item))
  val rows = notCovered.map { freq =>
    val label = LewisShort.label(freq.item)
    // stripPrefix removes the literal "ls." only; replaceFirst("ls.", "")
    // would treat "." as a regex wildcard and could match elsewhere.
    // NOTE(review): the whole label (not just the identifier) is appended to
    // the URN, as in the original — confirm the lexicon site expects that.
    val url = lewisShortBase + label.stripPrefix("ls.")
    val link = "<a href=\"" + url + "\">"  + label + "</a>"
    s"<tr><td>${link}</td><td>${freq.count}</td></tr>"
  }
  val tableHead = "<tr><th>ID:lemma</th><th>Occurrences</th></tr>"
  val header = "<h2>Most frequent vocabulary <em>not</em> covered</h2>"
  // Every segment is joined with a trailing `+`. A missing operator before
  // the final literal previously dropped the last sentence and the closing
  // </p> from the output.
  val para = s"<p>All vocabulary covering the top <strong>${pct}</strong>%" +
    " of tokens in Hyginus but <strong>not</strong> covered in L3 vocabulary " +
    s"lists <strong>through unit ${vocabUnit}</strong> " +
    s"(<strong>${vocabList.size}</strong> vocabulary items).  " +
    "Identifiers are linked to Lewis-Short lexicon.</p>"

  val tableSummary = s"<h3>${rows.size} vocabulary items</h3>"

  // The <h3> summary goes before the table: a heading is not valid as a
  // direct child of <table>.
  Html(header + para + tableSummary + "<table>" + tableHead + rows.mkString("\n") + "</table>")
}






[36mlewisShortBase[39m: [32mString[39m = [32m"http://folio2.furman.edu/lewis-short/index.html?urn=urn:cite2:hmt:ls.markdown:"[39m
defined [32mfunction[39m [36mmissingVocab[39m