# Textbook coverage of vocabulary in Hyginus, *Fabulae*

Read the vocabulary lists for the L3 textbook and compute, for each unit, the percentage of all tokens in Hyginus (excluding proper names) that is covered by the vocabulary accumulated through that unit.

# Step 2: display coverage

In [None]:
// Render the coverage table. `survey` is defined in a later cell, so the "Step 1" cells must be run first.
survey

# Step 1: load everything


In [None]:
// Vocabulary lists to survey, keyed by 1-based unit number.
// The position of a URL in the vector determines its unit.
val vocabFiles : Map[Int, String] = Vector(
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/01-nouns-adjs-pron.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/02-verbs.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/03-place-and-time.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/04-verbal-nouns-and-adjectives.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/05-questions.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/06-indirect-statement.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/07-miscellany.cex"
).zipWithIndex.map { case (listUrl, position) => (position + 1) -> listUrl }.toMap

In [None]:
// Set up the notebook so dependency resolution can find the author's Maven repository.
// NOTE(review): this points at dl.bintray.com; JFrog retired Bintray in 2021, so
// confirm that this URL still resolves or replace it with the current host.
val personalRepo = coursierapi.MavenRepository.of("https://dl.bintray.com/neelsmith/maven")
// Append the repository to the interpreter's existing list of resolvers.
interp.repositories() ++= Seq(personalRepo)

In [None]:
// ivy imports
import $ivy.`edu.holycross.shot::latincorpus:7.0.0-pr5`

In [None]:
import edu.holycross.shot.latincorpus._
import scala.io.Source

In [None]:
// Location of the Hyginus corpus data file.
val hyginusUrl = "https://raw.githubusercontent.com/LinguaLatina/analysis/master/data/hyginus/hyginus-latc.cex"
// Build the corpus from that URL; the coverage computations in later cells all read from `hyginus`.
val hyginus = LatinCorpus.fromUrl(hyginusUrl)

In [None]:
// Keep only tokens whose text begins with a lower-case character; this is how
// proper names are excluded from the coverage figures.
val tokens = hyginus.tokens.filter(tkn => tkn.text.head.isLower)

// Denominator for percentages that exclude proper names.
val total = tokens.size

// Denominator for percentages over all lexical tokens.
val gross = hyginus.lexicalTokens.size
// Number of lower-case tokens that have at least one analysis.
val totalAnalyzed = tokens.count(tkn => tkn.analyses.nonEmpty)

// Share of lower-case tokens with an analysis, as a percentage and rounded to one decimal place.
val analysisCoverage = totalAnalyzed.toDouble / total * 100
val analysisPct = BigDecimal(analysisCoverage).setScale(1, BigDecimal.RoundingMode.HALF_UP).toDouble


In [None]:
// Lexeme IDs to omit until the parser is recompiled.
// vocabForUnit removes every ID listed here from the vocabulary it returns.
val tempOmit = List(
  "ls.n49983", 
  "ls.n40071",
  "ls.n25107", 
  "ls.n28700",
  "ls.38383", // NOTE(review): no "n" after "ls." unlike the other entries; confirm this ID
  "ls.n40913",
  "ls.n30584", // -ne
  "ls.n31181", // nonne
  "ls.n31181", // num; NOTE(review): same ID as nonne on the previous line, confirm the ID for num
  "ls.n19471", // genu
  "s.n27977" // some kind of typo
)

/** Collects the lexeme IDs from the vocabulary lists of units 1 through `vocabUnit`.
  *
  * Each list is read from the URL registered for its unit in `vocabFiles`. The
  * first line of every file is skipped (presumably a header row; confirm against
  * the data files). For each remaining non-empty line, the ID is the first
  * `#`-delimited column, cut at its first `:`. IDs listed in `tempOmit` are
  * removed from the result.
  *
  * @param vocabUnit highest unit number to include; lists 1 to `vocabUnit` are read
  * @return the lexeme IDs in unit order, then in file order within each unit
  */
def vocabForUnit(vocabUnit: Int): Vector[String] = {
  val vocab = for (i <- 1 to vocabUnit) yield {
    val source = Source.fromURL(vocabFiles(i))
    // Read all lines eagerly, then release the connection even if reading fails.
    val lines = try source.getLines().toVector finally source.close()
    // drop(1) skips the first line without throwing on an empty file, unlike tail.
    lines.drop(1).filter(_.nonEmpty).map( ln => {
      val columns = ln.split("#")
      val idParts = columns.head.split(":")
      idParts.head
    })
  }
  vocab.toVector.flatten.filterNot(v => tempOmit.contains(v))
}

/** Computes how much of Hyginus is covered by the vocabulary through a unit.
  *
  * For every lexeme ID returned by `vocabForUnit`, the number of entries that
  * `hyginus.passagesForLexeme` yields is counted. The summed count is then
  * expressed as a percentage of `total` and of `gross`, each rounded to one
  * decimal place (half up).
  *
  * @param vocabUnit highest unit number whose vocabulary is included
  * @return a triple of (percentage relative to `total`, number of vocabulary
  *         IDs considered, percentage relative to `gross`)
  */
def unitCoverage(vocabUnit: Int) = {
  val occurrences = vocabForUnit(vocabUnit).map(lexeme => hyginus.passagesForLexeme(lexeme).size)
  val matched = occurrences.sum

  // Percentage of `denominator` represented by the matched count, before rounding.
  def rawPct(denominator: Int): Double = (matched * 1.0 / denominator) * 100
  // Round to a single decimal place.
  def oneDecimal(value: Double): Double =
    BigDecimal(value).setScale(1, BigDecimal.RoundingMode.HALF_UP).toDouble

  val withoutNames = rawPct(total)
  val overall = rawPct(gross)
  (oneDecimal(withoutNames), occurrences.size, oneDecimal(overall))
}


In [None]:
/** Builds an HTML table reporting vocabulary coverage of Hyginus unit by unit.
  *
  * For each unit from 1 to the number of entries in `vocabFiles`, this calls
  * `unitCoverage`, prints a progress line to standard output, and emits one
  * table row. The row shows the percentage excluding proper names, the
  * percentage over all tokens, and the number of vocabulary items.
  *
  * @return the heading and table wrapped in `Html` for display in the notebook
  */
def survey = {
  val rows = for (i <- 1 to vocabFiles.size) yield {
    print(s"Computing coverage of unit ${i} ... ")
    val (pct, vocabSize, grossPct) = unitCoverage(i)
    println(s"${pct}%")
    "<tr>" +
    // The original label repeated the word "vocabulary" after the unit number.
    s"<td>Vocabulary through unit ${i}</td>" +
    s"<td><strong>${pct}%</strong> (without proper names)</td>" +
    s"<td><strong>${grossPct}%</strong> (all)</td>" +
    s"<td><strong>${vocabSize}</strong> vocabulary items</td>" +
    "</tr>"
  }

  val header = "<h2>L3 vocabulary lists: coverage of Hyginus</h2>"
  // Every <th> is closed; the original left the "Pct. (all)" cell unclosed.
  val tableHeader = "<tr><th>Unit</th><th>Pct. excluding proper names</th><th>Pct. (all)</th><th>Vocabulary items</th></tr>"
  Html(header + "<table>"  + tableHeader + rows.mkString("\n") + "</table>")
}