# Textbook coverage of vocabulary in Hyginus, *Fabulae*

Read the vocabulary lists for the L3 textbook and, for each unit, compute the percentage of all tokens in Hyginus (excluding proper names) that is covered by the vocabulary accumulated through that unit.

# Step 2: display coverage

In [36]:
survey

Computing coverage of unit 1... 29.3%
Computing coverage of unit 2... 40.8%
Computing coverage of unit 3... 58.2%
Computing coverage of unit 4... 62.9%


Unit,Pct. Hyginus,Vocabulary items
Vocabulary through unit 1 vocabulary,covers 29.3% of tokens in Hyginus,35 vocabulary items
Vocabulary through unit 2 vocabulary,covers 40.8% of tokens in Hyginus,69 vocabulary items
Vocabulary through unit 3 vocabulary,covers 58.2% of tokens in Hyginus,109 vocabulary items
Vocabulary through unit 4 vocabulary,covers 62.9% of tokens in Hyginus,144 vocabulary items


# Step 1: load everything


In [None]:
// Set up the notebook to find the extra Maven repository (presumably the one
// hosting the latincorpus library imported in the next cell).
// NOTE(review): JFrog sunset Bintray in 2021 — confirm this URL still resolves,
// or point it at the library's current host.
val personalRepo = coursierapi.MavenRepository.of("https://dl.bintray.com/neelsmith/maven")
// Append the repository to the interpreter's list of repositories.
interp.repositories() ++= Seq(personalRepo)

In [None]:
// ivy imports
import $ivy.`edu.holycross.shot::latincorpus:7.0.0-pr5`

In [None]:
import edu.holycross.shot.latincorpus._
import scala.io.Source

In [None]:
// URL of the Hyginus corpus file (`hyginus-latc.cex`) in the LinguaLatina analysis repository.
val hyginusUrl = "https://raw.githubusercontent.com/LinguaLatina/analysis/master/data/hyginus/hyginus-latc.cex"
// Build the corpus from that URL; `hyginus` is read below for token counts
// and for per-lexeme passage lookups.
val hyginus = LatinCorpus.fromUrl(hyginusUrl)

In [None]:
// Keep only tokens whose text begins with a lower-case character: a
// capitalization-based way of leaving proper names out of the counts.
// headOption (rather than head) keeps a token with empty text from throwing.
val tokens = hyginus.tokens.filter(_.text.headOption.exists(_.isLower))

// Denominator for every coverage percentage computed in this notebook.
val total = tokens.size
// Number of retained tokens that have at least one analysis.
val totalAnalyzed = tokens.count(_.analyses.nonEmpty)

// Percentage of retained tokens with an analysis. With no tokens the division
// would give NaN, which BigDecimal cannot represent, so report 0.0 instead.
val analysisCoverage = if (total == 0) 0.0 else (totalAnalyzed * 1.0 / total) * 100
// Same percentage rounded to one decimal place.
val analysisPct = BigDecimal(analysisCoverage).setScale(1, BigDecimal.RoundingMode.HALF_UP).toDouble


In [None]:
// Vocabulary list URLs keyed by unit number. The lists are written in unit
// order and numbered from 1, so the keys are always the contiguous range 1..n.
val vocabFiles : Map[Int, String] = Vector(
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/01-nouns-adjs-pron.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/02-verbs.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/03-place-and-time.cex",
  "https://raw.githubusercontent.com/LinguaLatina/textbook/master/vocablists/04-verbal-nouns-and-adjectives.cex"
).zipWithIndex.map { case (url, position) => (position + 1) -> url }.toMap


In [31]:
// Lexeme IDs to leave out of the vocabulary until the parser has been recompiled.
// NOTE(review): "ls.38383" lacks the "n" that every other ID here carries
// ("ls.n…"); confirm against the vocabulary lists whether "ls.n38383" was meant.
val tempOmit: List[String] = List(
  "ls.n49983", "ls.n40071", "ls.n25107",
  "ls.n28700", "ls.38383", "ls.n40913"
)

/** Collects the lexeme IDs of the vocabulary for units 1 through `vocabUnit`.
 *
 * Each unit's list is read from the URL registered for it in `vocabFiles`.
 * The first line of a list is skipped, empty lines are ignored, and the ID of
 * a line is the text before the first ":" within its first "#"-delimited
 * column. IDs that appear in `tempOmit` are removed from the result.
 *
 * @param vocabUnit Highest unit number to include.
 * @return IDs for units 1 to `vocabUnit`, in unit order and then file order.
 */
def vocabForUnit(vocabUnit: Int): Vector[String] = {
  val vocab = for (i <- 1 to vocabUnit) yield {
    val source = Source.fromURL(vocabFiles(i))
    // Read every line before closing, so the underlying stream is always
    // released, even if reading fails part-way through.
    val lines = try source.getLines().toVector finally source.close()
    // drop(1) skips the first line without throwing when the file is empty.
    lines.drop(1).filter(_.nonEmpty).map { ln =>
      val columns = ln.split("#")
      val idParts = columns.head.split(":")
      idParts.head
    }
  }
  vocab.toVector.flatten.filterNot(v => tempOmit.contains(v))
}

/** Computes how much of Hyginus the vocabulary through `vocabUnit` covers.
 *
 * For each lexeme ID returned by `vocabForUnit`, the number of entries in
 * `hyginus.passagesForLexeme` is counted; the sum of those counts is expressed
 * as a percentage of `total`, rounded half-up to one decimal place.
 *
 * @param vocabUnit Highest unit number whose vocabulary is included.
 * @return Pair of (rounded percentage, number of vocabulary IDs considered).
 */
def unitCoverage(vocabUnit: Int) = {
  val hitsPerLexeme =
    for (lexemeId <- vocabForUnit(vocabUnit))
      yield hyginus.passagesForLexeme(lexemeId).size
  val rawPercent = (hitsPerLexeme.sum * 1.0 / total) * 100
  val roundedPercent =
    BigDecimal(rawPercent).setScale(1, BigDecimal.RoundingMode.HALF_UP).toDouble
  (roundedPercent, hitsPerLexeme.size)
}


[36mtempOmit[39m: [32mList[39m[[32mString[39m] = [33mList[39m(
  [32m"ls.n49983"[39m,
  [32m"ls.n40071"[39m,
  [32m"ls.n25107"[39m,
  [32m"ls.n28700"[39m,
  [32m"ls.38383"[39m,
  [32m"ls.n40913"[39m
)
defined [32mfunction[39m [36mvocabForUnit[39m
defined [32mfunction[39m [36munitCoverage[39m

In [35]:
/** Builds an HTML coverage table for units 1 through 4.
 *
 * For each unit a progress line is printed to standard output, then a table
 * row is assembled from the percentage and vocabulary count returned by
 * `unitCoverage`. The heading and table markup are passed to `Html`
 * (presumably the notebook kernel's HTML display helper — confirm).
 */
def survey = {
  val tableRows = (1 to 4).map { unitNumber =>
    print(s"Computing coverage of unit $unitNumber... ")
    val (coverage, itemCount) = unitCoverage(unitNumber)
    println(s"$coverage%")
    val cells = Seq(
      s"<td>Vocabulary through unit $unitNumber vocabulary</td>",
      s"<td>covers <strong>$coverage%</strong> of tokens in Hyginus</td>",
      s"<td><strong>$itemCount</strong> vocabulary items</td>"
    )
    cells.mkString("<tr>", "", "</tr>")
  }

  val title = "<h2>L3 vocabulary lists: coverage of Hyginus</h2>"
  val columnHeadings = "<tr><th>Unit</th><th>Pct. Hyginus</th><th>Vocabulary items</th></tr>"
  val body = tableRows.mkString("\n")
  Html(s"$title<table>$columnHeadings$body</table>")
}

defined [32mfunction[39m [36msurvey[39m