# Tracking works cited in the _Dai Nihon Shi_
We have a selection of chapters from the _Dai Nihon Shi_ that have been manually annotated with named entities, including people (`PER`), locations (`LOC`), and works of art (`WORK_OF_ART`). Using this selection, we want to identify some of the most frequently mentioned works of art (nearly always written works) and track their appearance across the entire _Dai Nihon Shi_.

First, let's find all the works of art (tagged `WORK_OF_ART`) in our annotated data, which is stored in the CoNLL-2002 (`.conll`) format.

In [9]:
from pathlib import Path

import spacy

# Load all annotated data exported from INCEpTION (CoNLL-2002 format) and
# build one spaCy Doc per chapter file.
docs = []
nlp = spacy.blank("lzh")
for conll_file in Path("../assets/kanbun/3_inception_export/").glob("*.conll"):
  # Decode explicitly as UTF-8 rather than relying on the platform's default
  # locale encoding, which can fail or garble CJK text (e.g. on Windows).
  # read_text() also closes the file as soon as its contents are in memory.
  conll_doc = conll_file.read_text(encoding="utf-8").strip()

  # Per-token annotations, accumulated over every sentence in the chapter.
  words = []
  sent_starts = []
  pos_tags = []
  biluo_tags = []

  # Sentences are separated by blank lines; each remaining line is one token.
  for conll_sent in conll_doc.split("\n\n"):
    conll_sent = conll_sent.strip()
    if not conll_sent:
      continue
    lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
    # Transpose token rows into columns: the first column holds the token
    # text and the last column holds the IOB entity tag.
    cols = list(zip(*[line.split() for line in lines]))
    length = len(cols[0])
    words.extend(cols[0])
    # Only the first token of each sentence opens a sentence.
    sent_starts.extend([True] + [False] * (length - 1))
    # Convert IOB to BILUO one sentence at a time.
    biluo_tags.extend(spacy.training.iob_utils.iob_to_biluo(cols[-1]))
    # A middle column is read as the POS tag only when there are more than
    # two columns; otherwise every token gets the placeholder "-".
    pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length)

  # No token is followed by whitespace, so every entry in `spaces` is False.
  doc = spacy.tokens.Doc(
    nlp.vocab,
    words=words,
    spaces=[False] * len(words),
  )
  for i, token in enumerate(doc):
    token.tag_ = pos_tags[i]
    token.is_sent_start = sent_starts[i]

  # The end offset is used as inclusive here, while Span expects an exclusive
  # end, hence the +1.
  tagged_spans = spacy.training.iob_utils.tags_to_entities(biluo_tags)
  doc.ents = [
    spacy.tokens.Span(doc, start=start, end=end + 1, label=label)
    for label, start, end in tagged_spans
  ]
  docs.append(doc)

# Sort every annotated entity from every chapter into one of three buckets
# (people, places, works) according to its label; other labels are ignored.
# NOTE(review): the introduction describes the tags as PER / LOC, while the
# labels matched here are PERSON / GPE -- confirm which set the export uses.
bucket_for_label = {
  "PERSON": "people",
  "GPE": "places",
  "WORK_OF_ART": "works",
}
entities = {"people": [], "places": [], "works": []}
for doc in docs:
  for ent in doc.ents:
    bucket = bucket_for_label.get(ent.label_)
    if bucket is not None:
      entities[bucket].append(ent)

# Preview the text of the first five annotated works.
sample_works = entities["works"][:5]
print([work.text for work in sample_works])

['日本紀略', '三代實錄', '日本起略', '一代要記', '三代實錄']
