### Outline
#### Goal: train a spaCy NER model on LitBank data

- Load annotation data from LitBank
- Create train and validation sets
- Identify entities in text using the spaCy Matcher (a rule-based baseline: it only matches patterns and does not learn, so note which entities it misses in the validation set)
- Train NER from scratch using only language object
- Train NER from scratch for the small English model (`en_core_web_sm`)
- Fine-tune existing NER pipeline
- Assess results for various approaches 
- Where do we see improvement? When is the model sufficiently useful in research?


In [None]:
!pip install spacy sklearn tqdm
!python3 -m spacy download en_core_web_sm
!git clone https://github.com/dbamman/litbank.git

In [None]:
from pathlib import Path

# Directory of the cloned LitBank repo that holds the brat-format entity data.
entities_path = Path.cwd() / 'litbank' / 'entities' / 'brat'

# iterdir() yields entries in arbitrary order; sorting keeps the document
# order (and therefore anything derived from it, such as a seeded
# train/validation split) identical across runs and machines.
text_files = sorted(f for f in entities_path.iterdir() if f.suffix == '.txt')

# Sanity check: the notebook expects exactly 100 text files in this directory.
assert len(text_files) == 100

In [None]:
# For each text file, create a Doc object and attach the annotations from the
# matching .ann file as doc.ents. The output, `docs`, is a list of Doc objects
# (one per text file, in the order of `text_files`).
import spacy 
from tqdm.notebook import tqdm
from spacy.tokens import Span, DocBin
from spacy.util import filter_spans


docs = []

nlp = spacy.load("en_core_web_sm")

# NOTE: the pretrained pipeline puts its own NER predictions in doc.ents.
# doc.set_ents() is therefore called exactly once for EVERY document below
# (even one with no annotations): with its default settings, tokens outside the
# given spans are marked as non-entities, so only the gold annotations remain.

for text_file in tqdm(text_files):
    # Decode explicitly: the .ann offsets are character offsets into this text,
    # so a platform-dependent default encoding could shift them.
    doc = nlp(text_file.read_text(encoding='utf-8'))
    annotation_file = entities_path / (text_file.stem + '.ann')

    lit_ents = []
    for annotation in annotation_file.read_text(encoding='utf-8').split('\n'):
        # Skip blank lines (e.g. the empty string after a trailing newline)
        # instead of blindly dropping the last line, which would lose an
        # annotation whenever the file does not end with a newline.
        if not annotation.strip():
            continue
        # The second tab-separated field is "<label> <start> <end>".
        label, start, end = annotation.split('\t')[1].split()
        span = doc.char_span(int(start), int(end), label=label)
        # When start and end do not map onto a valid span, spaCy returns None;
        # such annotations are dropped.
        if span:
            lit_ents.append(span)

    # Resolve overlapping spans and set the entities once per document, rather
    # than once per annotation.
    doc.set_ents(filter_spans(lit_ents))
    docs.append(doc)

assert len(docs) == 100

In [None]:
# Split the Docs into sets for training (80%) and validation (20%).
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle reproducible, so re-running this cell
# yields the same split for the same input order and the different training
# approaches are scored against the same validation documents.
train_docs, validation_docs = train_test_split(
    docs, test_size=0.2, random_state=42
)
print(f'Created {len(train_docs)} training docs')
print(f'Created {len(validation_docs)} validation docs')

In [None]:
# Serialize both splits to disk as binary DocBin files.
# NOTE: despite the names, `test_docs` / test_data.spacy hold the TRAINING
# split; `val_docs` / validation_data.spacy hold the validation split.
from spacy.tokens import DocBin

test_docs = DocBin(docs=train_docs)
Path('test_data.spacy').write_bytes(test_docs.to_bytes())

val_docs = DocBin(docs=validation_docs)
Path('validation_data.spacy').write_bytes(val_docs.to_bytes())

In [None]:
# Confirm that both serialized files exist and show their sizes.
!ls -al test_data.spacy validation_data.spacy

In [None]:
# Generate a spaCy training config (./config.cfg) for an English pipeline
# whose only component is `ner`.
!python3 -m spacy init config ./config.cfg --lang en --pipeline ner

In [None]:
# Print the newly generated config.cfg file to inspect its settings.
!cat config.cfg

In [None]:
# Training command (left disabled). --paths.train / --paths.dev point at the
# DocBin files this notebook writes: test_data.spacy holds the training split
# and validation_data.spacy holds the validation split.
#!python3 -m spacy train ./config.cfg --output ./output --paths.train test_data.spacy --paths.dev validation_data.spacy