### Outline
#### Goal: train a spaCy NER model on LitBank data 
✅ Load annotation data from LitBank  
✅ Create train and validation sets  
✅ Train NER from scratch using only a blank language object  
✅ Assess results for various approaches  
✅ Where do we see improvement? When is the model sufficiently useful in research? 


In [1]:
!pip install spacy sklearn tqdm
!git clone https://github.com/dbamman/litbank.git
import spacy 
print(f'Using spaCy version {spacy.__version__}')

fatal: destination path 'litbank' already exists and is not an empty directory.
Using spaCy version 3.2.1


In [15]:
# Locate the LitBank brat export and collect the raw text files.
from pathlib import Path
entities_path = Path.cwd() / 'litbank' / 'entities' / 'brat'

# iterdir() yields entries in arbitrary order; sort so the document order
# (and therefore any later seeded split) is the same on every machine.
text_files = sorted(f for f in entities_path.iterdir() if f.suffix == '.txt')
assert len(text_files) == 100
print(f'[*] imported {len(text_files)} files')

[*] imported 100 files


In [16]:
# For each text file, create a Doc object and add the annotation data to doc.ents.
# Our output is `docs`, a list with one Doc per text file.
import spacy 
from tqdm.notebook import tqdm
from spacy.tokens import Span, DocBin
from spacy.util import filter_spans


docs = []
skipped_spans = 0  # annotations for which doc.char_span() returned no usable span

# note: not using a pretrained model because it adds predictions, just want LitBank data
nlp = spacy.blank("en")
nlp.add_pipe('sentencizer') # used in training assessment


for text_file in tqdm(text_files):
    # Decode explicitly so the character offsets in the .ann file do not depend
    # on the platform's default encoding.
    doc = nlp(text_file.read_text(encoding='utf-8'))
    annotation_file = (entities_path / (text_file.stem +'.ann'))
    annotations = annotation_file.read_text(encoding='utf-8').split('\n')
    ents = []
    for annotation in annotations:
        # Skip blank lines (e.g. the empty string after a trailing newline) instead of
        # blindly dropping the last element, which loses a real annotation when the
        # file does not end with a newline.
        if not annotation.strip():
            continue
        # the second tab-separated field holds "<label> <start> <end>"
        label, start, end = annotation.split('\t')[1].split()
        span = doc.char_span(int(start), int(end), label=label)
        if not span: # when start and end do not match a valid string, spaCy returns a NoneType span
            skipped_spans += 1
            continue
        ents.append(span)
    
    # filter_spans removes overlapping spans before they are assigned to doc.ents
    filtered = filter_spans(ents)
    doc.ents = filtered
    docs.append(doc)
    

assert len(docs) == 100
print(f'[*] skipped {skipped_spans} annotations that did not align with token boundaries')

  0%|          | 0/100 [00:00<?, ?it/s]

In [17]:
# Split the data into sets for training and validation 
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so the same documents land in each split on every run;
# without it the reported scores are not comparable between runs.
SEED = 0

train_set, validation_set = train_test_split(docs, test_size=0.1, random_state=SEED)
assert len(train_set) + len(validation_set) == len(docs)
print(f'Created {len(train_set)} training docs')
print(f'Created {len(validation_set)} validation docs')

Created 90 training docs
Created 10 validation docs


In [18]:
# Bundle the training Docs into a DocBin and write it to disk
from spacy.tokens import DocBin

# passing the Docs to the constructor adds each of them to the DocBin
train_db = DocBin(docs=train_set)
train_db.to_disk("./train.spacy")

In [19]:
# Bundle the validation Docs into a DocBin and write it to disk
# (the spaCy docs call this "development" data rather than "validation" data,
# hence the dev.spacy file name -- should we change our language to match?)
validation_db = DocBin(docs=validation_set)
validation_db.to_disk("./dev.spacy")

In [20]:
# Sanity check: confirm both serialized corpora were written and compare their sizes
!ls -al train.spacy dev.spacy

-rw-rw-r-- 1 ds ds  166663 Dec 16 13:09 dev.spacy
-rw-rw-r-- 1 ds ds 1404946 Dec 16 13:09 train.spacy


In [22]:
# Generate a training config (./config.cfg) for an English pipeline with a single ner component
!python3 -m spacy init config ./config.cfg --lang en --pipeline ner

[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [23]:
# inspect the new config.cfg file 
!cat cpu_config.cfg
# or %load config.cfg (but cell becomes very long)


[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architec

In [24]:
%time 
!python3 -m spacy train config.cfg --output ./output --paths.train train.spacy --paths.dev dev.spacy

CPU times: user 9 µs, sys: 2 µs, total: 11 µs
Wall time: 26 µs
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2021-12-16 13:09:19,627] [INFO] Set up nlp object from config
[2021-12-16 13:09:19,636] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-12-16 13:09:19,640] [INFO] Created vocabulary
[2021-12-16 13:09:19,640] [INFO] Finished initializing nlp object
[2021-12-16 13:09:24,163] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00   1141.08    0.00    0.00    0.00    0.00
  2     200      11347.67  56263.32   49.85   60.83   42.23    0.50
  4     400      20383.62  30445.49   52.99   57.12   49.42    0.53
  6     600      10084.62  20965.22   49.98   61.37   42.15 

In [34]:
# Run the newly trained pipeline on one randomly chosen validation document
import random
from spacy import displacy 

new_nlp = spacy.load("output/model-last")
val_doc = random.choice(validation_set)

# re-process the raw text so the entities shown are the model's predictions
doc = new_nlp(val_doc.text)

# render only the first 100 tokens to keep the output short
predicted_preview = doc[:100]
displacy.render(predicted_preview, style="ent", jupyter=True)

In [35]:
# For comparison, render the same 100-token slice with the original LitBank annotations
gold_preview = val_doc[:100]
displacy.render(gold_preview, style="ent", jupyter=True)