### Outline
#### Goal is to train spaCy NER from litbank data 
✅ Load annotation data from LitBank  
✅ Create train and validation sets  
✅ Train NER from scratch using only language object  
✅ Assess results for various approaches  
✅ Where do we see improvement? When is the model sufficiently useful in research? 


In [2]:
!pip install spacy sklearn tqdm
!git clone https://github.com/dbamman/litbank.git
import spacy 
print(f'Using spaCy version {spacy.__version__}')

Cloning into 'litbank'...
remote: Enumerating objects: 1179, done.[K
remote: Counting objects: 100% (123/123), done.[K
remote: Compressing objects: 100% (118/118), done.[K
remote: Total 1179 (delta 12), reused 104 (delta 5), pack-reused 1056[K
Receiving objects: 100% (1179/1179), 40.71 MiB | 4.24 MiB/s, done.
Resolving deltas: 100% (129/129), done.
Updating files: 100% (1423/1423), done.
Using spaCy version 3.2.1


In [4]:
from pathlib import Path
entities_path = Path.cwd() / 'litbank' / 'entities' / 'brat'

text_files = [f for f in entities_path.iterdir() if f.suffix == '.txt']
assert len(text_files) == 100
print(f'[*] imported {len(text_files)} files')

[*] imported 100 files


In [5]:
# for each file, create a Doc object and add the annotation data to doc.ents
# our output is a list of Doc objects 
import spacy 
from tqdm.notebook import tqdm
from spacy.tokens import Span, DocBin
from spacy.util import filter_spans


docs = []

#note: not using pretrained model because it adds predictions, just want LitBank data
nlp = spacy.blank("en")
nlp.add_pipe('sentencizer') # used in training assessment


for text_file in tqdm(text_files):
    doc = nlp(text_file.read_text())
    annotation_file = (entities_path / (text_file.stem +'.ann'))
    annotations = annotation_file.read_text().split('\n')
    ents = []
    for annotation in annotations[:-1]:
        label, start, end = annotation.split('\t')[1].split()
        span = doc.char_span(int(start), int(end), label=label)
        if span: # when start and end do not match a valid string, spaCy returns a NoneType span
            ents.append(span)
    
    filtered = filter_spans(ents)
    doc.ents = filtered
    docs.append(doc)
    

assert len(docs) == 100

  0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
# Split the data into sets for training and validation 
from sklearn.model_selection import train_test_split

train_set, validation_set = train_test_split(docs, test_size=0.1)
print(f'Created {len(train_set)} training docs')
print(f'Created {len(validation_set)} validation docs')

Created 90 training docs
Created 10 validation docs


In [7]:
# Add training Docs to DocBin and store to disk
from spacy.tokens import DocBin

# the DocBin will store the training documents
train_db = DocBin()
for doc in train_set:
    train_db.add(doc)
train_db.to_disk("./train.spacy")

In [8]:
# Save the validation Docs to disk 
validation_db = DocBin()
for doc in validation_set:
    validation_db.add(doc)
    
validation_db.to_disk("./dev.spacy") # the spaCy doc refer to development data rather than validation, change our language?

In [9]:
!ls -al train.spacy dev.spacy

-rw-r--r-- 1 root root  166113 Dec 16 19:36 dev.spacy
-rw-r--r-- 1 root root 1406011 Dec 16 19:36 train.spacy


In [10]:
!python3 -m spacy init config ./config.cfg --lang en --pipeline ner

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [11]:
# inspect the new config.cfg file 
!cat cpu_config.cfg
# or %load config.cfg (but cell becomes very long)


cat: cpu_config.cfg: No such file or directory


In [13]:
%time 
!python3 -m spacy train config.cfg --output ./output --paths.train train.spacy --paths.dev dev.spacy

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 31.5 µs
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2021-12-16 20:12:56,934] [INFO] Set up nlp object from config
[2021-12-16 20:12:56,957] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-12-16 20:12:56,959] [INFO] Created vocabulary
[2021-12-16 20:12:56,960] [INFO] Finished initializing nlp object
[2021-12-16 20:13:02,502] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00   1138.38    0.00    0.00    0.00    0.00
^C

Aborted!


In [19]:
# View the predictions of our new model
import random
from spacy import displacy 

new_nlp = spacy.load("output/model-last")
val_doc = random.choice(validation_set)
doc = new_nlp(val_doc.text)

displacy.render(doc[:100], jupyter=True, style="ent")

In [20]:
# Compare against the original LitBank annotations 
displacy.render(val_doc[:100], jupyter=True, style="ent")

In [35]:
#https://spacy.io/api/language#evaluate
from spacy.training import Example

examples =[]
for val_doc in train_set + validation_set:
    new_doc = new_nlp(val_doc.text)
    examples.append(Example(new_doc, val_doc))

In [36]:
scores = nlp.evaluate(examples)
print(scores)
print(len(examples))


{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'sents_p': 1.0, 'sents_r': 1.0, 'sents_f': 1.0, 'speed': 121138.01940589753}
100
