In [None]:
import sys
!{sys.executable} -m pip install spacy



In [None]:
import spacy
import random
import time
import numpy as np
from spacy.util import minibatch, compounding

In [None]:
from os import path, mkdir

# Create the working directories for the downloaded corpus and the saved model
# the first time the notebook runs; existing directories are left alone.
for directory in ("data/", "models/"):
    if not path.isdir(directory):
        mkdir(directory)

In [None]:
!curl https://groups.csail.mit.edu/sls/downloads/movie/engtest.bio -o data/test.txt
!curl https://groups.csail.mit.edu/sls/downloads/movie/engtrain.bio -o data/train.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  246k  100  246k    0     0   404k      0 --:--:-- --:--:-- --:--:--  403k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  989k  100  989k    0     0  2312k      0 --:--:-- --:--:-- --:--:-- 2312k


In [None]:
def load_data_spacy(file_path):
    ''' Converts BIO-tagged data into spaCy's character-offset NER format.

    Input: one token per line as "<tag><TAB><word>", where tag is "O", "B-<LABEL>"
    or "I-<LABEL>"; a line without a tab (normally a blank line) ends the sentence.
    Output: sentence, {entities : [(start, end, label), (start, end, label)]}

    file_path : path of the BIO-formatted text file

    Returns (training_data, unique_labels):
      training_data : list of [sentence, {'entities': [(start, end, label), ...]}],
                      where sentence is the words joined by single spaces and
                      start/end are character offsets into it (end exclusive)
      unique_labels : entity labels in order of first appearance ("O" is not a label)

    Notes:
      * a final sentence that is not followed by a blank line is still returned
      * an "I-" tag with no open entity is treated as the start of a new entity
      * an "I-" tag whose label differs from the open entity relabels that entity
      * sentence breaks with no words collected (e.g. repeated blank lines) are skipped
    '''
    training_data, unique_labels = [], []
    words, entities = [], []
    current_annotation = None  # label of the entity currently being extended, if any
    start = 0                  # character offset where the current entity begins
    end = 0                    # running offset: length of all words so far, each + 1 space

    with open(file_path, 'r') as file:  # closes the file even if parsing raises
        for line in file:
            fields = line.rstrip("\r\n").split("\t")

            # lines with a tab are "tag \t word" tokens
            if len(fields) > 1:
                tag, word = fields[0], fields[1]
                label_type = tag[:1]  # "B" begins an entity, "I" continues one, "O" is outside
                label = tag[2:]       # text after the "B-" / "I-" prefix (empty for "O")
                words.append(word)
                word_start = end      # offset of this word's first character
                end += len(word) + 1  # length of the word + trailing space

                # a non-"I" token closes the open entity, which ended one space before this word
                if label_type != 'I' and current_annotation:
                    entities.append((start, word_start - 1, current_annotation))
                    current_annotation = None
                # "B", or an "I" with nothing to continue, opens an entity at this word
                if label_type == 'B' or (label_type == 'I' and not current_annotation):
                    start = word_start
                if label_type in ('B', 'I'):
                    current_annotation = label

                # "O" tokens carry no label; never record the empty string as one
                if label_type != 'O' and label and label not in unique_labels:
                    unique_labels.append(label)
                continue

            # lines without a tab are breaks between sentences
            if current_annotation:
                entities.append((start, end - 1, current_annotation))
            if words:
                training_data.append([" ".join(words), {'entities' : entities}])
            # reset the counters and temporary lists
            words, entities = [], []
            end = 0
            current_annotation = None

    # flush the last sentence when the file does not end with a blank line
    if words:
        if current_annotation:
            entities.append((start, end - 1, current_annotation))
        training_data.append([" ".join(words), {'entities' : entities}])
    return training_data, unique_labels
           
# Parse the training split into [sentence, {'entities': [...]}] examples plus the list of labels seen
TRAIN_DATA, LABELS = load_data_spacy("data/train.txt")

In [None]:
# Peek at the sentence text of training examples 1 through 9
[sentence for sentence, _ in TRAIN_DATA[1:10]]

['show me films with drew barrymore from the 1980s',
 'what movies starred both al pacino and robert deniro',
 'find me all of the movies that starred harold ramis and bill murray',
 'find me a movie with a quote about baseball in it',
 'what movies have mississippi in the title',
 'show me science fiction films directed by steven spielberg',
 'do you have any thrillers directed by sofia coppola',
 'what leonard cohen songs have been used in a movie',
 'show me films elvis films set in hawaii']

In [None]:
# Peek at the entity annotations of the same training examples (1 through 9)
[annotations for _, annotations in TRAIN_DATA[1:10]]

[{'entities': [(19, 33, 'ACTOR'), (43, 48, 'YEAR')]},
 {'entities': [(25, 34, 'ACTOR'), (39, 52, 'ACTOR')]},
 {'entities': [(39, 51, 'ACTOR'), (56, 67, 'ACTOR')]},
 {'entities': []},
 {'entities': [(17, 28, 'TITLE')]},
 {'entities': [(8, 29, 'GENRE'), (42, 58, 'DIRECTOR')]},
 {'entities': [(16, 25, 'GENRE'), (38, 51, 'DIRECTOR')]},
 {'entities': [(5, 24, 'SONG')]},
 {'entities': [(14, 19, 'ACTOR'), (26, 39, 'PLOT')]}]

In [None]:
# Download spaCy's pretrained English model with the kernel's own interpreter.
# Per the command output, the "en" shortcut installs en_core_web_sm and links it
# so that spacy.load('en') works.
!{sys.executable} -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
from spacy import displacy
import warnings

# Baseline: run spaCy's pretrained English pipeline on the first 15 test sentences
# and render the entities it finds.
# catch_warnings() silences warnings only inside this block and restores the previous
# filter configuration on exit (even on error), instead of leaving a catch-all
# "default" filter installed for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    nlp = spacy.load('en')
    TEST_DATA, _ = load_data_spacy("data/test.txt")

    test_sentences = [x[0] for x in TEST_DATA[0:15]] # extract the sentences from [sentence, entity]
    for x in test_sentences:
        doc = nlp(x)
        displacy.render(doc, jupyter = True, style = "ent")

In [None]:
# A simple decorator to log function processing time
def timer(method):
    ''' Wrap `method` so that each call prints how long it took.

    method : the callable to time

    Returns a wrapper that forwards all positional and keyword arguments to
    `method`, prints "Completed in N seconds" (N truncated to whole seconds)
    after it returns, and hands back its result unchanged. Nothing is printed
    if `method` raises. The wrapper keeps `method`'s name and docstring.
    '''
    from functools import wraps  # local import keeps this cell self-contained

    @wraps(method)  # preserve __name__ / __doc__ of the decorated function
    def timed(*args, **kw):
        # perf_counter is monotonic, so the interval is immune to wall-clock adjustments
        ts = time.perf_counter()
        result = method(*args, **kw)
        te = time.perf_counter()
        print("Completed in {} seconds".format(int(te - ts)))
        return result
    return timed

In [None]:
# Data must be of the form (sentence, {entities: [(start, end, label)]})
@timer
def train_spacy(train_data, labels, iterations, dropout = 0.2, display_freq = 1):
    ''' Train a spacy NER model, which can be queried against with test data

    Starts from a blank English pipeline (spaCy v2 API) containing only an NER
    component, so no pretrained weights are used.

    train_data : training data in the format of (sentence, {entities: [(start, end, label)]});
                 the caller's list is not modified
    labels : a list of unique annotations
    iterations : number of training iterations (passes over train_data)
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
                   (0 or None disables logging)

    Returns the trained pipeline object created by spacy.blank('en').
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        # keep `ner` bound even if the pipeline already provides the component
        ner = nlp.get_pipe('ner')

    # Add entity labels to the NER pipeline
    for label in labels:
        ner.add_label(label)

    # Shuffle a shallow copy so the caller's list keeps its original order
    train_data = list(train_data)

    # Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(train_data) # shuffle the training data before each iteration
            losses = {}  # reset every iteration; nlp.update fills it in
            # batch sizes come from spaCy's compounding(4., 32., 1.001) schedule
            batches = minibatch(train_data, size = compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop = dropout,
                    sgd = optimizer,
                    losses = losses)
            # guard against display_freq == 0, which would otherwise raise after a full epoch
            if display_freq and itr % display_freq == 0:
                print("Iteration {} Loss: {}".format(itr + 1, losses))
    return nlp

# Train (and save) the NER model
# Fit the NER pipeline for six iterations, then persist it under models/
ner = train_spacy(TRAIN_DATA, LABELS, iterations = 6)
ner.to_disk("models/spacy_example")

Iteration 1 Loss: {'ner': 19233.177551970664}
Iteration 2 Loss: {'ner': 12757.62439867914}
Iteration 3 Loss: {'ner': 10925.221065744825}
Iteration 4 Loss: {'ner': 9658.275894171195}
Iteration 5 Loss: {'ner': 9083.952081262943}
Iteration 6 Loss: {'ner': 8434.996402300636}
Completed in 248 seconds


  srsly.json_dumps(self.meta)
  writer(path / key)


In [None]:
from spacy import displacy

def load_model(model_path):
    ''' Restore a saved pipeline so it can label new sentences

    Builds a blank English pipeline with an NER component, then fills it
    from the files stored under model_path.

    model_path : directory of model saved by spacy.to_disk

    Returns the result of from_disk on that pipeline.
    '''
    pipeline = spacy.blank('en')
    if 'ner' not in pipeline.pipe_names:
        pipeline.add_pipe(pipeline.create_pipe('ner'))
    return pipeline.from_disk(model_path)

# Reload the saved pipeline and visualise its predictions on the first 15 test sentences
ner = load_model("models/spacy_example")

TEST_DATA, _ = load_data_spacy("data/test.txt")

# keep only the sentence text from each [sentence, annotations] pair
test_sentences = [sentence for sentence, _ in TEST_DATA[:15]]
for sentence in test_sentences:
    doc = ner(sentence)
    displacy.render(doc, style = "ent", jupyter = True)