In [1]:
# %autosave 600
import numpy as np
import pandas as pd
import random
import spacy
from spacy.util import minibatch, compounding

import thinc.extra.datasets

In [8]:
# set variables
n_texts = 20
tv_split = 0.8
n_iter = 10

# load the model
nlp = spacy.blank('en') 

# add the text classifier to the pipeline
textcat = nlp.create_pipe('textcat')
nlp.add_pipe(textcat, last=True)

# add label to text classifier
textcat.add_label('POSITIVE')

1

In [9]:
# load and preprocess data


def load_data(limit=n_texts, split=tv_split):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])


print("Loading IMDB data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()
print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))





Loading IMDB data...
Using 20 examples (16 training, 4 evaluation)


In [10]:
# train the textcat model

def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}



optimizer = nlp.begin_training()

print("Training the model...")
print('{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('iter#', 'LOSS', 'P', 'R', 'F'))

for i in range(n_iter):
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(train_data, size=compounding(4., 32., 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
    with textcat.model.use_params(optimizer.averages):
        # evaluate on the dev data split off in load_data()
        scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
    print('{0:.0f}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}'  # print a simple table
                  .format(i+1, losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))


Training the model...
iter#	LOSS 	  P  	  R  	  F  
1	1.023	0.000	0.000	0.000
2	0.794	0.500	0.000	0.000
3	0.736	0.500	0.000	0.000
4	0.683	0.000	0.000	0.000
5	0.599	0.000	0.000	0.000
6	0.532	0.333	0.500	0.400
7	0.372	0.333	0.500	0.400
8	0.340	0.000	0.000	0.000
9	0.331	0.000	0.000	0.000
10	0.352	0.333	0.500	0.400


In [11]:
# test the trained model
test_text = "This movie was super great. loved it"
doc = nlp(test_text)
print(test_text, doc.cats)


This movie was super great. loved it {'POSITIVE': 0.44178512692451477}


In [25]:
# train_data
annotations

({'cats': {'POSITIVE': True}},
 {'cats': {'POSITIVE': True}},
 {'cats': {'POSITIVE': False}},
 {'cats': {'POSITIVE': False}})