In [1]:
%autosave 600
import numpy as np
import pandas as pd
import random
import cupy
import spacy
import chainer
from spacy.util import minibatch, compounding

import thinc.neural.gpu_ops

Autosaving every 600 seconds


In [18]:
# load data
train = pd.read_csv('input/train.csv')
trainX = train['comment_text'].values.tolist()

# set variables
n_texts = 100
tv_split = 0.8
n_iter = 8
class_names = list(train)[-6:]
tox_type = 'threat'



In [21]:

def create_model(tox_type):
  nlp = spacy.blank('en')
  clf = nlp.create_pipe('textcat')
  nlp.add_pipe(clf, last=True)
  clf.add_label(tox_type)
  optimizer = nlp.begin_training()
  return nlp, clf, optimizer


def preprocess(tox_type):
  train[tox_type] = pd.to_numeric(train[tox_type])
  target = train[tox_type].values.tolist()

  train_data = list(zip(trainX, target))
  random.shuffle(train_data)
  train_data = train_data[-n_texts:]

  texts, labels = zip(*train_data)
  cats = [{tox_type: bool(y)} for y in labels]
  split = int(len(train_data) * tv_split)

  train_texts = texts[:split]
  train_cats = cats[:split]
  dev_texts = texts[split:]
  dev_cats = cats[split:]

  print("Using {} examples ({} training, {} evaluation)"
      .format(len(texts), len(train_texts), len(dev_texts)))
  train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))
  return train_data, dev_texts, dev_cats


def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

In [30]:
for cl in class_names:
  train_data, dev_texts, dev_cats = preprocess(cl)
  nlp, clf, optimizer = create_model(cl)

  print("Training the model...")
  print('{:^5}\t{:^5}\t{:^5}'.format('iter#', 'LOSS', 'ROC'))

  for i in range(n_iter):
    losses = {}
    
    batches = minibatch(train_data, size=compounding(4., 16., 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
    
    with clf.model.use_params(optimizer.averages):
        scores = evaluate(nlp.tokenizer, clf, dev_texts, dev_cats)
    
    print('{0:.0f}\t{1:.3f}\t{2:.3f}'.format(i+1, losses['textcat'], scores['textcat_p']))


Using 100 examples (80 training, 20 evaluation)
Training the model...
iter#	LOSS 	 ROC 
1	2.868	0.500
2	2.116	0.500
3	1.898	0.500
4	1.521	0.500
5	1.277	0.500
6	0.738	0.500
7	0.469	0.500
8	0.248	0.500
Using 100 examples (80 training, 20 evaluation)
Training the model...
iter#	LOSS 	 ROC 
1	1.228	0.500
2	0.026	0.500
3	0.005	0.500
4	0.003	0.500
5	0.002	0.500
6	0.002	0.500
7	0.003	0.500
8	0.001	0.500
Using 100 examples (80 training, 20 evaluation)
Training the model...
iter#	LOSS 	 ROC 
1	1.256	0.500
2	0.751	0.500
3	0.718	0.500
4	0.660	0.500
5	0.688	1.000
6	0.500	1.000
7	0.502	1.000
8	0.536	1.000
Using 100 examples (80 training, 20 evaluation)
Training the model...
iter#	LOSS 	 ROC 
1	3.050	0.500
2	0.585	0.500
3	0.171	0.500
4	0.212	0.500
5	0.206	0.500
6	0.120	0.500
7	0.052	0.500
8	0.060	0.500
Using 100 examples (80 training, 20 evaluation)
Training the model...
iter#	LOSS 	 ROC 


KeyboardInterrupt: 

In [None]:
# test the trained model
test_text = " i must kill you !"
doc = nlp(test_text)
print(test_text, doc.cats)


In [None]:
losses['textcat']

In [None]:
len(train_data)

In [12]:
doc.cat.items

NameError: name 'doc' is not defined

In [14]:
a = clf.pipe(docs)

NameError: name 'docs' is not defined