In [1]:
%autosave 600
import numpy as np
import pandas as pd
import random
import cupy
import spacy
import chainer
from spacy.util import minibatch, compounding
from sklearn.metrics import roc_auc_score

import thinc.neural.gpu_ops

Autosaving every 600 seconds


In [2]:
# load data
train = pd.read_csv('../input/train.csv')
trainX = train['comment_text'].values.tolist()

# set variables
n_texts = 20 #train.shape[0]
tv_split = 0.85
n_iter = 4

class_names = list(train)[-6:]
tox_type = 'toxic'


# make train and val
def load_and_split(tox_type):
    train[tox_type] = pd.to_numeric(train[tox_type])
    target = train[tox_type].values.tolist() 
    train_data = list(zip(trainX, target))
    train_data = train_data[-n_texts:]

    texts, labels = zip(*train_data)
    cats = [{tox_type: bool(y)} for y in labels]
    split = int(len(train_data) * tv_split)

    train_texts = texts[:split]
    train_cats = cats[:split]
    dev_texts = texts[split:]
    dev_cats = cats[split:]

    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))
    
    print("Using {} examples ({} training, {} evaluation)"
       .format(len(texts), len(train_texts), len(dev_texts)))
 
    return train_data, (dev_texts, dev_cats)


In [3]:




def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}
  

In [4]:
def main():
    train_data, (dev_texts, dev_cats) = load_and_split(tox_type)

    nlp = spacy.blank('en')
    clf = nlp.create_pipe('textcat')
    nlp.add_pipe(clf, last=True)
    clf.add_label(tox_type)
    
 

    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

    for i in range(n_iter):
        losses = {}

        batches = minibatch(train_data, size=compounding(4., 16., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)

        with clf.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, clf, dev_texts, dev_cats)

        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                      .format(losses['textcat'], scores['textcat_p'],
                              scores['textcat_r'], scores['textcat_f']))

In [5]:
main()

Using 20 examples (17 training, 3 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
1.193	0.500	0.500	0.500
0.241	0.500	0.500	0.500
0.249	0.500	0.500	0.500
0.246	0.500	0.500	0.500


In [6]:
# test the trained model
test_text = " i must kill you !"
doc = nlp(test_text)
print(test_text, doc.cats)


NameError: name 'nlp' is not defined

In [None]:
# save model
nlp.to_disk("../subs")