In [1]:
%autosave 600
import numpy as np
import pandas as pd
import random
import cupy
import spacy
import chainer
from spacy.util import minibatch, compounding
from sklearn.metrics import roc_auc_score

import thinc.neural.gpu_ops

Autosaving every 600 seconds


In [2]:
# load data
train = pd.read_csv('../input/train.csv')
trainX = train['comment_text'].values.tolist()

# set variables
n_texts = 50
tv_split = 0.8
n_iter = 2

class_names = list(train)[-6:]
tox_type = 'threat'


# make train and val
def load_and_split(tox_type):
    train[tox_type] = pd.to_numeric(train[tox_type])
    target = train[tox_type].values.tolist() 
    train_data = list(zip(trainX, target))
    train_data = train_data[-n_texts:]

    texts, labels = zip(*train_data)
    cats = [{tox_type: bool(y)} for y in labels]
    split = int(len(train_data) * tv_split)

    train_texts = texts[:split]
    train_cats = cats[:split]
    dev_texts = texts[split:]
    dev_cats = cats[split:]

    print("Using {} examples ({} training, {} evaluation)"
       .format(len(texts), len(train_texts), len(dev_texts)))
 
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])


In [11]:

# def create_model(tox_type):

def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}



In [12]:
def main():
    (train_texts, train_cats), (dev_texts, dev_cats) = load_and_split(tox_type)
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    nlp = spacy.blank('en')
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
    textcat.add_label(tox_type)





    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']

    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

        for i in range(n_iter):
            losses = {}

            batches = minibatch(train_data, size=compounding(4., 16., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)

            with textcat.model.use_params(optimizer.averages):
                  scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
#                 scores = evaluate(2)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                          .format(losses['textcat'], scores['textcat_p'],
                                  scores['textcat_r'], scores['textcat_f']))

In [13]:
main()

Using 50 examples (40 training, 10 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
0.449	0.500	0.500	0.500
0.009	0.500	0.500	0.500


In [None]:
# test the trained model
test_text = " i must kill you !"
doc = nlp(test_text)
print(test_text, doc.cats)


In [None]:
# save model
nlp.to_disk("../subs")