In [8]:
import numpy as np
import pandas as pd
import spacy as sp
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit

In [9]:
# Read the training set; the raw comment strings are kept as a plain list.
train = pd.read_csv('../input/train.csv')
trainX = train['comment_text'].tolist()

# Tunable settings.
n_texts = len(train)      # number of rows available
train_frac = 0.85         # share of rows used for training
n_iter = 4                # training passes per class

# Row index where the training slice ends and the evaluation slice begins.
split = int(n_texts * train_frac)
pred_len = n_texts - split

# The label columns are taken to be the last six columns of the frame.
class_names = list(train.columns)[-6:]

# make train and val
def load_and_split(tox_type):
    """Build the training pairs and held-out data for one label column.

    Reads the module-level ``train``, ``trainX``, ``n_texts`` and ``split``.
    As a side effect the ``tox_type`` column of ``train`` is overwritten with
    its ``pd.to_numeric`` conversion.

    Parameters
    ----------
    tox_type : str
        Name of the label column in ``train``.

    Returns
    -------
    train_data : list of (text, {'cats': {tox_type: bool}})
        Items before index ``split``.
    dev_texts : tuple
        Texts from index ``split`` onwards.
    dev_cats : list of {tox_type: bool}
        Label dicts matching ``dev_texts``.
    """
    train[tox_type] = pd.to_numeric(train[tox_type])
    raw_labels = train[tox_type].values.tolist()

    # Pair every comment with its label, keeping the trailing n_texts pairs.
    pairs = list(zip(trainX, raw_labels))[-n_texts:]
    texts, labels = zip(*pairs)
    cats = [{tox_type: bool(label)} for label in labels]

    # Positional cut: everything before `split` trains, the rest evaluates.
    train_texts, dev_texts = texts[:split], texts[split:]
    train_cats, dev_cats = cats[:split], cats[split:]

    train_data = [(text, {'cats': cat})
                  for text, cat in zip(train_texts, train_cats)]

    summary = "Using {} examples ({} training, {} evaluation)"
    print(summary.format(len(texts), len(train_texts), len(dev_texts)))

    return train_data, dev_texts, dev_cats


In [10]:
def evaluate(tox_type, dev_texts, dev_cats, nlp):
    """Score ``nlp`` on the held-out texts for a single label.

    Parameters
    ----------
    tox_type : str
        Label whose score is read from each document's ``cats`` mapping and
        whose truth value is read from each entry of ``dev_cats``.
    dev_texts : sequence of str
        Texts to run through ``nlp``.
    dev_cats : sequence of dict
        One mapping per text; ``d[tox_type]`` is the true label.
    nlp : callable
        Called once per text; the result must expose a ``cats`` mapping that
        contains ``tox_type``.

    Returns
    -------
    score : float
        ``roc_auc_score`` of the predictions against the truths.
    dev_preds : numpy.ndarray
        One predicted score per text, in input order.

    Raises
    ------
    KeyError
        If a document's ``cats`` has no entry for ``tox_type``.
    """
    dev_preds = np.zeros(len(dev_texts))
    for i, text in enumerate(dev_texts):
        # Look the score up by label rather than packing every value of
        # doc.cats into an array: assigning an array to a scalar slot only
        # works when there is exactly one value, and relies on an implicit
        # array-to-scalar conversion that newer NumPy releases deprecate.
        dev_preds[i] = nlp(text).cats[tox_type]

    # Booleans -> 0/1 integers.
    truths = np.array([d[tox_type] for d in dev_cats], dtype=int)
    score = roc_auc_score(truths, dev_preds)
    return score, dev_preds

In [11]:
# One row of held-out predictions per label column.
pred_array = np.zeros((len(class_names), pred_len))

for i, tox_type in enumerate(class_names):
    train_data, dev_texts, dev_cats = load_and_split(tox_type)

    # spaCy is imported under the alias `sp`; the bare name `spacy` is not
    # defined in this notebook, so it must be referenced through the alias.
    nlp = sp.blank('en')
    clf = nlp.create_pipe('textcat')
    nlp.add_pipe(clf, last=True)
    clf.add_label(tox_type)

    optimizer = nlp.begin_training()
    print(tox_type)
    print("Training the model...")
    print('AUC')

    # Stays None when n_iter is 0 so a stale `preds` from a previous class
    # (or an undefined name) is never written into pred_array.
    preds = None
    # `epoch` rather than `iter`: a loop variable called `iter` would replace
    # the builtin in the notebook namespace for every later cell.
    for epoch in range(n_iter):
        losses = {}

        # Batch size grows from 4 towards 16 by a factor of 1.001 per batch.
        batches = sp.util.minibatch(train_data,
                                    size=sp.util.compounding(4., 16., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

        # Evaluate with the optimizer's averaged parameters swapped in.
        with clf.model.use_params(optimizer.averages):
            score, preds = evaluate(tox_type, dev_texts, dev_cats, nlp)

        print(score)

    # Keep the predictions from the final pass for this class.
    if preds is not None:
        pred_array[i] = preds



Using 159571 examples (135635 training, 23936 evaluation)
toxic
Training the model...
AUC


KeyboardInterrupt: 

In [6]:
# Number of rows that fall after the `split` cut-off.
pred_len = len(train) - split
pred_len

23936

In [None]:
# Smoke-test the most recently trained pipeline on a few test comments.
# Relies on `nlp` and `tox_type` as left behind by the training loop.
test = pd.read_csv('../input/test.csv')
# Full list of test comments, parallel to `trainX` (previously this picked
# out the single comment at index 10 and was never used).
testX = test['comment_text'].values.tolist()

n_preview = 10
preview_texts = testX[:n_preview]

# Sized to the texts actually scored, so no unfilled zeros are printed.
test_cats = np.zeros(len(preview_texts))
for i, t in enumerate(preview_texts):
    doc = nlp(t)
    # Read the score by label instead of assigning a one-element array
    # to a scalar slot.
    test_cats[i] = doc.cats[tox_type]
print(test_cats)

In [None]:
# Rebuild the (text, label) pairs for the current `tox_type` at top level so
# that `texts`, `labels` and `cats` are available to the following cells.
train[tox_type] = pd.to_numeric(train[tox_type])
target = train[tox_type].values.tolist()
train_data = list(zip(trainX, target))[-n_texts:]

texts, labels = zip(*train_data)
cats = [{tox_type: bool(label)} for label in labels]


In [None]:

# Draw a single stratified train/validation split over the current labels.
# Uses `texts`, `labels` and `cats` built by the preceding cell.
sss = StratifiedShuffleSplit(n_splits=1, train_size=train_frac, random_state=42)
# Only the labels matter for stratification, so a dummy feature array of the
# right length is enough.
train_index, val_index = next(sss.split(np.zeros(len(labels)), labels))
print("TRAIN:", train_index, "TEST:", val_index)

# `texts` is a tuple and `cats` a list, so neither supports index-array
# lookup; pick the selected rows one by one.
train_texts = [texts[j] for j in train_index]
train_cats = [cats[j] for j in train_index]
dev_texts = [texts[j] for j in val_index]
dev_cats = [cats[j] for j in val_index]

train_data = list(zip(train_texts,
                      [{'cats': c} for c in train_cats]))

print("Using {} examples ({} training, {} evaluation)"
   .format(len(texts), len(train_texts), len(dev_texts)))

In [None]:
# Zero-filled buffer: one row per label column, one column per held-out text.
pred_array = np.zeros(shape=(len(class_names), len(dev_texts)))

In [None]:
# Show the dimensions of the prediction buffer.
np.shape(pred_array)