In [1]:
import numpy as np
import pandas as pd
import spacy as sp

In [2]:
# get data
train = pd.read_csv('../input/train.csv')
trainX = train['comment_text'].str.replace('[^\w\s]','')
trainX = trainX.str.lower().values.tolist()

test = pd.read_csv('../input/test.csv')
testX = test['comment_text'].str.replace('[^\w\s]','')
testX = testX.str.lower().values.tolist()

class_names = list(train)[-6:]



In [3]:
# create format for model
all_cats = []
for i in range(len(trainX)):
    catlist = {}
    for tox_type in class_names:
        target = train.loc[i, tox_type] 
        cat = {tox_type: bool(target)}
        catlist.update(cat)
    cats = {'cats': catlist}
    all_cats.append(cats)
train_data = list(zip(trainX, all_cats))

In [4]:
# start model
nlp = sp.blank('en')
clf = nlp.create_pipe('textcat')
nlp.add_pipe(clf, last=True)

for tox_type in class_names:
    clf.add_label(tox_type)
    
optimizer = nlp.begin_training()
print("Training the model...")

        
n_iter = 6    
for iter in range(n_iter):
    losses = {}
    batches = sp.util.minibatch(train_data, size=sp.util.compounding(4., 16., 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
    print("Iteration {} of {} complete.".format(iter+1, n_iter))


    

Training the model...
Iteration 1 of 6 complete.
Iteration 2 of 6 complete.
Iteration 3 of 6 complete.
Iteration 4 of 6 complete.
Iteration 5 of 6 complete.
Iteration 6 of 6 complete.


In [15]:
test_preds = np.zeros((len(testX), len(class_names)))
batches = sp.util.minibatch(testX)
for batch in batches
    for i, txt in enumerate(batch):
        doc = nlp(txt)
        temp = np.fromiter(doc.cats.values(), dtype=float)
        test_preds[i] = (temp)

IndexError: Out of bounds on buffer access (axis 0)

In [16]:
test_preds = []
for i, txt in enumerate(testX):
    doc = nlp(txt)
    temp = np.fromiter(doc.cats.values(), dtype=float)
    test_preds.append(temp)

IndexError: Out of bounds on buffer access (axis 0)

In [None]:
pred_df = pd.DataFrame(test_preds, columns = class_names)

In [None]:
submission = pd.read_csv("../input/sample_submission.csv")
submission[class_names] = pred_df
submission.head()
submission.to_csv("../subs/sub_spacy2.csv", index=False)

In [10]:
len(testX)

153164

In [18]:
# test the trained model
test_text = " i must kill you jew boy!"
doc = nlp(test_text)
print(test_text, doc.cats)

 i must kill you jew boy! {'toxic': 0.8189160823822021, 'severe_toxic': 0.10976621508598328, 'obscene': 0.12333100289106369, 'threat': 0.018847690895199776, 'insult': 0.3530437648296356, 'identity_hate': 0.641996443271637}


In [21]:
test_preds = []
for i, txt in enumerate(testX[0]):
    doc = nlp(txt)
    print(doc.cats.values())
#     test_preds.append(temp)

dict_values([0.11480710655450821, 0.00435399217531085, 0.015478072687983513, 4.539787187241018e-05, 0.028476133942604065, 0.0034778141416609287])
dict_values([0.11786646395921707, 0.012342091649770737, 0.0225022304803133, 4.539787187241018e-05, 0.03252966329455376, 0.005241862032562494])
dict_values([0.03890296816825867, 0.0016323646996170282, 0.0030763011891394854, 4.539787187241018e-05, 0.00674314284697175, 0.0012302856193855405])
dict_values([0.0864296481013298, 0.007888400927186012, 0.014353757724165916, 4.539787187241018e-05, 0.029867585748434067, 0.004056697245687246])
dict_values([0.042233679443597794, 0.0018844197038561106, 0.004467296414077282, 4.539787187241018e-05, 0.008082793094217777, 0.0013656046940013766])
dict_values([0.022281479090452194, 0.0011471884790807962, 0.002193327760323882, 4.539787187241018e-05, 0.0038893523160368204, 0.0008969513000920415])
dict_values([0.014805383048951626, 0.0016693565994501114, 0.002591328229755163, 4.539787187241018e-05, 0.00471163168549

dict_values([0.04479580372571945, 0.0031246019061654806, 0.004347722977399826, 4.539787187241018e-05, 0.008542060852050781, 0.001978380372747779])
dict_values([0.03890296816825867, 0.0016323646996170282, 0.0030763011891394854, 4.539787187241018e-05, 0.00674314284697175, 0.0012302856193855405])
dict_values([0.03304434195160866, 0.001341545837931335, 0.0034276065416634083, 4.539787187241018e-05, 0.005719159264117479, 0.0015673767775297165])
dict_values([0.038132309913635254, 0.0017904225969687104, 0.0026526255533099174, 4.539787187241018e-05, 0.005544756073504686, 0.0006853293161839247])
dict_values([0.2580791413784027, 0.0173860564827919, 0.05083079636096954, 0.0002546400355640799, 0.08451355993747711, 0.016442900523543358])
dict_values([0.044412944465875626, 0.0017401924123987556, 0.0027595898136496544, 4.539787187241018e-05, 0.00363996927626431, 0.0005747741088271141])
dict_values([0.03890296816825867, 0.0016323646996170282, 0.0030763011891394854, 4.539787187241018e-05, 0.006743142846

dict_values([0.03890296816825867, 0.0016323646996170282, 0.0030763011891394854, 4.539787187241018e-05, 0.00674314284697175, 0.0012302856193855405])
dict_values([0.03304434195160866, 0.001341545837931335, 0.0034276065416634083, 4.539787187241018e-05, 0.005719159264117479, 0.0015673767775297165])
dict_values([0.04479580372571945, 0.0031246019061654806, 0.004347722977399826, 4.539787187241018e-05, 0.008542060852050781, 0.001978380372747779])
dict_values([0.042233679443597794, 0.0018844197038561106, 0.004467296414077282, 4.539787187241018e-05, 0.008082793094217777, 0.0013656046940013766])
dict_values([0.022281479090452194, 0.0011471884790807962, 0.002193327760323882, 4.539787187241018e-05, 0.0038893523160368204, 0.0008969513000920415])
dict_values([0.03890296816825867, 0.0016323646996170282, 0.0030763011891394854, 4.539787187241018e-05, 0.00674314284697175, 0.0012302856193855405])
dict_values([0.11786646395921707, 0.012342091649770737, 0.0225022304803133, 4.539787187241018e-05, 0.032529663

dict_values([0.02606717310845852, 0.0011605340987443924, 0.002078633988276124, 4.539787187241018e-05, 0.004166835453361273, 0.000896348909009248])
dict_values([0.04233590140938759, 0.0019640959799289703, 0.002476820722222328, 4.539787187241018e-05, 0.00809075590223074, 0.0017023885156959295])
dict_values([0.022281479090452194, 0.0011471884790807962, 0.002193327760323882, 4.539787187241018e-05, 0.0038893523160368204, 0.0008969513000920415])
dict_values([0.03890296816825867, 0.0016323646996170282, 0.0030763011891394854, 4.539787187241018e-05, 0.00674314284697175, 0.0012302856193855405])
dict_values([0.022281479090452194, 0.0011471884790807962, 0.002193327760323882, 4.539787187241018e-05, 0.0038893523160368204, 0.0008969513000920415])
dict_values([0.042233679443597794, 0.0018844197038561106, 0.004467296414077282, 4.539787187241018e-05, 0.008082793094217777, 0.0013656046940013766])
dict_values([0.02235550805926323, 0.0007713307277299464, 0.0021013582590967417, 4.539787187241018e-05, 0.0033

In [23]:
doc

e

In [25]:
testX[0]

'yo bitch ja rule is more succesful then youll ever be whats up with you and hating you sad mofuckasi should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me ja rule is about pride in da music man dont diss that shit on him and nothin is wrong bein like tupac he was a brother toofuckin white boys get things right next time'