In [74]:
import time
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from gensim.models import word2vec
from sklearn.neural_network import MLPClassifier

# 1.Data pre-processing

In [75]:
train=pd.read_csv('../toxiccomment/train.csv')
test=pd.read_csv('../toxiccomment/test.csv')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [76]:
train.shape

(159571, 8)

In [77]:
train['toxic'].value_counts()

0    144277
1     15294
Name: toxic, dtype: int64

In [78]:
# Character length of every comment; stored on the frame as the 'len' column
comment_lengths = train['comment_text'].str.len()
train['len'] = comment_lengths
# Summary statistics of the length distribution (printed as truncated integers)
print('Average comment length: %d' % comment_lengths.mean())
print('Median comment length: %d' % comment_lengths.median())
print('90th percentile comment length: %d' % comment_lengths.quantile(0.9))

Average comment length: 394
Median comment length: 205
90th percentile comment length: 889


In [79]:
# Count the comments flagged with each label. One loop replaces six
# copy-pasted statements; the printed lines are unchanged.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for label in label_columns:
    # (column > 0) is a boolean Series; summing it counts the flagged rows
    print('%s: %d' % (label, (train[label] > 0).sum()))

toxic: 15294
severe_toxic: 1595
obscene: 8449
threat: 478
insult: 7877
identity_hate: 1405


In [80]:
print('severe toxic and toxic: %d' %train[(train['severe_toxic']==1) & (train['toxic']==1)]['id'].count())

severe toxic and toxic: 1595


This shows that every severe toxic comment is toxic

In [81]:
print('obscene and toxic: %d' %train[(train['obscene']==1) & (train['toxic']==1)]['id'].count())

obscene and toxic: 7926


This shows that not all obscene comments are toxic (7926 of the 8449 obscene comments are also labelled toxic)

In [82]:
print('threat and toxic: %d' %train[(train['threat']==1) & (train['toxic']==1)]['id'].count())

threat and toxic: 449


This shows that not all threat comments are toxic (449 of the 478 threat comments are also labelled toxic)

In [83]:
print('insult and toxic: %d' %train[(train['insult']==1) & (train['toxic']==1)]['id'].count())

insult and toxic: 7344


This shows that not all insult comments are toxic (7344 of the 7877 insult comments are also labelled toxic)

In [84]:
print('identity hate and toxic: %d' %train[(train['identity_hate']==1) & (train['toxic']==1)]['id'].count())

identity hate and toxic: 1302


This shows that not all identity hate comments are toxic (1302 of the 1405 identity hate comments are also labelled toxic)

In [85]:
print(train[train['comment_text'].isnull()])

Empty DataFrame
Columns: [id, comment_text, toxic, severe_toxic, obscene, threat, insult, identity_hate, len]
Index: []


In [86]:
print(test[test['comment_text'].isnull()])

Empty DataFrame
Columns: [id, comment_text]
Index: []


In [87]:
print(train['comment_text'].isnull().values.any())
print(test['comment_text'].isnull().values.any())

False
False


In [88]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,len
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,264
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,112
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,233
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,622
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,67


This shows that there are no missing values of comment text in the training dataset and the testing dataset

In [89]:
# Replace any missing comment with the placeholder 'none'.
# The result is assigned back to the column rather than using
# fillna(inplace=True) on the column selection, which can operate on a
# temporary object and leave the DataFrame unchanged.
test['comment_text'] = test['comment_text'].fillna(value='none')
train['comment_text'] = train['comment_text'].fillna(value='none')

In [90]:
import re
import string
from nltk import word_tokenize
# NOTE: this rebinds the name `stopwords` (imported at the top of the notebook
# from nltk.corpus) to a plain set of English stop words, so after this line
# `stopwords` is a set and no longer has a `.words()` method.
stopwords= set(nltk.corpus.stopwords.words('english'))


def div2word(raw_text, remove_stopwords=False):
    """Split a raw comment into a list of lowercase alphanumeric tokens.

    Every character that is not an ASCII letter or digit is replaced by a
    space, the text is lower-cased, and the result is split on whitespace.

    Parameters
    ----------
    raw_text : str
        The comment text to tokenize.
    remove_stopwords : bool, default False
        When True, tokens found in NLTK's English stop-word list are dropped.

    Returns
    -------
    list of str
        The tokens in their original order (empty list for empty input).
    """
    # Keep letters and digits; everything else becomes a separator
    alnum_only = re.sub("[^0-9a-zA-Z]", " ", raw_text)
    # Convert to lower case, split into individual words
    words = alnum_only.lower().split()
    if remove_stopwords:
        # Import under a private alias: the notebook rebinds the module-level
        # name `stopwords` to a set, so calling `stopwords.words(...)` here
        # would raise AttributeError.
        from nltk.corpus import stopwords as _stopword_corpus
        # A set makes the membership test below O(1) per token
        stops = set(_stopword_corpus.words("english"))
        words = [w for w in words if w not in stops]
    return words

### The comments are tokenized into lowercase words below; stopword removal is optional (`remove_stopwords=True`) and is not applied by default

In [91]:
#example: div2word is called with its default remove_stopwords=False,
#so stop words such as 'the' and 'is' are kept in the printed tokens
sen='This is a big sentence containing BIg words and the is the stop word we expect it to be gone'
print(div2word(sen))

['this', 'is', 'a', 'big', 'sentence', 'containing', 'big', 'words', 'and', 'the', 'is', 'the', 'stop', 'word', 'we', 'expect', 'it', 'to', 'be', 'gone']


In [92]:
pp_comment_text=train['comment_text'].apply(div2word)

# 2.Word2Vec model parameters

In [93]:
wv_size=300 #var for the size or the dimension of the word vector
min_wrd_cnt=40 #words that occur fewer than 40 times are ignored and not considered in the word vector
num_workers=4 #it determines the degree of parallelism;determines how many threads will execute in parallel to train the model
window_size=10 #it determines the size of the window; it is the max distance between the current word and the word being predicted
dsample=1e-3 #downsample setting for frequently occurring words;it improves the execution time


# Train the Word2Vec model on the tokenized training comments using the settings above
model=word2vec.Word2Vec(pp_comment_text,workers=num_workers,size=wv_size,min_count=min_wrd_cnt,window=window_size,sample=dsample)
# NOTE(review): per the gensim 3.x docs, init_sims(replace=True) normalises the
# vectors in place, after which the model cannot be trained further - confirm
model.init_sims(replace=True) 

### Saving word2vec model for use in live input in flask api

In [94]:
# joblib is otherwise only imported in a later cell of this notebook, so on a
# fresh kernel (Restart & Run All) this cell would raise NameError without the
# import below.
# NOTE(review): sklearn.externals.joblib is deprecated in newer scikit-learn
# releases; switch to the standalone `joblib` package when upgrading.
from sklearn.externals import joblib

# Persist the trained Word2Vec model, then load it back to check the round trip
joblib.dump(model,'w2v_fm')
w2v=joblib.load('w2v_fm')

Now we extract the average feature vector for each of the comments by using two functions as below

In [95]:
def makeFeatureVec(words, model, wv_size):
    """Average the word vectors of the in-vocabulary tokens of one comment.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single comment.
    model : object
        Either a trained Word2Vec model (its vectors are read from
        ``model.wv``) or any mapping-like object that supports
        ``word in obj`` and ``obj[word]``.
    wv_size : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``wv_size``: the mean of the vectors of the
        tokens found in the vocabulary, or all zeros when none is found.
    """
    # Look vectors up through `.wv` instead of rebuilding a set of the whole
    # vocabulary on every call and indexing the model directly.
    vectors = getattr(model, "wv", model)

    featureVec = np.zeros((wv_size,), dtype="float32")  # accumulator
    nwords = 0
    for word in words:
        if word in vectors:
            featureVec += vectors[word]
            nwords += 1
    # Divide by the number of matched words to get the average; with no
    # matches the zero vector is returned as-is (avoids dividing by zero).
    if nwords:
        featureVec /= nwords
    return featureVec

def getAvgFeatureVecs(comments, model, wv_size):
    """Build a 2D array holding one averaged word vector per comment.

    Row ``k`` of the returned float32 array of shape
    ``(len(comments), wv_size)`` is ``makeFeatureVec`` applied to the
    ``k``-th comment, in iteration order.
    """
    # Allocate the full result up front, then fill it row by row
    n_comments = len(comments)
    reviewFeatureVecs = np.zeros((n_comments, wv_size), dtype="float32")
    for row, tokens in enumerate(comments):
        reviewFeatureVecs[row] = makeFeatureVec(tokens, model, wv_size)
    return reviewFeatureVecs

In [96]:
# Feature matrix: one averaged word vector per training comment
f_matrix_train = getAvgFeatureVecs(pp_comment_text, model, wv_size)
# Six target columns, one per classifier that will be trained
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y = [train[column] for column in target_columns]

  # Remove the CWD from sys.path while we load stuff.


# 3.MLP Model for each toxicity level

Now we create 6 multi-layer perceptron (MLP) models, one for each toxicity label

In [97]:

# Six identically configured MLP classifiers, one per toxicity label.
# since the training set has thousands of data entries, we use stochastic gradient-based optimizer;thus solver=adam
MODEL = [
    MLPClassifier(solver='adam', hidden_layer_sizes=(30,30,30), random_state=1)
    for _ in range(6)
]
print(MODEL)

[MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False), MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False), MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
   

### Training the model for the comments by partial fit method

In [98]:
batch_size = 8000
total_rows = f_matrix_train.shape[0]
classes = [0,1]
pos = 0
duration = 0
start_train = time.time()
# One pass over the training matrix in chunks; every chunk is fed to each of
# the six classifiers via partial_fit. The loop also stops once the elapsed
# time (checked between chunks) reaches 2500 seconds.
while pos < total_rows and duration < 2500:
    # the last chunk may be shorter than the nominal batch size
    batch_size = min(batch_size, total_rows - pos)
    stop = pos + batch_size
    X_p = f_matrix_train[pos:stop]
    for i in range(6):
        y_p = y[i][pos:stop]
        MODEL[i].partial_fit(X_p, y_p, classes)
    pos = stop
    duration = time.time() - start_train # cumulative training time so far
    print("Pos %d/%d duration %d" % (pos, total_rows, duration))

Pos 8000/159571 duration 0
Pos 16000/159571 duration 0
Pos 24000/159571 duration 1
Pos 32000/159571 duration 1
Pos 40000/159571 duration 2
Pos 48000/159571 duration 2
Pos 56000/159571 duration 2
Pos 64000/159571 duration 3
Pos 72000/159571 duration 3
Pos 80000/159571 duration 4
Pos 88000/159571 duration 4
Pos 96000/159571 duration 5
Pos 104000/159571 duration 5
Pos 112000/159571 duration 5
Pos 120000/159571 duration 6
Pos 128000/159571 duration 6
Pos 136000/159571 duration 7
Pos 144000/159571 duration 7
Pos 152000/159571 duration 7
Pos 159571/159571 duration 8


# 4. Using the trained model to predict toxicity percentage for sample inputs

In [99]:
from sklearn.externals import joblib

### Saving the trained model using joblib

In [100]:
joblib.dump(MODEL,'final_trained_model')

['final_trained_model']

In [101]:
TM=joblib.load('final_trained_model')

### Taking 2 input sentences and predicting their toxicity using the trained model

In [119]:
a=input()

This is a toxic comment. This is bullshit. I hate this and I will kill you. I will murder you.


In [120]:
a2=div2word(a)

In [121]:
a3=makeFeatureVec(a2,model,wv_size)

  # Remove the CWD from sys.path while we load stuff.


In [122]:
b=input()

This is a non toxic comment. This website is so helpful I cannot believe how much easier it is to operate and it is so helpful. Thank you so much for this article it has helped a lot.


In [123]:
b2=div2word(b)

In [124]:
b3=makeFeatureVec(b2,model,wv_size)

  # Remove the CWD from sys.path while we load stuff.


In [125]:
# Stack the two sample feature vectors so they can be scored together
ANS=[a3, b3]

In [126]:
# For each of the six loaded classifiers, take column 1 of predict_proba
# for the two sample comments and scale it to a percentage.
txc, svr_txc, obc, thrt, inslt, idnt_ht = [
    (TM[k].predict_proba(ANS)[:,1])*100.000 for k in range(6)
]

### Printing the prediction probability of toxicity for each of the sentences

In [127]:
print('Toxicity percentage')
print('Comment1: ',float(txc[0]))
print('Comment2: ',float(txc[1]))

Toxicity percentage
Comment1:  90.77259488795862
Comment2:  0.4604134367024115


In [128]:
print('Severe Toxicity percentage')
print('Comment1: ',float(svr_txc[0]))
print('Comment2: ',float(svr_txc[1]))

Severe Toxicity percentage
Comment1:  1.7069584387244172
Comment2:  0.012938998495716926


In [129]:
print('Obscenity percentage')
print('Comment1: ',float(obc[0]))
print('Comment2: ',float(obc[1]))

Obscenity percentage
Comment1:  40.78678608968722
Comment2:  0.11035531651982192


In [130]:
print('Threat percentage')
print('Comment1: ',float(thrt[0]))
print('Comment2: ',float(thrt[1]))

Threat percentage
Comment1:  0.9737237462675925
Comment2:  0.006309087714284689


In [131]:
print('Insult percentage')
print('Comment1: ',float(inslt[0]))
print('Comment2: ',float(inslt[1]))

Insult percentage
Comment1:  42.52109440180824
Comment2:  0.14919644247504035


In [132]:
print('Identity Hate percentage')
print('Comment1: ',float(idnt_ht[0]))
print('Comment2: ',float(idnt_ht[1]))

Identity Hate percentage
Comment1:  2.28454934607293
Comment2:  0.032143664185392634


# 5. Using the trained model for the test dataset

In [133]:
pp_comment_test=test['comment_text'].apply(div2word)

In [134]:
result = test[['id']].copy()

In [135]:
f_matrix_test = getAvgFeatureVecs(pp_comment_test, model,wv_size)

  # Remove the CWD from sys.path while we load stuff.


In [136]:
# Column k of the submission is filled by classifier k: column 1 of its
# predict_proba output for every test comment.
submission_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for k, column in enumerate(submission_columns):
    result[column] = MODEL[k].predict_proba(f_matrix_test)[:,1]

In [137]:
result.to_csv('submission.csv', encoding='utf-8', index=False)

## The model is successfully used to predict the toxicity of comments in the test dataset!!