In [6]:
import time

# import necessary libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#text libraries
import re
import nltk
from nltk.corpus import stopwords
import gensim

# classifier imports
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the training and test comment sets from their CSV files
train = pd.read_csv('Toxic_comments/train.csv')
test = pd.read_csv('Toxic_comments/test.csv')
# Independent one-column frame holding the test ids; it is the basis of the submission table
result = test.loc[:, ['id']].copy()
# Print the first three training rows to get a first impression of the data
print(train.iloc[:3])

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  


In [3]:
# Replace missing comments with the placeholder string 'none' so that the regex-based
# tokenization further down only ever receives strings.
# The result is assigned back to the column instead of calling an in-place method on the
# column selection: the chained in-place form emits a warning in recent pandas versions
# and does not update the frame when Copy-on-Write is active.
test['comment_text'] = test['comment_text'].fillna(value='none')
train['comment_text'] = train['comment_text'].fillna(value='none')

In [4]:
def text_to_words(raw_text, remove_stopwords=False):
    """Split a raw text into lowercase alphanumeric tokens.

    Every character that is not an ASCII letter or a digit is replaced by a
    space, the text is lowercased and then split on whitespace.

    Parameters
    ----------
    raw_text : str
        The text to tokenize.
    remove_stopwords : bool, optional
        When True, tokens found in NLTK's English stop word list are dropped.

    Returns
    -------
    list of str
        The tokens in their original order.
    """
    # Keep only ASCII letters and digits; everything else acts as a separator
    cleaned = re.sub("[^0-9a-zA-Z]", " ", raw_text)
    tokens = cleaned.lower().split()
    if not remove_stopwords:
        return tokens
    # A set gives constant-time membership checks while filtering
    stop_set = set(stopwords.words("english"))
    return [token for token in tokens if token not in stop_set]

# Tokenize every comment of both data sets; stop words are kept
sentences_train = train['comment_text'].apply(
    lambda text: text_to_words(text, remove_stopwords=False)
)
sentences_test = test['comment_text'].apply(
    lambda text: text_to_words(text, remove_stopwords=False)
)
# Print the first three token lists as a sample
print(sentences_train.iloc[:3])

0    [explanation, why, the, edits, made, under, my...
1    [d, aww, he, matches, this, background, colour...
2    [hey, man, i, m, really, not, trying, to, edit...
Name: comment_text, dtype: object


In [8]:
# Load the word vectors stored in binary word2vec format in the GoogleNews file
google_model = gensim.models.KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

In [None]:
num_features = 300
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of all in-vocabulary words of one text.

    Parameters
    ----------
    words : iterable of str
        The tokens of a single text.
    model : word-vector container
        Any object supporting ``word in model`` and ``model[word]`` (for
        example gensim ``KeyedVectors``). If the object exposes a ``wv``
        attribute (a full word2vec model), that attribute is used instead.
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        A float32 vector of length ``num_features`` holding the mean of the
        vectors of the known words, or all zeros when no word is known.
    """
    # Resolve the vector store once; fall back to `model` itself when it has no `.wv`
    vectors = getattr(model, "wv", model)
    # Pre-initialize the accumulator (float32, like the word vectors)
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        # Membership is tested directly on the vector store; building a set of the
        # complete vocabulary on every call would cost time proportional to the
        # vocabulary size for each single text.
        if word in vectors:
            nwords += 1
            # In-place accumulation keeps the float32 accumulator
            featureVec += vectors[word]
    # Guard against division by zero when no word was found; the result stays all zeros
    if nwords == 0:
        nwords = 1
    return np.divide(featureVec, nwords)

def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors for a collection of texts.

    Parameters
    ----------
    reviews : sized iterable
        The texts, each given as a list of words.
    model : word-vector container
        Passed through unchanged to ``makeFeatureVec``.
    num_features : int
        Length of each word vector, i.e. the number of matrix columns.

    Returns
    -------
    numpy.ndarray
        A float32 array of shape ``(len(reviews), num_features)`` whose row
        ``k`` is the averaged vector of the ``k``-th text.
    """
    # Allocate the full result up front and fill it row by row
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    for row, review in enumerate(reviews):
        reviewFeatureVecs[row] = makeFeatureVec(review, model, num_features)
    return reviewFeatureVecs

# One averaged word-vector row per comment, first for the training and then for the test set
f_matrix_train, f_matrix_test = (
    getAvgFeatureVecs(sentences, google_model, num_features)
    for sentences in (sentences_train, sentences_test)
)
# One target column per label: six separate models are trained, one for each of them
y = [
    train[label]
    for label in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
]

  # Remove the CWD from sys.path while we load stuff.


In [9]:
# Build six identically configured MLP classifiers (three hidden layers of 30 units each),
# one per target label
model = [
    MLPClassifier(solver='adam', hidden_layer_sizes=(30,30,30), random_state=1)
    for _ in range(6)
]
print(model)

[MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False), MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False), MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, ep

In [None]:
batch_size = 10000
total_rows = f_matrix_train.shape[0]
classes = [0,1]
pos = 0
duration = 0
start_train = time.time()
# Incremental training: walk once over the training rows in chunks and hand every chunk
# to partial_fit of each of the six models. The loop ends when all rows were used or
# when 2500 seconds have elapsed (checked before each chunk).
while pos < total_rows and duration < 2500:
    # The last chunk may be shorter than the regular batch size
    batch_size = min(batch_size, total_rows - pos)
    chunk_end = pos + batch_size
    X_p = f_matrix_train[pos:chunk_end]
    for i in range(6):
        y_p = y[i][pos:chunk_end]
        model[i].partial_fit(X_p, y_p, classes)
    pos = chunk_end
    # Time spent on training so far, in seconds
    duration = time.time() - start_train
    print("Pos %d/%d duration %d" % (pos, total_rows, duration))

In [None]:
# Store, for every label, the predicted probability of the positive class (column 1 of
# predict_proba) of the corresponding model as a column of the submission frame
for classifier, label in zip(
    model,
    ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'),
):
    result[label] = classifier.predict_proba(f_matrix_test)[:,1]