In [9]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import collections
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from string import punctuation
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


#cleaning the available text to get clean data
def cleanText(text):

    stopw = stopwords.words('english')

    # remove 'not' from the stopwords while this can negate an insult
    # decided to add 'you' while in toxic conversations you is used to enhance te meaning
    #TODO: enhance comments
    stopw.remove('not')
    stopw.remove('you')
    stopw.remove('your')
    stopw.remove('you\'re')
    stopw.remove('are')

    #make the whole text lowercase so we don't make differences between capitalization
    text = text.lower()

    #subbing words to match cleaner words.
    text = re.sub("\'s", " ", text)
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'n't ", " not ", text, flags=re.IGNORECASE)
    text = re.sub(" n't ", " not ", text, flags=re.IGNORECASE)
    text = re.sub("I'm", "I am", text)
    text = re.sub("shouldn\'t", " should not ", text, flags=re.IGNORECASE)
    text = re.sub("were\'nt", " were not ", text, flags=re.IGNORECASE)
    text = re.sub("can't", " can not ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have ", text)
    text = re.sub("\'ll", " will ", text)

    #remove all the stopwords and remove all non letters
    words = word_tokenize(text)
    tokens = [word for word in words if word not in stopw]

    #remove all non letters in the dataset
    tokens = [word for word in tokens if re.match(r'[^\W\d]*$', word)]

    #remove all URLs in the dataset
    # TODO: write a regex for this

    text = ' '.join(tokens)

    #remove punctuation
    text = ''.join([word for word in text if word not in punctuation])

    #dealing with empty data line
    if type(text) != str or text == '':
        return ''


    cleaned_text = text
    return cleaned_text


In [10]:
# importing the training data using pandas.
df_train = pd.read_csv('Data/train.csv')
# save the original text to easily inspect it and derive what has to be cleaned
# df_train['comment_text'].to_csv('Data/OriginalText.csv')

# save the cleaned text to easily inspect it
# df_train['comment_text'].to_csv('Data/cleanedText.csv')

# clean the text
df_train['comment_text'] = df_train['comment_text'].apply(cleanText)


In [11]:
def averageVecValue(comment, model, vectorSize, vocab):
    Vector = np.zeros(vectorSize)
    
    for word in comment:
        if word in vocab:
            Vector += np.array(model.wv.get_vector(word))
    
    Vector_value = np.divide(Vector, vectorSize)
    
    return Vector_value.tolist()

In [12]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

def word2Vec(cleanedData, dataSet):
    dataSet['comment_text_tokenized'] = dataSet['comment_text'].apply(word_tokenize)
    tokens = df_train['comment_text_tokenized']
    
    vectorSize = 300
    word2vec = Word2Vec(tokens,min_count = 2, size = vectorSize)
    vocab = word2vec.wv.vocab
    
    vectorizedData = []
    for index, row in dataSet.iterrows():
        vectorizedData.append(averageVecValue(row['comment_text'], word2vec, vectorSize, vocab))
        
    
    return vectorizedData


In [34]:
# setting up the X training comments (vectorize them to be able to be used as input for model) and Y training labels
print("setting up training data ")
df_train = df_train.sample(n=25000, random_state=33)
Xtrain = word2Vec(df_train['comment_text'], df_train)
Ytrain = df_train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]


# importing the test data set to test the algorithm
df_test = pd.read_csv('Data/test.csv')


#run the algorithm for the first time and get an idea of the accuracy with the basic parameters.
print("started training the model")
rf_model = RandomForestClassifier()
# rf_model.fit(Xtrain, Ytrain)

# test the accuracy of the model on a split training dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

xtrain, xtest,ytrain,ytest = train_test_split(Xtrain,Ytrain,test_size=0.4, random_state=33)
rf_model.fittransform(xtrain, ytrain)
print("RF Accuracy: %0.2f%%" % (100 * rf_model.score(xtest, ytest)))



    
# # testing score
# score = metrics.f1_score(ytest, rf_model.predict(ytest), pos_label=list(set(ytest)))
# print(score)

# # training score
# print()
# score_train = metrics.f1_score(ytrain, rf_model.predict(ytrain), pos_label=list(set(ytrain)))
# print(score_train)




setting up training data 
started training the model




RF Accuracy: 88.29%


In [24]:
 # setting up de X test comments to test the algorithm
# Xtest = word2Vec(df_test)