In [1]:
#LogisticRegression is used

In [2]:
import re
import string
from statistics import mean
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import hamming_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC



import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [26]:
# Load the training data, the test comments and the test labels (in that order).
train, test, test_y = (
    pd.read_csv(csv_path)
    for csv_path in ("SE/train(T).csv", "SE/test(T).csv", "SE/test_labels(T).csv")
)

In [28]:
# Example of a clean comment: the text stored under index label 0.
train.loc[0, "comment_text"]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [30]:
# Example of a toxic comment: among the rows flagged toxic, take the row at
# position 1 and the column at position 1 (presumably comment_text -- confirm
# against the CSV column order).
toxic_rows = train[train["toxic"] == 1]
toxic_rows.iloc[1, 1]




#Feature-engineering

In [33]:
#Data cleaning

In [35]:
# Target columns; one binary classifier is trained and scored per label.
test_labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [37]:
# Characters that separate tokens: ASCII punctuation, digits, carriage returns,
# tabs and newlines. Compiled once here instead of on every tokenize() call.
_TOKEN_SEPARATORS = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def tokenize(text):
    """Split a comment into a list of normalized, lemmatized word tokens.

    Processing steps, in order:
      1. lowercase the text;
      2. replace punctuation, digits, carriage returns, tabs and newlines
         with a space;
      3. split on single spaces;
      4. strip non-ASCII characters from every word;
      5. lemmatize every word with ``WordNetLemmatizer`` (no part-of-speech
         argument is passed);
      6. drop words shorter than 3 characters (this also removes the empty
         strings produced by consecutive spaces in step 3).

    Stop words are NOT removed by this function; a caller that needs that
    must do it itself (for example through a vectorizer's ``stop_words``
    option).

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list of str
        Tokens in order of appearance; duplicates are kept.
    """
    without_punct = _TOKEN_SEPARATORS.sub(" ", text.lower())
    # Remove any non-ASCII characters from each candidate word.
    words = [word.encode('ascii', 'ignore').decode('ascii')
             for word in without_punct.split(' ')]
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(word) for word in words)
    return [lemma for lemma in lemmas if len(lemma) > 2]

In [39]:
# TF-IDF features over single words, using the custom tokenizer above.
# min_df=10 is passed so that rare terms are excluded from the vocabulary.
vector = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='unicode',
    use_idf=True,
    min_df=10,
)
# Fit the vocabulary / IDF weights on the training comments only, then apply
# the same fitted transformation to the test comments.
X_train = vector.fit_transform(train['comment_text'])
X_test = vector.transform(test['comment_text'])

#Modeling and Evaluation

In [41]:
# Baseline classifier: logistic regression created with its default parameters.
clf2 = LogisticRegression()

In [42]:
# Iterate through each label and return the cross-validation F1 and Recall scores.
def cross_validation_score(classifier, X_train, y_train, labels=None, cv=10):
    """Cross-validate ``classifier`` separately on every label column.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn style classifier; it is cloned and refitted
        inside the cross-validation, so the passed object stays unfitted.
    X_train : array-like or sparse matrix
        Training features.
    y_train : mapping or DataFrame
        Indexable by label name; ``y_train[label]`` is the binary target
        for that label.
    labels : iterable of str, optional
        Label names to evaluate. Defaults to the notebook-level
        ``test_labels`` list.
    cv : int or cross-validation splitter, default 10
        Forwarded to ``cross_validate``.

    Returns
    -------
    list of list
        One ``[model_name, label, mean_recall, mean_f1]`` row per label.
    """
    if labels is None:
        labels = test_labels
    name = type(classifier).__name__

    methods = []
    for label in labels:
        # A single cross-validation run yields both metrics. Calling
        # cross_val_score once per metric would refit every fold twice.
        cv_results = cross_validate(classifier, X_train, y_train[label],
                                    cv=cv, scoring=['recall', 'f1'])
        methods.append([name, label,
                        cv_results['test_recall'].mean(),
                        cv_results['test_f1'].mean()])

    return methods

In [43]:
# Calculating the cross validation F1 and Recall score for the Logistic Regression baseline.

methods2_cv = pd.DataFrame(cross_validation_score(clf2, X_train, train))
# Completion marker: prints 1 once the cross-validation above has finished.
print(1)


1


In [44]:
# Name the columns of the cross-validation results and display the summary.
cv_summary_columns = ['Model', 'Label', 'Recall', 'F1']
methods2_cv.columns = cv_summary_columns
meth_cv = methods2_cv.reset_index()
meth_cv.loc[:, cv_summary_columns]

Unnamed: 0,Model,Label,Recall,F1
0,LogisticRegression,toxic,0.612528,0.732122
1,LogisticRegression,severe_toxic,0.255185,0.35121
2,LogisticRegression,obscene,0.638658,0.748026
3,LogisticRegression,threat,0.131605,0.218787
4,LogisticRegression,insult,0.523292,0.636648
5,LogisticRegression,identity_hate,0.212837,0.323868


In [45]:
#Based on the cross validation above, the linear SVC and Logistic Regression models perform better overall (only the Logistic Regression results are shown in the cells above).
#As a baseline model, Multinomial Naive Bayes does not perform well, especially for the threat and identity_hate labels, because these two
#labels have the fewest observations.

In [46]:
#Now we want to see how the model performs on actual predictions for the test dataset.

In [47]:
# Calculate Hamming loss, F1 and Recall for each label on the test dataset.

def score(classifier, X_train, y_train, X_test, y_test, labels=None,
          average="weighted"):
    """Fit ``classifier`` once per label and evaluate it on the test set.

    Parameters
    ----------
    classifier : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is refitted in place
        for every label, so after the call it holds the last label's fit.
    X_train, X_test : array-like or sparse matrix
        Training and test features.
    y_train : DataFrame
        Training targets, one column per label.
    y_test : DataFrame
        Test targets with an ``id`` column and one column per label. Rows
        whose label value is -1 are excluded from scoring (presumably the
        dataset's marker for unscored rows -- confirm against the data source).
    labels : iterable of str, optional
        Label names to evaluate. Defaults to the notebook-level
        ``test_labels`` list.
    average : str, default "weighted"
        Forwarded to ``recall_score`` and ``f1_score``. NOTE: with
        "weighted", recall is the support-weighted mean of per-class recall,
        which equals plain accuracy; on imbalanced labels it is dominated by
        the negative class and is not comparable with the positive-class
        'recall' / 'f1' scorers used in ``cross_validation_score``. Pass
        ``average="binary"`` to get comparable numbers.

    Returns
    -------
    hloss : list of list
        ``[[model_name, hamming_loss]]`` computed over all labels at once.
    methods : list of list
        One ``[model_name, label, recall, f1, confusion_matrix]`` row per label.
    """
    if labels is None:
        labels = test_labels
    labels = list(labels)
    name = type(classifier).__name__

    # Take the ids from the ground-truth frame that was passed in rather than
    # from a notebook-level global, so the function only depends on its arguments.
    predict_df = pd.DataFrame({'id': y_test['id']})
    methods = []

    for label in labels:
        classifier.fit(X_train, y_train[label])
        predicted = np.asarray(classifier.predict(X_test))
        predict_df[label] = predicted

        # Compute the "has ground truth" mask once and reuse it for all metrics.
        scored = (y_test[label] != -1).to_numpy()
        y_true = y_test.loc[scored, label]
        y_pred = predicted[scored]

        recall = recall_score(y_true, y_pred, average=average)
        f1 = f1_score(y_true, y_pred, average=average)
        conf_mat = confusion_matrix(y_true, y_pred)

        methods.append([name, label, recall, f1, conf_mat])

    # Hamming loss compares complete label rows, so keep only rows where every
    # label is known, and select columns by name instead of by position.
    fully_scored = (y_test[labels] != -1).all(axis=1).to_numpy()
    hamming_loss_score = hamming_loss(y_test.loc[fully_scored, labels],
                                      predict_df.loc[fully_scored, labels])
    hloss = [[name, hamming_loss_score]]

    return hloss, methods

In [48]:
# Calculating the Hamming loss, F1 and Recall scores on the test set for the Logistic Regression baseline.
h2, methods2 = score(clf2, X_train, train, X_test, test_y)


In [49]:
# Turn the per-label test results into a dataframe and display the summary
# (the confusion matrices are kept in the frame but left out of the display).
test_summary_columns = ['Model', 'Label', 'Recall', 'F1', 'Confusion_Matrix']
methods2 = pd.DataFrame(methods2)
methods2.columns = test_summary_columns
meth = methods2.reset_index()
meth.loc[:, test_summary_columns[:4]]

Unnamed: 0,Model,Label,Recall,F1
0,LogisticRegression,toxic,0.934321,0.936142
1,LogisticRegression,severe_toxic,0.993154,0.992851
2,LogisticRegression,obscene,0.966066,0.964416
3,LogisticRegression,threat,0.996374,0.995736
4,LogisticRegression,insult,0.963644,0.961003
5,LogisticRegression,identity_hate,0.990419,0.988397
