In [1]:
%pylab inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier


# Plot defaults for the notebook: 150-dpi figures and seaborn's "whitegrid" style.
plt.rcParams.update({'figure.dpi': 150})
sns.set(style="whitegrid")

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the cleaned training set and the cleaned Kaggle test set (in that order)
# from the working directory.
data, kaggle_test = (
    pd.read_csv(csv_name)
    for csv_name in ("cleaned_train_data.csv", "cleaned_test_data.csv")
)

# Columns of `data` that are used as prediction targets, one model per column.
categories = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Naive Bayes 0.91951

In [4]:
# Comment texts as arrays; missing (NaN) test comments are replaced by "UNK".
X_train_all = data["comment_text"].values
X_test_kaggle = kaggle_test["comment_text"].fillna("UNK").values

# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training comments only and reused to encode the test set.
count_vect = CountVectorizer(stop_words="english")
X_train_all_vec = count_vect.fit_transform(X_train_all)
X_test_kaggle_vec = count_vect.transform(X_test_kaggle)

# Report the shape of both document-term matrices.
print('X_train_all_vec.shape: ', X_train_all_vec.shape)
print('X_test_kaggle_vec.shape: ', X_test_kaggle_vec.shape)

X_train_all_vec.shape:  (159571, 59188)
X_test_kaggle_vec.shape:  (153164, 59188)


In [26]:
# One multinomial naive Bayes model per label, trained on the full training
# matrix; each model contributes one probability column to the submission.
test_output = pd.DataFrame()
for category in categories:
    clf = MultinomialNB()
    clf.fit(X_train_all_vec, data[category])
    # Keep column 1 of predict_proba, i.e. the second entry of clf.classes_
    # (presumably label 1 - assumes the targets are binary 0/1).
    predictProbs = clf.predict_proba(X_test_kaggle_vec)[:, 1]
    test_output[category] = predictProbs

# Put the test ids in front of the probability columns and save the result.
test_output = pd.concat([kaggle_test["id"], test_output], axis=1)
test_output.to_csv("naive_bayes_submission.csv", index=False)
        

# Decision Tree 0.84331

In [31]:
# Comment texts as arrays; missing (NaN) test comments are replaced by "UNK".
X_train_all = data["comment_text"].values
X_test_kaggle = kaggle_test["comment_text"].fillna("UNK").values

# TF-IDF features with English stop words removed. The vectorizer is fitted
# on the training comments only and then applied unchanged to the test set.
tfidf_vect = TfidfVectorizer(stop_words="english")
X_train_all_vec = tfidf_vect.fit_transform(X_train_all)
X_test_kaggle_vec = tfidf_vect.transform(X_test_kaggle)

# Report the shape of both feature matrices.
print('X_train_all_vec.shape: ', X_train_all_vec.shape)
print('X_test_kaggle_vec.shape: ', X_test_kaggle_vec.shape)

X_train_all_vec.shape:  (159571, 59188)
X_test_kaggle_vec.shape:  (153164, 59188)


In [32]:
# Maximum tree depth used for each label (values presumably tuned elsewhere).
category_dep = {'toxic': 63, 'severe_toxic': 6, 'obscene': 20, 'threat': 7, 'insult': 15, 'identity_hate': 6}

# One entropy-based decision tree per label; each tree contributes one
# probability column to the submission.
test_output = pd.DataFrame()
for category in categories:
    clf = DecisionTreeClassifier(
        criterion="entropy",
        max_depth=category_dep[category],
        # Features are permuted at every split, so equally good splits can be
        # chosen differently between runs; a fixed seed makes reruns identical.
        random_state=42,
    )
    clf.fit(X_train_all_vec, data[category])
    # Keep column 1 of predict_proba, i.e. the second entry of clf.classes_
    # (presumably label 1 - assumes the targets are binary 0/1).
    predictProbs = clf.predict_proba(X_test_kaggle_vec)[:, 1]
    test_output[category] = predictProbs

# Put the test ids in front of the probability columns and save the result.
test_output = pd.concat([kaggle_test["id"], test_output], axis=1)
test_output.to_csv("decision_tree_submission.csv", index=False)

# Logistic Regression 0.97452

In [5]:
# Comment texts as arrays; missing (NaN) test comments are replaced by "UNK".
X_train_all = data["comment_text"].values
X_test_kaggle = kaggle_test["comment_text"].fillna("UNK").values

# TF-IDF features (English stop words removed), fitted on the training
# comments and then used to transform the test comments.
tfidf_vect = TfidfVectorizer(stop_words="english")
X_train_all_vec = tfidf_vect.fit_transform(X_train_all)
X_test_kaggle_vec = tfidf_vect.transform(X_test_kaggle)

# Report the shape of both feature matrices.
print('X_train_all_vec.shape: ', X_train_all_vec.shape)
print('X_test_kaggle_vec.shape: ', X_test_kaggle_vec.shape)

X_train_all_vec.shape:  (159571, 59188)
X_test_kaggle_vec.shape:  (153164, 59188)


In [6]:
# Value passed as C to the logistic regression fitted for each label.
# NOTE: this rebinds `categories` from the list of label names defined when the
# data was loaded to a dict. Loops written as `for category in categories`
# still visit the same names, because iterating a dict yields its keys in
# insertion order.
categories = {
    'toxic': 4,
    'severe_toxic': 0.4,
    'obscene': 4,
    'threat': 10,
    'insult': 4,
    'identity_hate': 4,
}

# Plain list of the label names, in the same order as the dict keys.
categoryName = list(categories)

In [9]:
# One logistic regression per label, solved by liblinear in its dual form;
# the C value for each label is looked up in the `categories` dict.
test_output = pd.DataFrame()
for category in categories:
    clf = LogisticRegression(solver="liblinear", dual=True, C=categories[category])
    clf.fit(X_train_all_vec, data[category])
    # Keep column 1 of predict_proba, i.e. the second entry of clf.classes_
    # (presumably label 1 - assumes the targets are binary 0/1).
    predictProbs = clf.predict_proba(X_test_kaggle_vec)[:, 1]
    test_output[category] = predictProbs

# Put the test ids in front of the probability columns and save the result.
test_output = pd.concat([kaggle_test["id"], test_output], axis=1)
test_output.to_csv("LR_submission.csv", index=False)

# SVM 0.97700

In [41]:
# Comment texts as arrays; missing (NaN) test comments are replaced by "UNK".
X_train_all = data["comment_text"].values
X_test_kaggle = kaggle_test["comment_text"].fillna("UNK").values

# TF-IDF features (English stop words removed): the vectorizer learns its
# vocabulary and weights from the training comments, then encodes the test set.
tfidf_vect = TfidfVectorizer(stop_words="english")
X_train_all_vec = tfidf_vect.fit_transform(X_train_all)
X_test_kaggle_vec = tfidf_vect.transform(X_test_kaggle)

# Report the shape of both feature matrices.
print('X_train_all_vec.shape: ', X_train_all_vec.shape)
print('X_test_kaggle_vec.shape: ', X_test_kaggle_vec.shape)

X_train_all_vec.shape:  (159571, 59188)
X_test_kaggle_vec.shape:  (153164, 59188)


In [14]:
# One linear SVM (C=0.1) per label. LinearSVC has no predict_proba, so it is
# wrapped in CalibratedClassifierCV (10 folds) to obtain probabilities.
test_output = pd.DataFrame()
for category in categories:
    clf = CalibratedClassifierCV(LinearSVC(C=0.1), cv=10)
    clf.fit(X_train_all_vec, data[category])
    # Keep column 1 of predict_proba, i.e. the second entry of clf.classes_
    # (presumably label 1 - assumes the targets are binary 0/1).
    predictProbs = clf.predict_proba(X_test_kaggle_vec)[:, 1]
    test_output[category] = predictProbs

# Put the test ids in front of the probability columns and save the result.
test_output = pd.concat([kaggle_test["id"], test_output], axis=1)
test_output.to_csv("SVM_submission.csv", index=False)


# Random Forest 0.93086

In [44]:
# Comment texts as arrays; missing (NaN) test comments are replaced by "UNK".
X_train_all = data["comment_text"].values
X_test_kaggle = kaggle_test["comment_text"].fillna("UNK").values

# TF-IDF features with English stop words removed, fitted on the training
# comments only and then applied unchanged to the test comments.
tfidf_vect = TfidfVectorizer(stop_words="english")
X_train_all_vec = tfidf_vect.fit_transform(X_train_all)
X_test_kaggle_vec = tfidf_vect.transform(X_test_kaggle)

# Report the shape of both feature matrices.
print('X_train_all_vec.shape: ', X_train_all_vec.shape)
print('X_test_kaggle_vec.shape: ', X_test_kaggle_vec.shape)

X_train_all_vec.shape:  (159571, 59188)
X_test_kaggle_vec.shape:  (153164, 59188)


In [50]:
# Number of trees in the forest for each label. The name `category_dep` is
# kept unchanged, but here the values are passed as n_estimators, not a depth.
category_dep = {'toxic': 35, 'severe_toxic': 10, 'obscene': 15, 'threat': 10, 'insult': 35, 'identity_hate': 15}

# One entropy-based random forest per label; each forest contributes one
# probability column to the submission.
test_output = pd.DataFrame()
for category in categories:
    clf = RandomForestClassifier(
        criterion="entropy",
        n_estimators=category_dep[category],
        # Bootstrapping and feature sub-sampling are random; a fixed seed makes
        # the submission reproducible from one run to the next.
        random_state=42,
    )
    clf.fit(X_train_all_vec, data[category])
    # Keep column 1 of predict_proba, i.e. the second entry of clf.classes_
    # (presumably label 1 - assumes the targets are binary 0/1).
    predictProbs = clf.predict_proba(X_test_kaggle_vec)[:, 1]
    test_output[category] = predictProbs

# Put the test ids in front of the probability columns and save the result.
test_output = pd.concat([kaggle_test["id"], test_output], axis=1)
test_output.to_csv("random_forest_submission.csv", index=False)

# Boosting 0.95259

In [3]:
# Comment texts as arrays; missing (NaN) test comments are replaced by "UNK".
X_train_all = data["comment_text"].values
X_test_kaggle = kaggle_test["comment_text"].fillna("UNK").values

# TF-IDF features (English stop words removed): fitted on the training
# comments, then reused as-is to transform the test comments.
tfidf_vect = TfidfVectorizer(stop_words="english")
X_train_all_vec = tfidf_vect.fit_transform(X_train_all)
X_test_kaggle_vec = tfidf_vect.transform(X_test_kaggle)

# Report the shape of both feature matrices.
print('X_train_all_vec.shape: ', X_train_all_vec.shape)
print('X_test_kaggle_vec.shape: ', X_test_kaggle_vec.shape)

X_train_all_vec.shape:  (159571, 59188)
X_test_kaggle_vec.shape:  (153164, 59188)


In [4]:
# Number of boosting rounds for each label. The name `category_dep` is kept
# unchanged, but here the values are passed as n_estimators, not a depth.
category_dep = {'toxic': 300, 'severe_toxic': 150, 'obscene': 225, 'threat': 50, 'insult': 250, 'identity_hate': 450}

# One AdaBoost ensemble per label; each ensemble contributes one probability
# column to the submission.
test_output = pd.DataFrame()
for category in categories:
    clf = AdaBoostClassifier(
        n_estimators=category_dep[category],
        # Seeds the base estimators so the submission is reproducible from one
        # run to the next.
        random_state=42,
    )
    clf.fit(X_train_all_vec, data[category])
    # Keep column 1 of predict_proba, i.e. the second entry of clf.classes_
    # (presumably label 1 - assumes the targets are binary 0/1).
    predictProbs = clf.predict_proba(X_test_kaggle_vec)[:, 1]
    test_output[category] = predictProbs

# Put the test ids in front of the probability columns and save the result.
test_output = pd.concat([kaggle_test["id"], test_output], axis=1)
test_output.to_csv("boosting_submission.csv", index=False)