# Baseline Model - Toxic Comment Classification

## Import

In [None]:
# TODO: Add all imports here
%matplotlib inline
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import roc_auc_score
import seaborn as sns

In [None]:
# Experiment configuration
RSEED = 42        # seed passed as random_state to the split and the classifier
TEST_SIZE = 0.33  # fraction of rows held out by train_test_split

# Dataset locations.
# NOTE(review): EVAL_PATH and TEST_PATH currently point at the training file;
# confirm whether separate evaluation/test files are intended.
TRAIN_PATH = 'data/train.csv'
EVAL_PATH = 'data/train.csv'
TEST_PATH = 'data/train.csv'

# Label columns used as targets for the multi-label task.
categories = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

## Building the Baseline Model

* load data
* define train test split
* define architecture and compile the model
* train the model
* evaluate the model

## Data Preparation

In [None]:
def load_data(path):
    """Read the CSV at *path* and return it as a pandas DataFrame.

    *path* is forwarded unchanged to ``pd.read_csv``, so anything that
    function accepts (a file path or a file-like object) works here.
    """
    return pd.read_csv(path)

In [None]:
def data_split(X, Y):
    """Split features *X* and targets *Y* into train and test partitions.

    The hold-out fraction comes from ``TEST_SIZE`` and the shuffle is seeded
    with ``RSEED``, so repeated calls on the same data give the same split.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(
        X,
        Y,
        test_size=TEST_SIZE,
        random_state=RSEED,
    )
    # train_test_split returns a list; callers receive a tuple.
    return tuple(partitions)

In [None]:
# Read the training CSV located at TRAIN_PATH into the working DataFrame.
df = load_data(TRAIN_PATH)

In [None]:
# Basic text cleaning

# Ordered (pattern, replacement) rules that expand English contractions.
# Order matters: the specific forms ("what's", "'scuse", "can't") must run
# before the generic suffix rules ("'s", "n't") that would otherwise consume
# part of them and leave fragments such as "cuse" behind.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"what's", "what is "),
        (r"'scuse", " excuse "),
        (r"'s", " "),
        (r"'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
    )
)
_NON_WORD_CHAR = re.compile(r"\W")
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text(text):
    """Normalise a raw comment string for bag-of-words vectorisation.

    Steps, in order: lower-case the text, expand common English
    contractions, replace every non-word character with a space, collapse
    runs of whitespace to a single space, and strip surrounding spaces.

    Args:
        text: The comment as a ``str``.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    text = _NON_WORD_CHAR.sub(" ", text)
    text = _WHITESPACE_RUN.sub(" ", text)
    return text.strip(" ")

In [None]:
# Normalise every comment in place by running it through clean_text
df['comment_text'] = df['comment_text'].map(clean_text)

## Binary Logistic Regression

This classifier treats the problem as binary: each comment is either toxic or non-toxic, and only the `toxic` label is used as the target.

In [None]:
# Hold-out split for the binary task: comment text as the feature,
# the `toxic` column as the target.
X, Y = df['comment_text'], df['toxic']
X_train, X_test, y_train, y_test = data_split(X, Y)

In [None]:
# Preview the first rows of the training features as a quick sanity check.
X_train.head()

In [None]:
# Binary baseline: raw text -> token counts -> TF-IDF weights -> logistic regression
LogReg_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        LogisticRegression(
            solver='lbfgs',
            max_iter=400,
            random_state=RSEED,
        ),
    ),
])

In [None]:
# Fit every pipeline stage (vectoriser, TF-IDF transform, classifier) on the training split.
LogReg_clf.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out comments.
y_pred = LogReg_clf.predict(X_test)

In [None]:
# ROC AUC is a ranking metric, so it must be scored on continuous outputs.
# Feeding it thresholded predictions collapses the ROC curve to a single
# operating point and understates the model's ranking quality.
# predict_proba columns follow the sorted class labels, so column 1 is the
# positive class (assumes the `toxic` target is encoded as 0/1).
y_score = LogReg_clf.predict_proba(X_test)[:, 1]

# Accuracy is still computed on the hard labels held in y_pred.
print('Average accuracy is {}'.format(accuracy_score(y_test, y_pred)))
print('Average ROC AUC Score is {}'.format(roc_auc_score(y_test, y_score)))

## Multi Label Logistic Regression

Multi-label classification assigns a set of target labels to each sample. A toxic comment can carry one or more of the following labels:

* toxic
* severe_toxic
* obscene
* insult
* identity_hate
* threat

#### Setup and Train the Classifier

In [None]:
# Hold-out split for the multi-label task: comment text as the feature,
# one target column per entry in `categories`.
X, Y = df['comment_text'], df[categories]
X_train, X_test, y_train, y_test = data_split(X, Y)

In [None]:
# Initialize Logistic Regression Pipeline
# `stop_words` is a set, but TfidfVectorizer documents the parameter as
# 'english', a list, or None, and recent scikit-learn releases reject other
# types during parameter validation. Passing a sorted list keeps the same
# vocabulary filter and makes the parameter deterministic.
# The 'sag' solver shuffles the data, so it is seeded with RSEED to make
# runs reproducible, matching the binary pipeline.
multi_label_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', OneVsRestClassifier(
        LogisticRegression(solver='sag', random_state=RSEED),
        n_jobs=1,
    )),
])

In [None]:
# Training: fit and evaluate one binary model per label column.
accuracy_list = []
roc_auc_scores = []

for category in categories:
    print('... Processing {}'.format(category))
    # Refit the whole pipeline on this label's binary target.
    multi_label_clf.fit(X_train, y_train[category])

    # Accuracy is measured on the thresholded predictions.
    prediction = multi_label_clf.predict(X_test)
    score = accuracy_score(y_test[category], prediction)
    accuracy_list.append(score)

    # ROC AUC needs continuous scores; thresholded labels would collapse the
    # curve to a single point. For a binary target predict_proba returns two
    # columns, and column 1 is the positive class (assumes 0/1 labels).
    positive_proba = multi_label_clf.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test[category], positive_proba)
    roc_auc_scores.append(roc_auc)

    print('ROC AUC score is {}'.format(roc_auc))
    print('Test accuracy is {}'.format(score))

In [None]:
def _mean(values):
    """Return the arithmetic mean of a non-empty list of scores."""
    return sum(values) / len(values)


# Unweighted average over the per-label scores collected during training.
avg_accuracy = _mean(accuracy_list)
print('Average accuracy is {}'.format(avg_accuracy))

avg_roc_auc = _mean(roc_auc_scores)
print('Average ROC AUC Score is {}'.format(avg_roc_auc))