# Baseline Model - Toxic Comment Classification

* load data and preprocess
* define train test split
* define architecture and compile the model
* train the model
* evaluate the model


## Multilabel Logistic Regression

Multi-label classification assigns a set of target labels to each sample. A toxic comment can carry one or more of the following labels:

* toxic
* severe_toxic
* obscene
* insult
* identity_hate
* threat

### Import & Setup

In [None]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns

from itertools import cycle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve 
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import label_binarize

# English stop words from the NLTK corpus, collected into a set; handed to the
# TF-IDF vectorizer when the classification pipeline is built further down.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand — confirm for fresh environments.
stop_words = set(stopwords.words('english'))

In [None]:
# Seed passed as random_state to the train/test split and to LogisticRegression
RSEED = 42
# Value passed as test_size to train_test_split inside data_split
TEST_SIZE = 0.33

# Locations of the input CSV files (relative paths)
TRAIN_PATH = 'data/train.csv'
TEST_PATH = 'data/test.csv'

# Names of the label columns selected from the training frame as targets
categories = ['toxic', 'severe_toxic', 'obscene', 'threat',  'insult', 'identity_hate']

### Preprocessing

TODO: import preprocessing notebook and use its functions

In [None]:
def load_data(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

In [None]:
def data_split(X, Y):
    """Split features ``X`` and targets ``Y`` into train and test partitions.

    The notebook-level ``TEST_SIZE`` is passed as ``test_size`` and ``RSEED``
    as ``random_state``.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RSEED)
    # Keep the tuple return type of the unpack-and-return form.
    return tuple(partitions)

In [None]:
# Basic text cleaning
# TODO: Replace with functions from preprocessing notebook

# Ordered (pattern, replacement) rules for expanding English contractions.
# Order matters: a specific rule must run before the generic rule that would
# otherwise consume part of its match ("'scuse" before "'s", "can't" before
# "n't"). Patterns are compiled once instead of on every call.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"what's", "what is "),
        (r"'scuse", " excuse "),
        (r"'s", " "),
        (r"'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
    )
)
_NON_WORD = re.compile(r"\W")
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text(text):
    """Normalize a raw comment for vectorization.

    Steps, in order: lowercase the text, expand a fixed set of English
    contractions, replace every non-word character with a space, collapse
    whitespace runs to a single space, and strip leading/trailing spaces.

    Args:
        text: The comment as a ``str``.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    text = _NON_WORD.sub(" ", text)
    text = _WHITESPACE_RUN.sub(" ", text)
    return text.strip(" ")

In [None]:
# Load the training CSV from TRAIN_PATH into the working DataFrame
df = load_data(TRAIN_PATH)

In [None]:
# Run every entry of the comment_text column through clean_text, in place
df['comment_text'] = df['comment_text'].map(clean_text)

### Multi Label Logistic Regression


#### Setup and Train the Classifier

In [None]:
# Features are the cleaned comments; targets are the columns named in `categories`
X, Y = df['comment_text'], df[categories]
# Hold out a test partition via the shared split helper
X_train, X_test, y_train, y_test = data_split(X, Y)

In [None]:
# Initialize Logistic Regression Pipeline: TF-IDF features feeding one
# logistic-regression classifier per label (one-vs-rest).
# TfidfVectorizer documents `stop_words` as 'english', a list, or None; recent
# scikit-learn versions reject a set during parameter validation, so the
# stop-word set is handed over as a (sorted, hence reproducible) list.
multi_label_clf = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', random_state=RSEED), n_jobs=1)),
            ])

In [None]:
# Fit the pipeline on the training partition, then predict labels for the held-out partition
multi_label_clf.fit(X_train, y_train)
y_pred = multi_label_clf.predict(X_test)

# Multilabel evaluation summary

# average=None yields one F1 value per label; their unweighted mean is reported
f1_scores = f1_score(y_test, y_pred, average=None)
avg_f1 = sum(f1_scores) / len(f1_scores)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

print(f"Test Set average F1 Score: {avg_f1 * 100}%\n\n")
print(f"Test Set Accuracy : {accuracy * 100}%\n\n")
print(f"Classification Report : \n\n{report}")

### Create submission file

In [None]:
def create_submission_probabilities(clf, X_test, submission_df=None, labels=None,
                                    output_path='data/submission.csv'):
    """Write per-label probabilities predicted by ``clf`` to a submission CSV.

    ``clf.predict_proba`` is called once; column ``i`` of its result is stored
    under the ``i``-th label name. (The previous version ignored ``clf``,
    re-ran the prediction for every label and copied column 1 into all of them.)

    Args:
        clf: Fitted estimator exposing ``predict_proba(X)`` that returns a 2-D
            array with one column per label.
        X_test: Inputs to predict on.
        submission_df: DataFrame that receives one column per label and is
            then written out. Defaults to the notebook-level ``submissions_df``.
        labels: Label names in the column order of ``predict_proba``.
            Defaults to the notebook-level ``categories``.
        output_path: Destination CSV path.

    Raises:
        ValueError: If the number of probability columns differs from the
            number of labels.
    """
    if submission_df is None:
        submission_df = submissions_df
    if labels is None:
        labels = categories

    # Predict once for all labels instead of once per label.
    probabilities = clf.predict_proba(X_test)
    if probabilities.shape[1] != len(labels):
        raise ValueError(
            f"predict_proba returned {probabilities.shape[1]} columns "
            f"but {len(labels)} labels were given"
        )

    for column, label in enumerate(labels):
        submission_df[label] = probabilities[:, column]

    submission_df.to_csv(output_path, index=False)

In [None]:
# Prepare Jigsaw Testset: load the file at TEST_PATH and clean its comments
# with the same clean_text function applied to the training comments
test_df = load_data(TEST_PATH)
test_df['comment_text'] = test_df['comment_text'].map(clean_text)

# Model input for the submission predictions
X_test_submission = test_df['comment_text']

# Frame that collects the predicted label columns; starts with the id column only
submissions_df = test_df['id'].to_frame()

In [None]:
# Optional step: run this cell to compute probabilities for the Kaggle test
# comments and have create_submission_probabilities write them to the submission CSV
create_submission_probabilities(multi_label_clf, X_test_submission)

### Receiver Operating Characteristic (ROC)

See also [sklearn documentation for ROC](https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html). 

In [None]:
# Y has one column per entry of `categories` and is used directly as the
# indicator matrix; re-binarizing it against a hard-coded class list
# [0..5] added nothing and broke whenever the number of categories changed.
# NOTE(review): assumes every label column already holds 0/1 values — confirm.
y = Y.to_numpy()
n_classes = y.shape[1]

# NOTE(review): X is a single text column, so n_features is 1 here; neither
# value is referenced by the cells shown.
n_samples, n_features = X.to_frame().shape

# train test split (array-valued targets this time)
X_train, X_test, y_train, y_test = data_split(X, y)

# Refit the one-vs-rest pipeline and take its per-label decision scores
y_score = multi_label_clf.fit(X_train, y_train).decision_function(X_test)

# Compute ROC curve and ROC area for each label column
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

#### Plot ROC curves for the multilabel problem (one curve per label)

In [None]:
# First aggregate all false positive rates
# NOTE(review): all_fpr is not referenced by any cell shown here; it looks
# like a leftover from the sklearn ROC example — confirm before removing.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Plot all ROC curves
plt.figure()

# Six distinct colors so that each of the six categories gets its own curve
# color; cycle() still guards against running out if more labels are added.
colors = cycle(["aqua", "darkorange", "cornflowerblue", "forestgreen", "crimson", "purple"])
for i, color in zip(range(n_classes), colors):
    plt.plot(
        fpr[i],
        tpr[i],
        color=color,
        lw=2,
        label=f"ROC curve of class {categories[i]} (area = {roc_auc[i]:0.5f})",
    )

# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], "k--", lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# Title previously carried a stray quote and parenthesis
plt.title("Receiver Operating Characteristic")
plt.legend(loc="lower right")
plt.show()

#### Area under ROC for the multilabel problem

In [None]:
# Predicted probabilities for the held-out comments
y_prob = multi_label_clf.predict_proba(X_test)

# Same scorer call twice, differing only in the averaging mode
macro_roc_auc_ovr, weighted_roc_auc_ovr = (
    roc_auc_score(y_test, y_prob, multi_class="ovr", average=mode)
    for mode in ("macro", "weighted")
)
print(
    f"One-vs-Rest ROC AUC scores:\n{macro_roc_auc_ovr:.6f} (macro),\n"
    f"{weighted_roc_auc_ovr:.6f} (weighted by prevalence)"
)