# Baseline Model - Toxic Comment Classification

* load data and preprocess
* define train test split
* define architecture and compile the model
* train the model
* evaluate the model


## Multilabel Logistic Regression

Multi-label classification assigns a set of target labels to each sample. A toxic comment can carry one or more of the following labels:

* toxic
* severe_toxic
* obscene
* insult
* identity_hate
* threat

### Import

In [None]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve 
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline


# English stop words taken from the NLTK stopwords corpus, stored as a set.
# NOTE(review): presumably requires the corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm for this environment.
stop_words = set(stopwords.words('english'))

In [None]:
# Seed handed to train_test_split as random_state (see data_split).
RSEED = 42
# Fraction of the samples held out as the test split (see data_split).
TEST_SIZE = 0.33

# NOTE(review): all three paths point at the same file, and EVAL_PATH /
# TEST_PATH are not used anywhere in the visible code — confirm whether
# separate evaluation/test files were intended.
TRAIN_PATH = 'data/train.csv'
EVAL_PATH = 'data/train.csv'
TEST_PATH = 'data/train.csv'

# Label columns of the data frame; each one is used as a separate binary target.
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

### Preprocessing

At this point, we can call functions from the preprocessing notebook.

TODO: import preprocessing notebook

In [None]:
def load_data(path):
    """Read the CSV at ``path`` and return it as a pandas DataFrame.

    ``path`` is passed straight to ``pandas.read_csv``.
    """
    return pd.read_csv(path)

In [None]:
def data_split(X, Y):
    """Split features ``X`` and targets ``Y`` into train and test parts.

    The test fraction is ``TEST_SIZE`` and the shuffle seed is ``RSEED``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RSEED)
    return tuple(parts)

In [None]:
# Basic text cleaning
# TODO: Replace with functions from preprocessing notebook

# Ordered (pattern, replacement) pairs for expanding contractions.
# Order matters:
#   * "'scuse" must run before the generic "'s" rule, otherwise "'s" is
#     stripped first and "'scuse" can never match;
#   * "can't" must run before the generic "n't" rule.
_CONTRACTION_SUBS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"what's", "what is "),
        (r"'scuse", " excuse "),
        (r"'s", " "),
        (r"'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
    )
)
# Raw strings avoid the invalid-escape-sequence warnings that '\W' and '\s'
# trigger in plain string literals.
_NON_WORD = re.compile(r"\W")
_WHITESPACE = re.compile(r"\s+")


def clean_text(text):
    """Normalize a comment string for vectorization.

    The text is lower-cased, common English contractions are expanded,
    every non-word character is replaced by a space, runs of whitespace
    are collapsed to a single space, and leading/trailing spaces are
    removed.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_SUBS:
        text = pattern.sub(replacement, text)
    text = _NON_WORD.sub(" ", text)
    text = _WHITESPACE.sub(" ", text)
    return text.strip(" ")

In [None]:
# Read the CSV at TRAIN_PATH into a DataFrame.
df = load_data(TRAIN_PATH)

In [None]:
# Run clean_text over every entry of the comment_text column, in place.
df['comment_text'] = df['comment_text'].map(clean_text)

### Multi Label Logistic Regression


#### Setup and Train the Classifier

In [None]:
# Train Test Split
# Features are the comment_text column; targets are the columns listed in `categories`.
X = df['comment_text']
Y = df[categories]
# data_split applies TEST_SIZE and RSEED.
X_train, X_test, y_train, y_test = data_split(X, Y)

In [None]:
# Initialize Logistic Regression Pipeline
# TF-IDF features feed a one-vs-rest logistic regression.
# * stop_words is converted to a (sorted) list: recent scikit-learn versions
#   validate this parameter and only accept 'english', a list, or None, so
#   passing the set directly fails at fit time.
# * random_state=RSEED makes the 'sag' solver's data shuffling reproducible,
#   in line with the seeded train/test split.
multi_label_clf = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', random_state=RSEED), n_jobs=1)),
            ])

In [None]:
# Per-category results, appended in the order of `categories`.
accuracy_list = []
roc_auc_scores = []
fpr_list = []
tpr_list = []
treshold_list = []


for category in categories:
    print('... Processing {}'.format(category))
    y_true = y_test[category]

    # Fit the pipeline ONCE on this category's binary target; both the hard
    # predictions and the decision scores below come from this same fit, so
    # accuracy and ROC describe the same model (and training time is halved
    # compared to refitting for the scores).
    multi_label_clf.fit(X_train, y_train[category])

    # Test accuracy from the hard class predictions.
    prediction = multi_label_clf.predict(X_test)
    score = accuracy_score(y_true, prediction)
    accuracy_list.append(score)
    # TODO: Add F1 score

    # ROC curve and ROC AUC from the continuous decision scores.
    y_score = multi_label_clf.decision_function(X_test)
    fpr, tpr, tresh = roc_curve(y_true, y_score, pos_label=1, drop_intermediate=False)
    fpr_list.append(fpr)
    tpr_list.append(tpr)
    treshold_list.append(tresh)
    roc_auc = roc_auc_score(y_true, y_score)
    roc_auc_scores.append(roc_auc)

    print('ROC AUC score is {}'.format(roc_auc))
    print('Test accuracy is {}'.format(score))

In [None]:
# Draw one ROC curve per category on a single figure.
plt.subplots(1, figsize=(10,10))
plt.title('Receiver Operating Characteristic')
# The result lists were filled in the order of `categories`, so zipping them
# pairs every curve with its category name; the name goes into the legend so
# the curves can be told apart.
for cat_name, cat_fpr, cat_tpr, cat_auc in zip(categories, fpr_list, tpr_list, roc_auc_scores):
    plt.plot(cat_fpr, cat_tpr, label='%s ROC curve (area = %0.2f)' % (cat_name, cat_auc))
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], ls="--", label='baseline')
# Grey guide lines: the left edge (x = 0) and the top edge (y = 1).
plt.plot([0, 0], [1, 0], c=".7")
plt.plot([1, 1], c=".7")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Unweighted mean of the per-category test accuracies.
accuracy_total = sum(accuracy_list)
avg_accuracy = accuracy_total / len(accuracy_list)
print('Average accuracy is {}'.format(avg_accuracy))

# Unweighted mean of the per-category ROC AUC scores.
roc_auc_total = sum(roc_auc_scores)
avg_roc_auc = roc_auc_total / len(roc_auc_scores)
print('Average ROC AUC Score is {}'.format(avg_roc_auc))

In [None]:
# Disabled placeholder: the triple-quoted block below is an unused string
# literal holding draft code, so none of it executes.
# NOTE(review): the draft refers to `y_pred`, which is not defined in the
# visible code — it needs multi-label predictions before it can be enabled.
'''
# TODO: Classification Report Multi-Label
# Classification report
model_report = classification_report(y_test, y_pred)
print(model_report)

# TODO: Confusion Matrix Mult-Label
# Confusion Matrix
model_conf = confusion_matrix(y_test, y_pred)
print(model_conf)
'''
pass