# Baseline Model - Toxic Comment Classification

* load data and preprocess
* define train test split
* define architecture and compile the model
* train the model
* evaluate the model


## Multilabel Logistic Regression

Multi-label classification assigns a set of target labels to each sample. A toxic comment can carry one or more of the following labels:

* toxic
* severe_toxic
* obscene
* insult
* identity_hate
* threat

### Import & Setup

In [1]:
%matplotlib inline

import re

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from nltk.corpus import stopwords
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

# English stop words from the NLTK corpus, as a set; handed to the TF-IDF vectorizer
# when the classification pipeline is built.
stop_words = set(stopwords.words('english'))

In [2]:
# Reproducibility / evaluation settings
RSEED = 42        # seed used for the train/test split and the classifier
TEST_SIZE = 0.33  # fraction of the labelled data held out for evaluation

# Input files
TRAIN_PATH = 'data/train.csv'
TEST_PATH = 'data/test.csv'

# Target label columns, in the order used for Y and for the reports
categories = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

### Preprocessing

TODO: import preprocessing notebook and use its functions

In [3]:
def load_data(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

In [4]:
def data_split(X, Y):
    """Split features ``X`` and targets ``Y`` into a train part and a hold-out part.

    The hold-out fraction and the seed come from the notebook-level
    ``TEST_SIZE`` and ``RSEED`` constants.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, Y, random_state=RSEED, test_size=TEST_SIZE)
    return tuple(parts)

In [5]:
# Basic text cleaning
# TODO: Replace with functions from preprocessing notebook

# Ordered (pattern, replacement) rules for expanding contractions on lower-cased text.
# Order matters: a more specific pattern must come before a general one that would
# otherwise consume part of its match ("'scuse" before "'s", "can't" before "n't").
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
)


def clean_text(text):
    """Normalize a raw comment for vectorization.

    Steps, in order: lower-case the text, expand the contractions listed in
    ``_CONTRACTION_RULES``, replace every non-word character with a space,
    collapse whitespace runs into a single space and strip surrounding spaces.

    Args:
        text: the comment as a ``str``.

    Returns:
        The cleaned comment as a ``str`` (possibly empty).
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # Raw strings: "\W" and "\s" are not valid Python string escapes.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

In [6]:
# Read the labelled training comments into a DataFrame
df = load_data(path=TRAIN_PATH)

In [7]:
# Normalize every comment with clean_text (the function is passed directly to map)
df['comment_text'] = df['comment_text'].map(clean_text)

### Multi Label Logistic Regression


#### Setup and Train the Classifier

In [8]:
# Features are the cleaned comments; targets are the label columns listed in `categories`
X, Y = df['comment_text'], df[categories]

# Hold out part of the labelled data for evaluation
X_train, X_test, y_train, y_test = data_split(X, Y)

In [9]:
# Initialize Logistic Regression Pipeline
# Step 1 turns comments into TF-IDF features, step 2 trains one logistic regression
# per label (one-vs-rest).
# `stop_words` is passed as a list: recent scikit-learn releases validate this
# parameter and accept only 'english', a list or None, so a set is rejected.
# Sorting makes the list order deterministic.
multi_label_clf = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', random_state=RSEED), n_jobs=1)),
            ])

In [10]:
# Fit the pipeline on the training split and predict labels for the hold-out split
multi_label_clf.fit(X_train, y_train)
y_pred = multi_label_clf.predict(X_test)

# Hold-out metrics. With multilabel targets, accuracy_score is the subset accuracy:
# a sample only counts as correct when all of its labels are predicted correctly.
subset_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy : {subset_accuracy * 100} %\n\n")

report = classification_report(y_test, y_pred, zero_division=0)
print(f"Classification Report : \n\n{report}")

Test Set Accuracy : 91.81336523671168 %


Classification Report : 

              precision    recall  f1-score   support

           0       0.93      0.58      0.71      5083
           1       0.63      0.26      0.36       526
           2       0.93      0.60      0.73      2831
           3       0.71      0.14      0.24       152
           4       0.82      0.48      0.61      2643
           5       0.69      0.14      0.24       471

   micro avg       0.89      0.52      0.66     11706
   macro avg       0.78      0.37      0.48     11706
weighted avg       0.88      0.52      0.65     11706
 samples avg       0.05      0.04      0.05     11706



In [None]:
# Print each fitted per-label estimator together with its position in `categories`
for position, _category in enumerate(categories):
    print(position)
    print(multi_label_clf.named_steps['clf'].estimators_[position])

### AUC - ROC Curve

In [None]:
# Train one binary classifier per category and calculate accuracy, f1 and roc_auc
accuracy_list = []
f1_list = []
roc_auc_scores = []
fpr_list = []
tpr_list = []
treshold_list = []


# Iterate over every category, train the model, predict, calculate scores
for category in categories:
    print('... Processing {}'.format(category))
    # Fit an unfitted clone of the pipeline instead of `multi_label_clf` itself, so the
    # multi-label model trained earlier is not overwritten by a single-label model.
    category_clf = clone(multi_label_clf)
    # One fit per category is enough: class predictions and probabilities both come
    # from this same fitted model.
    category_clf.fit(X_train, y_train[category])

    # class predictions per category
    prediction = category_clf.predict(X_test)
    # class probabilities per category; column 1 is the positive class
    y_score = category_clf.predict_proba(X_test)

    # Accuracy
    score = accuracy_score(y_test[category], prediction)
    accuracy_list.append(score)

    # F1 score
    f1 = f1_score(y_test[category], prediction)
    f1_list.append(f1)

    # ROC curve and ROC AUC score
    fpr, tpr, tresh = roc_curve(y_test[category], y_score[:, 1], pos_label=1, drop_intermediate=False)
    fpr_list.append(fpr)
    tpr_list.append(tpr)
    treshold_list.append(tresh)
    roc_auc = roc_auc_score(y_test[category], y_score[:, 1])
    roc_auc_scores.append(roc_auc)

    print('----------------------')
    print('ROC AUC score is {}'.format(roc_auc))
    print('Accuracy is {}'.format(score))
    print('F1 Score is {}'.format(f1))
    print('\n')

# Print averages for Accuracy, F1, and ROC AUC
print('... Average Scores')
print('-------------------')
avg_accuracy = sum(accuracy_list) / len(accuracy_list)
print('Average accuracy is {}'.format(avg_accuracy))

avg_roc_auc = sum(roc_auc_scores) / len(roc_auc_scores)
print('Average ROC AUC Score is {}'.format(avg_roc_auc))

avg_f1_score = sum(f1_list) / len(f1_list)
print('Average F1 Score is {}'.format(avg_f1_score))

In [None]:
# Draw the ROC curve of every category on one square figure
fig, ax = plt.subplots(1, figsize=(10, 10))
ax.set_title('Receiver Operating Characteristic')
for curve_idx, curve_fpr in enumerate(fpr_list):
    curve_label = 'ROC curve (area = %0.4f) for %s' % (roc_auc_scores[curve_idx], categories[curve_idx])
    ax.plot(curve_fpr, tpr_list[curve_idx], label=curve_label)
# Diagonal reference line
ax.plot([0, 1], ls="--", label='baseline')
# Light-grey guide lines along x = 0 and y = 1
ax.plot([0, 0], [1, 0], c=".7")
ax.plot([1, 1], c=".7")
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.legend(loc="lower right")
plt.show()

### Create submission file

In [11]:
def create_submission_probabilities(clf, X_test, submission_df=None, category_names=None,
                                    output_path='data/submission.csv'):
    """Predict per-category probabilities and write them to a submission CSV.

    Args:
        clf: fitted multi-label classifier whose ``predict_proba`` returns one
            column per category, in the order of ``category_names``.
        X_test: the samples to score.
        submission_df: DataFrame that receives one probability column per
            category and is then written to disk. Defaults to the
            notebook-level ``submissions_df``.
        category_names: column names for the probabilities. Defaults to the
            notebook-level ``categories``.
        output_path: destination CSV file.

    Raises:
        ValueError: if ``predict_proba`` does not return exactly one column per
            category, e.g. because ``clf`` was fitted on a single label.
    """
    if submission_df is None:
        submission_df = submissions_df
    if category_names is None:
        category_names = categories

    # Score once with the classifier that was passed in; every category has its own column.
    probabilities = np.asarray(clf.predict_proba(X_test))
    if probabilities.ndim != 2 or probabilities.shape[1] != len(category_names):
        raise ValueError(
            'predict_proba returned shape {} but {} category columns were expected; '
            'fit the classifier on all categories first'.format(
                probabilities.shape, len(category_names)))

    for column, category in enumerate(category_names):
        submission_df[category] = probabilities[:, column]

    submission_df.to_csv(output_path, index=False)

In [12]:
# Load the Jigsaw test set and clean its comments with clean_text
test_df = load_data(path=TEST_PATH)
test_df['comment_text'] = test_df['comment_text'].map(clean_text)

# Comments to score for the submission
X_test_submission = test_df['comment_text']

# Frame that collects the predicted labels; starts with only the id column
submissions_df = test_df['id'].to_frame()

In [13]:
# Run this cell to score the Kaggle test comments and write the probabilities to submission.csv
create_submission_probabilities(clf=multi_label_clf, X_test=X_test_submission)