## Toxic comment classification
### Import libraries

In [7]:
import string
import warnings
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve
from sklearn.metrics import confusion_matrix, f1_score, fbeta_score, confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler

from pymongo import MongoClient, InsertOne, DeleteOne, ReplaceOne

#WARNINGS
warnings.filterwarnings('ignore')

## Read data set

In [8]:
# Labelled Kaggle training set (there's also a test dataset, but it ships without labels).
toxic = pd.read_csv('toxicity_data/train.csv')
print('Number of rows and columns in the train data set:',toxic.shape)

#unlabeled data
incel_df = pd.read_csv('new_IncelTears_posts.csv')
slate_df = pd.read_csv('new_slatestarcodex_posts.csv')

# Take a real copy: a plain assignment would alias `toxic`, so the 'target'
# column added below would also appear on the supposedly raw frame.
raw_toxic = toxic.copy()
small_toxic = toxic #can add a .sample to make things run quicker here.

# Collapse the six label columns into one binary target: 1 if any label is set, else 0.
target_col = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
y = small_toxic[target_col].copy()  # copy so adding a column does not write into a slice
y['sum'] = y.sum(axis=1).astype(bool).astype(int)

# try undersampling: draw the same number of rows from each class
small_toxic['target']=y['sum']
# Cap at the size of the rarest class so sampling still works on a reduced small_toxic.
n_per_class = min(16000, small_toxic['target'].value_counts().min())
# Seeded so the balanced sample (and every score derived from it) is reproducible.
neg_sample = small_toxic[small_toxic['target']==0].sample(n_per_class, random_state=42)
pos_sample = small_toxic[small_toxic['target']==1].sample(n_per_class, random_state=42)
all_df = pd.concat([neg_sample,pos_sample])

# this is for undersampling: 20% holdout first, then 20% of the remainder as the test split
X_train, X_holdout, y_train, y_holdout = train_test_split(all_df.drop('target',axis=1), all_df['target'], test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Number of rows and columns in the train data set: (159571, 8)


## Text preprocessing - TF-IDF up to trigrams

In [9]:
# Word-level TF-IDF over unigrams, bigrams and trigrams, keeping at most 20,000 features.
vect_word = TfidfVectorizer(
    analyzer='word',
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 3),
    max_features=20000,
    dtype=np.float32,
)

# The vocabulary is learned from the training split only; every other text set
# is projected onto that same vocabulary.
tr_vect = vect_word.fit_transform(X_train['comment_text'])
ts_vect, holdout_vect = (
    vect_word.transform(split['comment_text']) for split in (X_test, X_holdout)
)

# Unlabeled Reddit post titles, vectorised with the same fitted vocabulary.
incel_vect, slate_vect = (
    vect_word.transform(posts['title']) for posts in (incel_df, slate_df)
)
#took 30-50 seconds on 150k samples

## Score on undersampled data

In [10]:
def score_model(model, X_train, y_train):
    """Return the mean F1 of `model` over 5-fold cross-validation on the given data."""
    fold_scores = cross_validate(model, X_train, y_train, scoring='f1', cv=5)['test_score']
    return np.mean(fold_scores)

def model_baseline(X_train, y_train):
    """Cross-validate the baseline classifiers and report their mean F1 scores.

    Two models are evaluated: a class-weighted logistic regression and a
    random forest. Each is scored with `score_model`.

    Parameters
    ----------
    X_train : feature matrix passed straight through to `score_model`.
    y_train : binary labels aligned with `X_train`.

    Returns
    -------
    dict
        Maps 'Logistic regression' and 'Random Forest' to the score returned
        by `score_model` for that model.
    """
    baselines = {
        'Logistic regression': LogisticRegression(C=2, random_state=42, class_weight='balanced'),
        'Random Forest': RandomForestClassifier(random_state=42),
    }
    return {name: score_model(model, X_train, y_train) for name, model in baselines.items()}

# Baseline scores on the TF-IDF training matrix; the returned dict is the cell's output.
model_baseline(tr_vect, y_train)

{'Logistic regression': 0.8820298077668852,
 'Random Forest': 0.8275651639316501}

In [11]:
# Small feed-forward network (hidden layers of 5 and 2 units) trained with L-BFGS.
# MLPClassifier is imported in the import cell at the top of the notebook.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

clf.fit(tr_vect, y_train)

# F1 on the test split; the score is the cell's output.
pred =  clf.predict(ts_vect)
f1_score(y_test,pred)

0.8777429467084639

## Class imbalance

In [28]:
# Re-split the full data set without undersampling, so the original class imbalance is kept.
# NOTE: this overwrites the X_*/y_* names created by the undersampled split.
imbalanced_features = small_toxic.drop(columns='target')
imbalanced_labels = small_toxic['target']

# 20% holdout first, then 20% of the remainder as the test split.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    imbalanced_features, imbalanced_labels, test_size=0.2, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [19]:
# Rebuild the TF-IDF vectoriser (same settings as before) and refit it on the
# imbalanced training split, replacing the earlier matrices of the same names.
vect_word = TfidfVectorizer(
    analyzer='word',
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 3),
    max_features=20000,
    dtype=np.float32,
)

# Fit on the training comments only, then project the other splits.
tr_vect = vect_word.fit_transform(X_train['comment_text'])
ts_vect, holdout_vect = (
    vect_word.transform(split['comment_text']) for split in (X_test, X_holdout)
)

# Unlabeled Reddit post titles, projected onto the refitted vocabulary.
incel_vect, slate_vect = (
    vect_word.transform(posts['title']) for posts in (incel_df, slate_df)
)
#took 30-50 seconds on 150k samples

In [20]:
# Share of training labels equal to 0, stored as a 0-1 fraction (variable name kept as-is).
non_toxic_percent = (y_train == 0).mean()
# The '%' format spec multiplies by 100, so the message shows a true percentage
# (e.g. 89.9%) instead of the raw fraction followed by a percent sign (0.899%).
print (f'{non_toxic_percent:.1%} of my data is non-toxic')

0.899% of my data is non-toxic


In [21]:
def score_rf_lr(x, y, x_eval=None, y_eval=None):
    """Fit a random forest and a logistic regression, then print and return their F1 scores.

    Parameters
    ----------
    x, y : training features and binary labels (e.g. a resampled training set).
    x_eval, y_eval : optional evaluation features and labels. When omitted, the
        notebook-level `ts_vect` and `y_test` are used, so existing two-argument
        calls behave exactly as before.

    Returns
    -------
    tuple
        `(lr_f1, rf_f1)`: F1 of the logistic regression and of the random forest
        on the evaluation data.
    """
    # Fall back to the notebook's current test split when no evaluation set is given.
    if x_eval is None:
        x_eval = ts_vect
    if y_eval is None:
        y_eval = y_test

    def _fit_and_score(model):
        # Train on (x, y) and score F1 on the evaluation set.
        model.fit(x, y)
        return f1_score(y_eval, model.predict(x_eval))

    rf_f1 = _fit_and_score(RandomForestClassifier(random_state=42))
    lr_f1 = _fit_and_score(LogisticRegression(C=2, random_state=42, class_weight='balanced'))

    print (f'LR F1={lr_f1}, RF F1={rf_f1}')
    return lr_f1, rf_f1

In [22]:
# Random oversampling of the TF-IDF training matrix.
# NOTE(review): the original comment said the result is "about 3x size of dataset" — confirm.
ros = RandomOverSampler(random_state=42)
# fit_resample replaces the deprecated fit_sample, which newer imbalanced-learn releases removed.
X_resampled, y_resampled = ros.fit_resample(tr_vect,y_train)

score_rf_lr(X_resampled, y_resampled)

LR F1=0.731763925729443, RF F1=0.6665294359818855


(0.731763925729443, 0.6665294359818855)

In [23]:
# SMOTE oversampling of the TF-IDF training matrix.
# fit_resample replaces the deprecated fit_sample, which newer imbalanced-learn releases removed.
X_smoted, y_smoted = SMOTE(random_state=42).fit_resample(tr_vect,y_train)
score_rf_lr(X_smoted,y_smoted)

LR F1=0.6598218330061906, RF F1=0.6222910216718267


(0.6598218330061906, 0.6222910216718267)

In [24]:
# ADASYN oversampling of the TF-IDF training matrix.
# fit_resample replaces the deprecated fit_sample, which newer imbalanced-learn releases removed.
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_resample(tr_vect,y_train)
score_rf_lr(X_adasyn, y_adasyn)

LR F1=0.6448326055312954, RF F1=0.5999241562381494


(0.6448326055312954, 0.5999241562381494)

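Surprisingly, none of these oversampling methods helps: random oversampling, SMOTE and ADASYN all give lower F1 scores (LR 0.64–0.73, RF 0.60–0.67) than the undersampled models above (LR 0.88, RF 0.83). Note that the comparison is not like-for-like: the undersampled scores were measured on balanced data, whereas these are measured on the imbalanced test split.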