## Toxic comment classification
### Import library

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve 
from sklearn.metrics import confusion_matrix, f1_score, fbeta_score, confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, cross_validate

from collections import Counter

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse

#WARNINGS
import warnings
warnings.filterwarnings('ignore')

## Read data set

In [3]:
toxic = pd.read_csv('toxicity_data/train.csv') #there's also a test dataset but it doesn't have labels b/c kaggle.
print('Number of rows and columns in the train data set:',toxic.shape)

#unlabeled data
incel_df = pd.read_csv('new_IncelTears_posts.csv')
slate_df = pd.read_csv('new_slatestarcodex_posts.csv')

raw_toxic = toxic
small_toxic = toxic.sample(50000) # this decreases time from 50 seconds to less
#small_toxic = toxic # this is just so I don't have to rename rest of cells

#turn multi-class into single class classifier
target_col = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
y = small_toxic[target_col]
y['sum'] = y.sum(axis=1).astype(bool).astype(int) 

## split data
X_train, X_holdout, y_train, y_holdout = train_test_split(small_toxic, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(small_toxic, y, test_size=0.2, random_state=42)

Number of rows and columns in the train data set: (159571, 8)


## Text preprocessing - TF-IDF up to trigrams

In [4]:
vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)
tr_vect = vect_word.fit_transform(X_train['comment_text'])
ts_vect = vect_word.transform(X_test['comment_text'])

incel_vect = vect_word.transform(incel_df['title'])
slate_vect = vect_word.transform(slate_df['title'])

#took 50 seconds on 150k samples

## Model baseline

In [5]:
def score_model(model, X_train, y_train):
    cv_results = cross_validate(model, X_train, y_train, cv=5, scoring='f1')
    return np.mean(cv_results['test_score'])

def model_baseline(X_train, y_train):
    """This takes in training and validation data and runs it through
    6 basic classification models and scores them based on recall"""

    lm2 = LogisticRegression(C=2,random_state = 42,class_weight = 'balanced') #all features
    lm2_score = score_model(lm2, X_train, y_train)

    rf = RandomForestClassifier(random_state=42)
    rf_score = score_model(rf, X_train, y_train)

    res = {
        'Logistic regression': lm2_score,
         'Random Forest': rf_score
    }
    return res

model_baseline(tr_vect, y_train['sum'])

{'Logistic regression': 0.7203462169769782,
 'Random Forest': 0.7106319027294214}

In [6]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

clf.fit(tr_vect,y_train['sum'])


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [7]:
pred =  clf.predict(ts_vect)
f1_score(y_test['sum'],pred) #why 0, after fixing class imbalance scores about the same.

0.0

## Class imbalance

In [8]:
#Random OverSampler - about 3x size of dataset
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_sample(tr_vect,y_train['sum'])
# X_resampled_df = pd.DataFrame(X_resampled)
# model_baseline(X_resampled_df, y_resampled)

model_baseline(X_resampled, y_resampled)

NameError: name 'RandomOverSampler' is not defined

Class imbalance fixing boosts score 30%!

Random oversampling seems to do best.

## Let's use this to predict our good and bad subreddits

In [9]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_resampled, y_resampled)
pred =  rf.predict(ts_vect)
rf_f1 = f1_score(y_test['sum'],pred)

lr = LogisticRegression(C=2,random_state = 42,class_weight = 'balanced')
lr.fit(X_resampled, y_resampled)
pred =  lr.predict(ts_vect)
lr_f1 = f1_score(y_test['sum'],pred)

print (f'LR F1={lr_f1}, RF F1={rf_f1}')

LR F1=0.7404616109279322, RF F1=0.6480086114101185


In [10]:
incel_preds = lr.predict(incel_vect)
print(f'Percentage of Incel titles predicted as toxic {incel_preds.sum()/incel_preds.shape[0]}')

slate_preds = lr.predict(slate_vect)
print(f'Percentage of Slate titles predicted as toxic {slate_preds.sum()/slate_preds.shape[0]}')

score_ratio = (incel_preds.sum()/incel_preds.shape[0])/(slate_preds.sum()/slate_preds.shape[0]) #good subreddit 13x better.
print (f'Slate is {round(score_ratio,2)}x better)')

Percentage of Incel titles predicted as toxic 0.2637913741223671
Percentage of Slate titles predicted as toxic 0.06018054162487462
Slate is 4.38x better)


In [11]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

clf.fit(X_resampled, y_resampled)

pred =  clf.predict(ts_vect)
f1_score(y_test['sum'],pred)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

0.7357107962872497