## Toxic comment classification
### Import library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve 
from sklearn.metrics import confusion_matrix, f1_score, fbeta_score, confusion_matrix

from sklearn.model_selection import train_test_split, cross_validate

from collections import Counter

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler

#WARNINGS
import warnings
warnings.filterwarnings('ignore')

from pymongo import MongoClient, InsertOne, DeleteOne, ReplaceOne

## Read data set

In [2]:
toxic = pd.read_csv('toxicity_data/train.csv') #there's also a test dataset but it doesn't have labels b/c kaggle.
print('Number of rows and columns in the train data set:',toxic.shape)

#unlabeled data
incel_df = pd.read_csv('new_IncelTears_posts.csv')
slate_df = pd.read_csv('new_slatestarcodex_posts.csv')

raw_toxic = toxic
small_toxic = toxic #can add a .sample to make things run quicker here.

#turn multi-class into single class classifier
target_col = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
y = small_toxic[target_col]
y['sum'] = y.sum(axis=1).astype(bool).astype(int) 

# try undersampling
small_toxic['target']=y['sum']
neg_sample = small_toxic[small_toxic['target']==0].sample(16000)
pos_sample = small_toxic[small_toxic['target']==1].sample(16000)
all_df = pd.concat([neg_sample,pos_sample])

# ##Original code, doesn't do as well as undersampled code -- in case you want to try it.
# X_train, X_holdout, y_train, y_holdout = train_test_split(small_toxic.drop('target',axis=1), small_toxic['target'], test_size=0.2, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# this is for undersampling
X_train, X_holdout, y_train, y_holdout = train_test_split(all_df.drop('target',axis=1), all_df['target'], test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Number of rows and columns in the train data set: (159571, 8)


## Text preprocessing - TF-IDF up to trigrams

In [3]:
vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)
tr_vect = vect_word.fit_transform(X_train['comment_text'])
ts_vect = vect_word.transform(X_test['comment_text'])
holdout_vect = vect_word.transform(X_holdout['comment_text'])

incel_vect = vect_word.transform(incel_df['title'])
slate_vect = vect_word.transform(slate_df['title'])
#took 30-50 seconds on 150k samples

## Model baseline

In [4]:
def score_model(model, X_train, y_train):
    cv_results = cross_validate(model, X_train, y_train, cv=5, scoring='f1')
    return np.mean(cv_results['test_score'])

def model_baseline(X_train, y_train):
    """This takes in training and validation data and runs it through
    6 basic classification models and scores them based on recall"""

    lm2 = LogisticRegression(C=2,random_state = 42,class_weight = 'balanced') #all features
    lm2_score = score_model(lm2, X_train, y_train)

    rf = RandomForestClassifier(random_state=42)
    rf_score = score_model(rf, X_train, y_train)

    res = {
        'Logistic regression': lm2_score,
         'Random Forest': rf_score
    }
    return res

model_baseline(tr_vect, y_train)

{'Logistic regression': 0.884286176528254, 'Random Forest': 0.8233909091930197}

Undersampling results: {'Logistic regression': 0.8839971555155985,
 'Random Forest': 0.8198122061915403}  
Normal results: {'Logistic regression': 0.7273105816751935, 'Random Forest': 0.728039714039031}

In [5]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

clf.fit(tr_vect, y_train)

pred =  clf.predict(ts_vect)
f1_score(y_test,pred)

0.8846379074760883

Basic neural net also did well at 88% f1

In [6]:
#RUN ONCE TO SAVE MODEL
# import pickle
# with open('lr_undersampled_model.pickle', 'wb') as handle:
#      pickle.dump(lr, handle, protocol=pickle.HIGHEST_PROTOCOL)

#RUN ONCE TO SAVE FIT VECTORIZER
# with open('fit_undersampled_vect.pickle', 'wb') as handle:
#      pickle.dump(vect_word, handle, protocol=pickle.HIGHEST_PROTOCOL)


## Score on holdout

In [7]:
lr = LogisticRegression(C=2,random_state = 42, class_weight = 'balanced')
lr.fit(tr_vect, y_train)
pred =  lr.predict(holdout_vect)
f1_score(y_holdout,pred)

0.886873350923483

## Donald

In [17]:
donald_df = pd.read_csv('new_The_Donald_posts.csv')
donald_vect = vect_word.transform(donald_df['title'])
donald_preds =  lr.predict(donald_vect)
print(f'Percentage of The_Donald titles predicted as toxic {donald_preds.sum()/donald_preds.shape[0]}')

Percentage of The_Donald titles predicted as toxic 0.27062374245472837


In [21]:
donald_df[np.isin(donald_preds, 1)]['title'].values[:25] #these are the ones it said were not okay.

array(['Liar Liar Pants on fire', 'Know thy enemy',
       'Kamala Harris’s dad: Our family wants to ‘dissociate ourselves from this travesty’',
       'Prosecutor: Jussie Smollett Faces Up To 3 Years in Prison',
       "It's almost like the national media isn't interested in certain hate crimes, like this for some reason. I can't quite put my finger on it...",
       "$6million? Thats a lot of people who won't be getting refunds...",
       'GEOTUS does not get enough credit for calling ISIS, “ISIS” and not “ISIL” like Malik’s brother.',
       'The White House on Twitter: "When America can’t vet who crosses its borders, our citizens, including our legal immigrants, pay the price. This crisis is real—and pretending it doesn’t exist is an insult to those who live with its consequences every day."',
       'LOL',
       "Can we get some love for Glenn Greenwald. He tells it like it is and doesn't care if feathers get ruffled.",
       'Geesus. What a incredible woman.', 'CAUGHT IN A SMO

## Politics

In [23]:
client = MongoClient()
db = client["reddit"]
titles_collection = db.get_collection('titles')
overnight_reddit_collection = db.get_collection('overnight_reddit')
reddit_overnight_collection = db.get_collection('reddit_overnight') 

In [24]:
subreddit = 'politics'
politics_submissions = list(titles_collection.find({'subreddit':subreddit}))
politics_text = np.array([i['title'] for i in politics_submissions])
politics_vect = vect_word.transform(politics_text)
politics_preds = lr.predict(politics_vect)
print(f'Percentage of Politics titles predicted as toxic {politics_preds.sum()/politics_preds.shape[0]}')
politics_text[np.isin(politics_preds, 1)][:25] #these are the ones it said were not okay.

Percentage of Politics titles predicted as toxic 0.1443013698630137


### Let's generalize this

In [34]:
def subredddit_toxicity_percent(subreddit):
    sub_submissions = list(titles_collection.find({'subreddit':subreddit}))
    sub_text = np.array([i['title'] for i in sub_submissions])
    sub_vect = vect_word.transform(sub_text)
    sub_preds = lr.predict(sub_vect)
    print(f'Percentage of {subreddit} titles predicted as toxic {politics_preds.sum()/politics_preds.shape[0]}')
    twenty_five_samples = sub_text[np.isin(sub_preds, 1)][:25]
    print(twenty_five_samples) #these are the ones it said were not okay.
    return twenty_five_samples

In [35]:
subredddit_toxicity_percent('mylittlepony')

Percentage of mylittlepony titles predicted as toxic 0.1443013698630137
['Equestria Girls Rainbow Dash in Gala Dress' 'Shame'
 'The Real Pinkie Pie by Adlynh on DeviantArt'
 'Applejack Dressed Like Samus'
 'Pony Puppet Theater (Pilot) by MangaMeister on DeviantArt'
 "'Told you you would like it, dear' by aJVL"
 'Pony Puppet Theater #6 Pony Rumors by MangaMeister on DeviantArt'
 'Without a Care' 'Child of Light' 'Hey Applejack! by Aureai'
 'Princess Luna at insane speeds.' 'Huh? by wandrevieira1994'
 '"I\'m Gonna Marry The Princess!" by dm29'
 'Applejack the Awesome Alicorn!'
 "Those faces..They're just so..mesmerizing.."
 'A Typical Princess Morning' 'Got Your Nose'
 'Fluttershy has an evil side' "(RariJack Daily) What's Wrong?"
 'A mugs life by pepooni' 'Vigors Are Your Friends, RD by Underpable'
 "Everyone of you guys matters, don't let anyone change that"
 'Oh how the tables have turned...' 'Rarijack-Daily: Reading!'
 'Octavia helping me to not afk out while waiting on Poundfist to 

array(['Equestria Girls Rainbow Dash in Gala Dress', 'Shame',
       'The Real Pinkie Pie by Adlynh on DeviantArt',
       'Applejack Dressed Like Samus',
       'Pony Puppet Theater (Pilot) by MangaMeister on DeviantArt',
       "'Told you you would like it, dear' by aJVL",
       'Pony Puppet Theater #6 Pony Rumors by MangaMeister on DeviantArt',
       'Without a Care', 'Child of Light', 'Hey Applejack! by Aureai',
       'Princess Luna at insane speeds.', 'Huh? by wandrevieira1994',
       '"I\'m Gonna Marry The Princess!" by dm29',
       'Applejack the Awesome Alicorn!',
       "Those faces..They're just so..mesmerizing..",
       'A Typical Princess Morning', 'Got Your Nose',
       'Fluttershy has an evil side', "(RariJack Daily) What's Wrong?",
       'A mugs life by pepooni',
       'Vigors Are Your Friends, RD by Underpable',
       "Everyone of you guys matters, don't let anyone change that",
       'Oh how the tables have turned...', 'Rarijack-Daily: Reading!',
       'Oct

## Class imbalance

In [23]:
non_toxic_percent = len(y_train[y_train['sum']==0]['sum'])/y_train.shape[0]
print (f'{round(non_toxic_percent,3)}% of my data is non-toxic')

0.898% of my data is non-toxic


In [25]:
def score_rf_lr(x,y):
    rf = RandomForestClassifier(random_state=42)
    rf.fit(x, y)
    pred =  rf.predict(ts_vect)
    rf_f1 = f1_score(y_test['sum'],pred)

    lr = LogisticRegression(C=2,random_state = 42,class_weight = 'balanced')
    lr.fit(x, y)
    pred =  lr.predict(ts_vect)
    lr_f1 = f1_score(y_test['sum'],pred)

    print (f'LR F1={lr_f1}, RF F1={rf_f1}')
    return lr_f1, rf_f1

In [26]:
#Random OverSampler - about 3x size of dataset
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_sample(tr_vect,y_train['sum'])
# X_resampled_df = pd.DataFrame(X_resampled)
# model_baseline(X_resampled_df, y_resampled)

score_rf_lr(X_resampled, y_resampled)

LR F1=0.7404616109279322, RF F1=0.6480086114101185


(0.7404616109279322, 0.6480086114101185)

Class imbalance fixing boosts score 30%!

In [27]:
X_smoted, y_smoted = SMOTE(random_state=42).fit_sample(tr_vect,y_train['sum'])
score_rf_lr(X_smoted,y_smoted)

LR F1=0.6177054386661374, RF F1=0.5985169491525424


(0.6177054386661374, 0.5985169491525424)

In [28]:
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_sample(tr_vect,y_train['sum'])
score_rf_lr(X_adasyn, y_adasyn)

LR F1=0.6068540623796689, RF F1=0.6101694915254238


(0.6068540623796689, 0.6101694915254238)

Random oversampling seems to do best.

## Let's use this to predict our good and bad subreddits

In [24]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_resampled, y_resampled)
pred =  rf.predict(ts_vect)
rf_f1 = f1_score(y_test['sum'],pred)

lr = LogisticRegression(C=2,random_state = 42,class_weight = 'balanced')
lr.fit(X_resampled, y_resampled)
pred =  lr.predict(ts_vect)
lr_f1 = f1_score(y_test['sum'],pred)

print (f'LR F1={lr_f1}, RF F1={rf_f1}')

In [10]:
incel_preds = lr.predict(incel_vect)
print(f'Percentage of Incel titles predicted as toxic {incel_preds.sum()/incel_preds.shape[0]}')

slate_preds = lr.predict(slate_vect)
print(f'Percentage of Slate titles predicted as toxic {slate_preds.sum()/slate_preds.shape[0]}')

score_ratio = (incel_preds.sum()/incel_preds.shape[0])/(slate_preds.sum()/slate_preds.shape[0]) #good subreddit 13x better.
print (f'Slate is {round(score_ratio,2)}x better)')

Percentage of Incel titles predicted as toxic 0.2637913741223671
Percentage of Slate titles predicted as toxic 0.06018054162487462
Slate is 4.38x better)
