## Toxic comment classification
### Import library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse
%matplotlib inline
seed = 42
import os
os.environ['OMP_NUM_THREADS'] = '4'

## Read data set

In [2]:
#labeled data
train = pd.read_csv('toxicity_data/train.csv')
test = pd.read_csv('toxicity_data/test.csv')
print('Number of rows and columns in the train data set:',train.shape)
print('Number of rows and columns in the test data set:',test.shape)

#unlabeled data
incel_df = pd.read_csv('new_IncelTears_posts.csv')
slate_df = pd.read_csv('new_slatestarcodex_posts.csv')

Number of rows and columns in the train data set: (159571, 8)
Number of rows and columns in the test data set: (153164, 2)


In [3]:
raw_train = train
train = train.sample(5000)
raw_test = test
test = test.sample(5000)
test.fillna(' ',inplace=True)

## Text preprosesing

[source: ](https://www.kaggle.com/him4318/easy-and-fast-lb-044) 

Term Frequency Inverse Document  Frequency Vectorizer 

In [3]:
vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)
tr_vect = vect_word.fit_transform(train['comment_text'])
ts_vect = vect_word.transform(test['comment_text'])

incel_vect = vect_word.transform(incel_df['title'])
slate_vect = vect_word.transform(slate_df['title'])

target_col = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
y = train[target_col]

## LR Model

In [7]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
y_test = test[target_col]

KeyError: "['toxic' 'severe_toxic' 'obscene' 'threat' 'insult' 'identity_hate'] not in index"

In [8]:
lr = LogisticRegression(C=2,random_state = 42,class_weight = 'balanced')
lr.fit(tr_vect,y['insult'])

col = 'insult'
print("Column:",col)
pred =  lr.predict(tr_vect)
print('\nConfusion matrix\n',confusion_matrix(y['insult'],pred))
print(classification_report(y['threat'],pred))

Column: insult

Confusion matrix
 [[146107   5587]
 [    95   7782]]
              precision    recall  f1-score   support

           0       1.00      0.92      0.96    159093
           1       0.03      0.77      0.05       478

   micro avg       0.92      0.92      0.92    159571
   macro avg       0.51      0.85      0.51    159571
weighted avg       1.00      0.92      0.95    159571



## Take a look at the negative Incel Subreddit

In [7]:
incel_preds = lr.predict(incel_vect)
print(f'Percentage of Incel titles predicted as insults {incel_preds.sum()/incel_preds.shape[0]}')

Percentage of Incel titles predicted as insults 0.06018054162487462


In [8]:
incel_df[np.isin(incel_preds, 0)]['title'].values #these are the ones it said were ok.

array(['Another one thinking he is a genious', '"But were nonviolent"',
       'Probably a LARP as I imagine the only thing he actually lifts is Cheeto packets. However if true, I hope the next woman be tries it with gives him what he deserves...',
       'Because of course he’s entitled to a woman’s body if someone else has had it.',
       'They bring so much of their unhappiness upon themselves',
       'incel worried about an epidemic of "open mouthed skinny framed" guys',
       '"clothes. But her bone structure is terrible, if she was born male, she would be extremely repulsive and for sure involuntary adult virgin"',
       'An incels view on feminism. “Recriminalize homosexual acts publicly...” and “publicly humiliate them whilst this punishment takes place.”',
       '“Females are the problem, not males.”',
       'Incel goes outside and overhears a conversation that had nothing to do with him and takes personal offense.',
       'Incel blaming genetics while at the same time 

Still some pretty bad stuff getting missed.

## Let's look at  a more supportive subreddit

In [9]:
slate_preds = lr.predict(slate_vect)
print(f'Percentage of Slate titles predicted as insults {slate_preds.sum()/slate_preds.shape[0]}')

Percentage of Slate titles predicted as insults 0.0050150451354062184


In [10]:
(incel_preds.sum()/incel_preds.shape[0])/(slate_preds.sum()/slate_preds.shape[0]) #good subreddit 13x better.

12.0