## Toxic comment classification
### Import library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve 
from sklearn.metrics import confusion_matrix, f1_score, fbeta_score, confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, cross_validate

from collections import Counter

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler

#MAGIC
#plt.style.use('ggplot')
#%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#%load_ext autoreload
#%autoreload 2

#WARNINGS
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
seed = 42
# import os
# os.environ['OMP_NUM_THREADS'] = '4'

## Read data set

In [2]:
#labeled data
toxic = pd.read_csv('toxicity_data/train.csv')
#test = pd.read_csv('toxicity_data/test.csv') # this test dataset doesn't have labels b/c it is from kaggle, delete?
print('Number of rows and columns in the train data set:',toxic.shape)
#print('Number of rows and columns in the test data set:',test.shape)

#unlabeled data
incel_df = pd.read_csv('new_IncelTears_posts.csv')
slate_df = pd.read_csv('new_slatestarcodex_posts.csv')

raw_toxic = toxic
small_toxic = toxic.sample(10000)
#small_toxic = toxic


target_col = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
y = small_toxic[target_col]
y['sum'] = y.sum(axis=1).astype(bool).astype(int)


X_train, X_holdout, y_train, y_holdout = train_test_split(small_toxic, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(small_toxic, y, test_size=0.2, random_state=42)

### delete? Not probably going to use test.
# raw_test = test
# test = test.sample(5000)
# test.fillna(' ',inplace=True)

Number of rows and columns in the train data set: (159571, 8)


## Text preprosesing

[source: ](https://www.kaggle.com/him4318/easy-and-fast-lb-044) 

Term Frequency Inverse Document  Frequency Vectorizer 

In [3]:
vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)
tr_vect = vect_word.fit_transform(X_train['comment_text'])
ts_vect = vect_word.transform(X_test['comment_text'])

incel_vect = vect_word.transform(incel_df['title'])
slate_vect = vect_word.transform(slate_df['title'])

#took 50 seconds on 150k samples

## Model baseline

In [4]:
def score_model(model, X_train, y_train):
    cv_results = cross_validate(model, X_train, y_train, cv=5, scoring='f1')
    return np.mean(cv_results['test_score'])

def model_baseline(X_train, y_train):
    """This takes in training and validation data and runs it through
    6 basic classification models and scores them based on recall"""

    lm2 = LogisticRegression(C=2,random_state = 42,class_weight = 'balanced') #all features
    lm2_score = score_model(lm2, X_train, y_train)

#     knn = KNeighborsClassifier(n_neighbors=5)
#     knn_score = score_model(knn, X_train, y_train)

#     gnb = GaussianNB()
#     gnb_score = score_model(gnb, X_train, y_train)

#     mnb = MultinomialNB()
#     mnb_score = score_model(mnb, X_train, y_train)

#     svm_model = svm.SVC()
#     svm_score = score_model(svm_model, X_train, y_train)

    rf = RandomForestClassifier(random_state=42)
    rf_score = score_model(rf, X_train, y_train)

    res = {
        'Logistic regression': lm2_score,
#         'KNN': knn_score,
#         'Gaussian Naive Bayes': gnb_score,
#         'Multinomial Naive Bayes': mnb_score,
#         'Support Vector Classifier': svm_score,
         'Random Forest': rf_score
    }
    return res


In [5]:
model_baseline(tr_vect, y_train['sum'])

{'Logistic regression': 0.6594241227008731,
 'Random Forest': 0.6106933256925536}

## Class imbalance

In [8]:
#Random OverSampler
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_sample(tr_vect,y_train['sum'])
# X_resampled_df = pd.DataFrame(X_resampled)
# model_baseline(X_resampled_df, y_resampled)

model_baseline(X_resampled, y_resampled)

{'Logistic regression': 0.9809860392638658,
 'Random Forest': 0.9755760390495614}

Class imbalance fixing boosts score 30%!