# toxic comments classification
## by Karin Brisker





imports

In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
import re
import os
import time

from sklearn.naive_bayes import MultinomialNB
import sklearn

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier


from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

import itertools
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import PorterStemmer

import warnings
warnings.filterwarnings('ignore')

  from numpy.core.umath_tests import inner1d


In [2]:
# load data
train_df = pd.read_csv('./Data/train.csv')
test_df = pd.read_csv('./Data/test.csv')
print('Train shape: ', train_df.shape)
print('Test shape: ', test_df.shape) 

Train shape:  (159571, 8)
Test shape:  (153164, 2)


In [3]:
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## cleaning text

In [4]:
def cleanData(text, remove_stops=False, stemming=False, lemmatization=False):
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)

    # Replace apostrophes with standard lexicons
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"\-", " ", text)
    text = re.sub(r"\=", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"isn't", "is not", text)
    text = re.sub(r"aren't", "are not", text)
    text = re.sub(r"ain't", "am not", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"didn't", "did not", text)
    text = re.sub(r"shan't", "shall not", text)
    text = re.sub(r"haven't", "have not", text)
    text = re.sub(r"hadn't", "had not", text)
    text = re.sub(r"hasn't", "has not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"wasn't", "was not", text)
    text = re.sub(r"weren't", "were not", text)
    text = re.sub(r"doesn't", "does not", text)
    text = re.sub(r"'s", " is", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'m", " am", text)
    text = re.sub(r"'d", " would", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"--th", " ", text)
    text = re.sub('[()\"\t_\n.,:=!@#$%^&*-/[\]?|1234567890—]', ' ', text)

    # More cleaning
    text = re.sub(r"alot", "a lot", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)

    # Remove urls and emails
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
    text = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', text, flags=re.MULTILINE)

    # Replace words like sooooooo with so
    text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))

    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])

    # Remove all symbols
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r'[^A-Za-z\s]', r' ', text)
    text = re.sub(r'\n', r' ', text)
    text = re.sub('[()\"\t_\n.,:=!@#$%^&*-/[\]?|1234567890—]', ' ', text)

    text = re.sub(r'\d +', ' ', text)

    if remove_stops:
        text = " ".join([w for w in text.split() if w not in stop_words])

    if stemming:
        st = PorterStemmer()
        text = " ".join([st.stem(w) for w in text.split()])

    if lemmatization:
        wordnet_lemmatizer = WordNetLemmatizer()
        text = " ".join([wordnet_lemmatizer.lemmatize(w, pos='v') for w in text.split()])

    return text

special_character_removal=re.compile(r'[^a-z\d ]',re.IGNORECASE)
replace_numbers=re.compile(r'\d+',re.IGNORECASE)

def text_to_wordlist(text):
    text = text.lower().split() 
    text = " ".join(text)
    text=special_character_removal.sub('',text)
    text=replace_numbers.sub('n',text)
    return(text)

clean data

In [5]:
train_df['comment_text'] = train_df['comment_text'].map(lambda x: cleanData(x))
test_df['comment_text'] = test_df['comment_text'].map(lambda x: cleanData(x))

list_sentences_train = train_df["comment_text"].fillna(" ").values
list_sentences_test = test_df["comment_text"].fillna(" ").values

train_df["comment_text"] = pd.Series([text_to_wordlist(text) for text in list_sentences_train]).fillna(" ")
test_df["comment_text"] = pd.Series([text_to_wordlist(text) for text in list_sentences_test]).fillna(" ")

all_df = pd.concat([train_df["comment_text"], test_df["comment_text"]])

In [6]:
train_df.to_pickle('train.pkl')
test_df.to_pickle('test.pkl')

In [9]:
train_df = pd.read_pickle('train.pkl')
test_df = pd.read_pickle('test.pkl')

t.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i am s...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i am really not trying to edit war it ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cannot make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0


In [10]:
labels = list(train_df.columns[2:])

y_train = train_df[labels].values
y_test = train_df[labels].values

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,1), norm='l2')
vectorizer.fit(all_df)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
X_train = vectorizer.transform(train_df['comment_text'])
X_test = vectorizer.transform(test_df['comment_text'])


## models

define models

In [13]:
models = ['MultinomialNB', 'LogisticRegression', 'ExtraTreesClassifier']

model by name

In [14]:
def get_model(name):
    if name == 'MultinomialNB':
        return MultinomialNB()
    elif name == 'LogisticRegression':
        return LogisticRegression(solver='sag')
    elif name == 'ExtraTreesClassifier':
        return ExtraTreesClassifier(n_estimators=10)
    return

train & predict

save models predictions

In [15]:
id_df = pd.DataFrame({'id':test_df.id})
for model in models:
    print('\n ############# \n')
    print(f'\n {model} results: \n')
    new = pd.DataFrame(0, index=np.arange(test_df.shape[0]), columns=labels)
    for category in labels:
        model_x = get_model(model)
        model_x.fit(X_train, train_df[category])
        pred = list(model_x.predict(X_test))
        print(pred.count(1))
        new[category] = pred
        accuracy = model_x.score(X_train, train_df[category])
        print(f"Accuracy For {category} Class Is {round(accuracy*100,2)}%")
    p_df = pd.DataFrame(new, columns=labels)
    p_df_id = pd.concat([id_df, p_df], axis=1)
    p_df_id.to_csv(str('ccc') + '.csv', index=False)


 ############# 


 MultinomialNB results: 

4814
Accuracy For toxic Class Is 91.71%
285
Accuracy For severe_toxic Class Is 99.0%
1809
Accuracy For obscene Class Is 95.09%
147
Accuracy For threat Class Is 99.7%
1057
Accuracy For insult Class Is 95.18%
268
Accuracy For identity_hate Class Is 99.12%

 ############# 


 LogisticRegression results: 

22438
Accuracy For toxic Class Is 96.2%
933
Accuracy For severe_toxic Class Is 99.13%
12196
Accuracy For obscene Class Is 97.92%
162
Accuracy For threat Class Is 99.73%
8871
Accuracy For insult Class Is 97.37%
695
Accuracy For identity_hate Class Is 99.24%

 ############# 


 ExtraTreesClassifier results: 

15052
Accuracy For toxic Class Is 99.97%
293
Accuracy For severe_toxic Class Is 99.98%
8715
Accuracy For obscene Class Is 99.98%
51
Accuracy For threat Class Is 99.99%
5213
Accuracy For insult Class Is 99.96%
282
Accuracy For identity_hate Class Is 99.99%


ensemble model

In [None]:
id_df = pd.DataFrame({'id':test_df.id})
from sklearn.ensemble import VotingClassifier

e_pred = pd.DataFrame(0, index=np.arange(test_df.shape[0]), columns=labels)
eclf1 = VotingClassifier(estimators=[('MultinomialNB', MultinomialNB()), ('LogisticRegression', LogisticRegression(solver='sag')), ('ExtraTreesClassifier', ExtraTreesClassifier(n_estimators=20))], voting='hard')
for category in labels:
    eclf1 = eclf1.fit(X_train, train_df[category])
    pred = list(eclf1.predict(X_test))
    print(pred.count(1))
    e_pred[category] = pred
    accuracy = eclf1.score(X_train, train_df[category])
    print(f"Accuracy For {category} Class Is {round(accuracy*100,2)}%")
p_df = pd.DataFrame(e_pred, columns=labels)
p_df_id = pd.concat([id_df, p_df], axis=1)
p_df_id.to_csv('ensemble.csv', index=False)