# Introduction
* This notebook performs multi-label NLP classification of comments: each comment is scored against several types of verbal violence (toxic, severe toxic, obscene, threat, insult and identity hate).
* The dataset is from kaggle: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge.
* This notebook provides a linear baseline for this contest.
* The prediction that I submitted to kaggle got an avg AUC score of 0.83514.


### The notebook is divided into the following parts:
* Part 1: Notebook Preparation
    * Import
    * Config
    * Functions
* Part 2: Data Processing
    * Basic analysis
    * Word tokenization
    * Text stemming and lemmatization
    * Build a bag of words and remove stop words
    * TF-IDF
    * Organize the words as feature columns in a data frame
    * Check if the presence of a word in a comment is indicative
* Part 3: Modeling
    * Dimension reduction - PCA
    * Fit the model to the data
    * Predict the labels of the test dataset

# Part 1: Notebook Preparation

## Import

In [1]:
# Data analysis
import pandas as pd
import numpy as np
import re
import collections
from collections import Counter

# NLP
from nltk.stem import PorterStemmer
porter = PorterStemmer()

# Machine learning
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
    
%autosave 60

Autosaving every 60 seconds


## Config

In [2]:
# Competition files, read from the notebook's working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

labels_list = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Every label column gets a '_label' suffix so that its name cannot collide
# with a real word from the comments once words become feature columns.
labels_dict = dict((label, f'{label}_label') for label in labels_list)
new_label_list = [labels_dict[label] for label in labels_list]

## Functions

In [3]:
def cleanText(txt):
    """
    Normalize a raw comment so it can be tokenized by splitting on whitespace.

    Steps, in order:
      1. Every number (a digit followed by any run of digits, '.' or '$')
         is replaced by the token '<num>'.
      2. Every character that is not whitespace, an ASCII letter, '<' or '>'
         is removed.
      3. Newlines become spaces, and a space is inserted after each '>' and
         before each '<' so that tokens such as '<num>' never stick to a word.
      4. The text is converted to lower case.

    Returns the normalized string.
    """

    with_num_tokens = re.sub(r"\d[\d\.\$]*", "<num>", txt)
    allowed_only = re.sub(r"[^\sA-Za-z<>]", "", with_num_tokens)

    # The order of the replacements matters only for readability; they touch
    # disjoint characters.
    for old, new in (('\n', ' '), ('>', '> '), ('<', ' <')):
        allowed_only = allowed_only.replace(old, new)

    return allowed_only.lower()

In [4]:
def stemming(txt):
    """
    Split a cleaned comment on whitespace and stem every word.

    Words of up to 30 characters are reduced to their stem with the
    module-level Porter stemmer (`porter`); longer words are replaced by the
    placeholder token '<long>'.
    Only stemming is performed here - no lemmatization step is applied.

    Returns the list of resulting tokens, in their original order.
    """

    stems = []
    for word in txt.split():
        stems.append('<long>' if len(word) > 30 else porter.stem(word))
    return stems

In [5]:
def word_count_per_comment(txt, total_vocab):
    """
    Count how many times each vocabulary word occurs in one comment.

    params:
        txt - iterable of the comment's words.
        total_vocab - container of the words to keep; all other words are ignored.

    Returns a plain dict mapping word -> number of occurrences in `txt`.
    """

    counts = {}
    for word in txt:
        if word in total_vocab:
            counts[word] = counts.get(word, 0) + 1
    return counts

In [6]:
def make_lifts_reports_based_on_column_that_contains_set_of_items(df, columnOfItems, CLASS_LABEL):
    """
    Build a per-item lift report as a pandas DataFrame indexed by item.

    params:
        df - data frame holding, per row, a collection of items in one column
             and a binary class label (1 / 0) in another column.
        columnOfItems - name of the column that contains the collection of items.
        CLASS_LABEL - name of the label column the lift is measured against.

    For every item the report holds the 2x2 contingency counts
    ('Feature=1/0' = item present/absent, 'Label=1/0' = class label), the number
    of rows containing the item ('total_Feature=1'), and two ratios:
        oddsRatio       = (F1L1 / F0L1) / (F1L0 / F0L0)
        likelihoodRatio = (F1L1 / (F1L1 + F0L1)) / (F1L0 / (F1L0 + F0L0))
    Zero denominators are not special-cased.
    """

    def _count_items(class_value):
        # Number of rows of the given class in which each item appears.
        # Rows whose item column is missing or equals the string 'null' are skipped.
        rows = df[(df[CLASS_LABEL] == class_value) & (~pd.isna(df[columnOfItems])) & (df[columnOfItems] != 'null')]
        tally = Counter()
        for items in rows[columnOfItems]:
            for item in items:
                tally[item] += 1
        return tally

    # Class totals are taken over all rows that carry a label.
    labelled_rows = len(df[~pd.isna(df[CLASS_LABEL])])
    positive_rows = len(df[df[CLASS_LABEL] == 1])
    negative_rows = labelled_rows - positive_rows

    present_positive = pd.DataFrame({'Feature=1_Label=1': _count_items(1)})
    present_negative = pd.DataFrame({'Feature=1_Label=0': _count_items(0)})

    # Outer merge keeps items seen in only one class; their missing count is 0.
    report = present_positive.merge(present_negative, left_index=True, right_index=True, how='outer')
    report = report.fillna(0)

    report['Feature=0_Label=1'] = positive_rows - report['Feature=1_Label=1']
    report['Feature=0_Label=0'] = negative_rows - report['Feature=1_Label=0']
    report['total_Feature=1'] = report['Feature=1_Label=1'] + report['Feature=1_Label=0']

    f1_l1 = report['Feature=1_Label=1']
    f1_l0 = report['Feature=1_Label=0']
    f0_l1 = report['Feature=0_Label=1']
    f0_l0 = report['Feature=0_Label=0']
    report['oddsRatio'] = (f1_l1 / f0_l1) / (f1_l0 / f0_l0)
    report['likelihoodRatio'] = (f1_l1 / (f1_l1 + f0_l1)) / (f1_l0 / (f1_l0 + f0_l0))

    ordered_columns = [
        'oddsRatio', 'likelihoodRatio',
        'Feature=1_Label=1', 'Feature=1_Label=0',
        'Feature=0_Label=1', 'Feature=0_Label=0',
        'total_Feature=1',
    ]
    return report[ordered_columns]

In [7]:
def custom_auc(ground_truth, predictions):
    """
    Area under the ROC curve of `predictions` against `ground_truth`,
    with the label 1 treated as the positive class.
    """
    false_positive_rate, true_positive_rate, _thresholds = roc_curve(ground_truth, predictions, pos_label=1)
    return auc(false_positive_rate, true_positive_rate)

# Expose the metric as a scikit-learn scorer that is fed predicted probabilities
# and for which a higher value is better.
# NOTE(review): `needs_proba` is reported as deprecated in newer scikit-learn
# releases (replaced by `response_method`) - confirm against the installed version.
my_auc = make_scorer(custom_auc, greater_is_better=True, needs_proba=True)

# Part 2: Data Processing

## Basic analysis

In [8]:
# Preview the first training rows and print the frame's (rows, columns) shape.
display(df_train.head(5))
print(df_train.shape)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


(159571, 8)


In [9]:
# Show the text of the first training comment that is labeled toxic.
is_toxic = df_train['toxic'] == 1
df_train.loc[is_toxic, 'comment_text'].iloc[0]

'COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK'

In [10]:
# An example for a non toxic comment: a row in which none of the six labels is set.
# The row sum is restricted to the label columns. Summing the whole frame would
# also cover the text columns ('id', 'comment_text'), which fails on pandas
# versions that no longer drop non-numeric columns silently, and would include
# the 'is_train' flag if this cell is re-run after that column has been added.
df_train[df_train[labels_list].sum(axis=1) == 0]['comment_text'].iloc[2]

"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info."

In [11]:
# Summary statistics of the training frame; the mean of each 0/1 label column is its positive rate.
df_train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# (rows, columns) of the test set.
df_test.shape

(153164, 2)

In [13]:
# Flag the origin of every row so train and test can be separated again later.
# NOTE: this cell modifies df_train and df_test in place.
df_train['is_train'] = 1
# Add the label columns to the test frame as all-None placeholders,
# so that both frames share the same columns.
for label in labels_list:
    df_test[label] = None
df_test['is_train'] = 0

# Stack the train rows on top of the test rows; the preprocessing below runs on both at once.
dfFull = pd.concat([df_train,df_test])

In [14]:
# Use the comment id as the row index and switch the label columns to their
# '_label'-suffixed names.
# NOTE: this cell cannot be re-run on its own, because 'id' is no longer a
# column once it has become the index.
dfFull = dfFull.set_index('id').rename(columns=labels_dict)

In [15]:
# The head shows train rows (labels filled in), the tail shows test rows (labels missing).
display(dfFull.head(5))
display(dfFull.tail(5))
print(dfFull.shape)

Unnamed: 0_level_0,comment_text,toxic_label,severe_toxic_label,obscene_label,threat_label,insult_label,identity_hate_label,is_train
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1


Unnamed: 0_level_0,comment_text,toxic_label,severe_toxic_label,obscene_label,threat_label,insult_label,identity_hate_label,is_train
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu...",,,,,,,0
fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...,,,,,,,0
fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ...",,,,,,,0
fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the...",,,,,,,0
ffffce3fb183ee80,""" \n :::Stop already. Your bullshit is not wel...",,,,,,,0


(312735, 8)


In [16]:
# Keep the raw text and derive two length features from it before the text is
# cleaned: the number of characters and the number of whitespace-separated words.
raw_comments = dfFull['comment_text']
dfFull['orig_comment'] = raw_comments
dfFull['comment_len_total'] = raw_comments.apply(len)
dfFull['comment_len_words'] = raw_comments.apply(lambda comment: len(comment.split()))

## Word tokenization

In [17]:
# Replace every comment by its cleaned, lower-cased form (overwrites the column).
dfFull['comment_text'] = dfFull['comment_text'].apply(cleanText)

In [18]:
# Inspect the cleaned text of the first comments.
dfFull['comment_text'].head(5)

id
0000997932d777bf    explanation why the edits made under my userna...
000103f0d9cfb60f    daww he matches this background colour im seem...
000113f07ec002fd    hey man im really not trying to edit war its j...
0001b41b1c6bb37e     more i cant make any real suggestions on impr...
0001d958c54c6e35    you sir are my hero any chance you remember wh...
Name: comment_text, dtype: object

## Text stemming and lemmatization

In [19]:
# Replace every cleaned comment by its list of stemmed tokens (overwrites the column).
# NOTE: run this cell only once per kernel session - afterwards the column holds
# lists, and `stemming` expects a string it can split.
dfFull['comment_text'] = dfFull['comment_text'].apply(stemming)

In [20]:
# Inspect the stemmed token lists of the first comments.
dfFull['comment_text'].head(5)

id
0000997932d777bf    [explan, whi, the, edit, made, under, my, user...
000103f0d9cfb60f    [daww, he, match, thi, background, colour, im,...
000113f07ec002fd    [hey, man, im, realli, not, tri, to, edit, war...
0001b41b1c6bb37e    [more, i, cant, make, ani, real, suggest, on, ...
0001d958c54c6e35    [you, sir, are, my, hero, ani, chanc, you, rem...
Name: comment_text, dtype: object

## Build a bag of words and remove stop words

In [21]:
# Corpus-wide occurrence count of every token, over train and test comments together.
total_word_count = collections.Counter()
for txt in dfFull['comment_text']:
    # `update` adds this comment's tokens in place. The previous
    # `+= collections.Counter(txt)` produced the same counts, but re-scanned the
    # whole accumulated counter on every comment to drop non-positive entries.
    total_word_count.update(txt)

# Total number of tokens in the corpus.
total_words = sum(total_word_count.values())
print(total_words)

19783301


In [22]:
# Bounds on a word's corpus-wide count for it to enter the vocabulary:
# the lower bound drops rare words, the upper bound drops the most frequent ones (stop words).
min_coverage = 200
max_coverage = 50000

total_vocab = {
    word
    for word, count in total_word_count.items()
    if min_coverage <= count <= max_coverage
}
print(len(total_vocab))

4555


## TF-IDF

In [23]:
# For every comment id: the in-comment count of each vocabulary word it contains.
word_count_per_id = {
    comment_id: word_count_per_comment(tokens, total_vocab)
    for comment_id, tokens in zip(dfFull.index, dfFull['comment_text'])
}

In [24]:
# Weight of a word in a comment = its count in that comment divided by its
# count in the whole corpus, so corpus-wide frequent words are down-weighted.
# NOTE(review): this is a simplified weighting, not the textbook TF-IDF formula
# (no document frequency and no logarithm are involved).
tfidf_per_id = {
    comment_id: {word: (count / total_word_count[word]) for word, count in word_counts.items()}
    for comment_id, word_counts in word_count_per_id.items()
}

## Organize the words as feature columns in a data frame

In [25]:
# Build the comments x words matrix. The outer dict keys (comment ids) become
# columns, hence the transpose to get one row per comment; a word that does not
# occur in a comment is missing there and gets the weight 0.0.
wordsDf = pd.DataFrame(tfidf_per_id).T.fillna(0.0)

In [26]:
# Attach the length features, the train/test flag and the labels to the word matrix.
# The assignment aligns rows on the index (the comment id) of both frames.
columnsToAdd = ['comment_len_total','comment_len_words','is_train'] + new_label_list
wordsDf[columnsToAdd] = dfFull[columnsToAdd]

In [27]:
# Preview the feature frame: one column per vocabulary word plus the columns added above.
display(wordsDf.head(5))
print(wordsDf.shape)

Unnamed: 0,<,<long>,>,aa,ab,abandon,abbrevi,abc,abid,abil,...,zu,comment_len_total,comment_len_words,is_train,toxic_label,severe_toxic_label,obscene_label,threat_label,insult_label,identity_hate_label
0000997932d777bf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,264,43,1,0,0,0,0,0,0
000103f0d9cfb60f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,112,17,1,0,0,0,0,0,0
000113f07ec002fd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,233,42,1,0,0,0,0,0,0
0001b41b1c6bb37e,0.0,7e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,622,113,1,0,0,0,0,0,0
0001d958c54c6e35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,67,13,1,0,0,0,0,0,0


(312735, 4564)


In [28]:
# Model features: every column except the train/test flag and the label columns.
non_feature_columns = ['is_train'] + new_label_list
featuresToSelect = [column for column in wordsDf.columns if column not in non_feature_columns]

## Check if the presence of a word in a comment is indicative

* As the checks below show, the presence of certain words in a comment is very indicative of whether the comment is violent or not.

In [29]:
# A sanity check - does the word 'fuck' increase the likelihood of a comment being toxic?
# Compares the share of toxic labels among comments that contain the stem
# (weight > 0) with the share among comments that do not (weight == 0).
print(f'Total count of the word "fuck": {total_word_count["fuck"]}')
print(f'Percent of comments with the word "fuck" that were labeled as toxic: {round(wordsDf[wordsDf["fuck"] > 0]["toxic_label"].mean()*100,2)}%')
print(f'Percent of comments without the word "fuck" that were labeled as toxic: {round(wordsDf[wordsDf["fuck"] == 0]["toxic_label"].mean()*100,2)}%')

Total count of the word "fuck": 45241
Percent of comments with the word "fuck" that were labeled as toxic: 94.28%
Percent of comments without the word "fuck" that were labeled as toxic: 7.45%


In [30]:
# Per comment: the set of distinct vocabulary words it contains.
dfFull['comment_text_set'] = dfFull['comment_text'].apply(lambda words: total_vocab.intersection(words))
liftsDf = make_lifts_reports_based_on_column_that_contains_set_of_items(dfFull, 'comment_text_set', 'toxic_label')

# Among words that appear in at least 100 labeled comments, show the ten with
# the highest and the ten with the lowest likelihood ratio for the toxic label.
frequent_words_by_lift = liftsDf[liftsDf['total_Feature=1'] >= 100].sort_values(by='likelihoodRatio', ascending=False)
display(frequent_words_by_lift.head(10))
display(frequent_words_by_lift.tail(10))

Unnamed: 0,oddsRatio,likelihoodRatio,Feature=1_Label=1,Feature=1_Label=0,Feature=0_Label=1,Feature=0_Label=0,total_Feature=1
motherfuck,1067.391845,1051.842912,223.0,2.0,15071.0,144275.0,225.0
fuckin,559.867355,553.436032,176.0,3.0,15118.0,144274.0,179.0
cocksuck,469.997532,466.961652,99.0,2.0,15195.0,144275.0,101.0
fucker,302.474951,298.729676,190.0,6.0,15104.0,144271.0,196.0
faggot,169.348175,164.361793,453.0,26.0,14841.0,144251.0,479.0
fuck,204.865196,155.61177,3695.0,224.0,11599.0,144053.0,3919.0
cock,115.917503,113.565654,313.0,26.0,14981.0,144251.0,339.0
bitch,103.188295,97.983334,779.0,75.0,14515.0,144202.0,854.0
pussi,90.508486,89.618903,152.0,16.0,15142.0,144261.0,168.0
asshol,90.809527,87.121782,628.0,68.0,14666.0,144209.0,696.0


Unnamed: 0,oddsRatio,likelihoodRatio,Feature=1_Label=1,Feature=1_Label=0,Feature=0_Label=1,Feature=0_Label=0,total_Feature=1
talklist,0.0,0.0,0.0,129.0,15294.0,144148.0,129.0
wikipediafil,0.0,0.0,0.0,145.0,15294.0,144132.0,145.0
wpr,0.0,0.0,0.0,348.0,15294.0,143929.0,348.0
paramet,0.0,0.0,0.0,166.0,15294.0,144111.0,166.0
backlog,0.0,0.0,0.0,104.0,15294.0,144173.0,104.0
stylecolor,0.0,0.0,0.0,183.0,15294.0,144094.0,183.0
styleverticalaligntop,0.0,0.0,0.0,221.0,15294.0,144056.0,221.0
wikipediafair,0.0,0.0,0.0,166.0,15294.0,144111.0,166.0
backgroundcolorf,0.0,0.0,0.0,131.0,15294.0,144146.0,131.0
doctrin,0.0,0.0,0.0,135.0,15294.0,144142.0,135.0


# Part 3: Modeling

## Dimension reduction - PCA

* ~4500 features were too many for sklearn to handle on my computer, so I reduced the dimensionality with PCA.

In [31]:
# Training feature matrix, standardized column by column.
# The fitted scaler is reused later to transform the test features.
is_train_row = wordsDf['is_train'] == 1
X = wordsDf.loc[is_train_row, featuresToSelect].reset_index(drop=True)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.astype(float))

In [32]:
# Project the standardized training features onto 100 principal components.
# A fixed random_state makes the reduction reproducible across runs: for a
# matrix of this size scikit-learn may pick a randomized SVD solver, whose
# output otherwise changes from run to run.
PCA_N_COMPONENTS = 100
PCA_RANDOM_STATE = 42
pca = PCA(n_components=PCA_N_COMPONENTS, random_state=PCA_RANDOM_STATE)
X_reduced = pca.fit_transform(X_scaled)

## Fit the model to the data

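* I chose logistic regression because the features describe which words appear in each comment, so a linear model learns a weight per feature - how much does it help to predict whether the comment is toxic? Note that the model below is fitted on the PCA-reduced data, so the learned weights apply to principal components (combinations of words) rather than to individual words.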

In [33]:
clfModel = LogisticRegression()
# Hyper-parameter grid: L1 or L2 penalty, combined with 10 log-spaced values of C between 1e-4 and 1e2.
paramGrid={'penalty':['l1','l2'], 'C':np.logspace(-4, 2, 10), 'solver': ['liblinear'], 'max_iter':[500]}

# Per label: the best estimator found by the search, and its cross-validated AUC.
allFittedModels = {}
allModelsScores = {}

# Select the training rows' labels once, instead of re-filtering the whole
# (wide) feature frame for every label inside the loop.
train_labels = wordsDf.loc[wordsDf['is_train'] == 1, new_label_list].reset_index(drop=True)

# One independent binary classifier is tuned per label.
for label in new_label_list:
    curr_y = train_labels[label].astype(int)

    # NOTE(review): the `iid` argument is reported as removed from GridSearchCV in
    # newer scikit-learn releases - confirm against the installed version before upgrading.
    clfTuned = GridSearchCV(clfModel, param_grid=paramGrid, cv=3, scoring=my_auc, iid=False)
    clfTuned.fit(X_reduced, curr_y)
    allFittedModels[label] = clfTuned.best_estimator_
    allModelsScores[label] = {'AUC': clfTuned.best_score_}

In [34]:
# One row per label with the best cross-validated AUC found by the grid search.
allModelsResultsDf = pd.DataFrame(allModelsScores).T
display(allModelsResultsDf)

Unnamed: 0,AUC
toxic_label,0.874639
severe_toxic_label,0.916074
obscene_label,0.869305
threat_label,0.900481
insult_label,0.882773
identity_hate_label,0.88661


## Predict the labels of the test dataset

In [35]:
# Test feature matrix, passed through the scaler and the PCA that were fitted on the training data.
# The index (comment id) is kept, because it is needed for the submission file.
is_test_row = wordsDf['is_train'] == 0
X_test = wordsDf.loc[is_test_row, featuresToSelect]
X_test_scaled = scaler.transform(X_test.astype(float))
X_test_reduced = pca.transform(X_test_scaled)

In [36]:
# Start the submission frame with the test comment ids, in the row order of X_test.
predictionsDf = pd.DataFrame({'id': X_test.index})

In [37]:
# For every label, store the predicted probability of the positive class
# (second column of predict_proba) for each test comment.
for label in new_label_list:
    class_probabilities = allFittedModels[label].predict_proba(X_test_reduced)
    predictionsDf[label] = class_probabilities[:, 1]

In [38]:
# Strip the '_label' suffix so the columns carry the original label names again.
original_names = [column.replace('_label', '') for column in predictionsDf.columns]
predictionsDf.columns = original_names
display(predictionsDf.head(5))
print(predictionsDf.shape)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.984497,0.016741,0.657404,0.006302,0.642728,0.034293
1,0000247867823ef7,0.100735,0.016242,0.071947,0.007063,0.066894,0.022466
2,00013b17ad220c46,0.146792,0.022061,0.090139,0.007673,0.087786,0.024616
3,00017563c3f7919a,0.030779,0.008154,0.022954,0.005574,0.030029,0.019585
4,00017695ad8997eb,0.161995,0.02387,0.101455,0.008222,0.102282,0.025097


(153164, 7)


In [39]:
# Save the results to a csv file.
# The row index is not written, so the file holds the 'id' column followed by one probability column per label.
predictionsDf.to_csv('NLP_toxic_comments_submission.csv', index=False)