# Data Source - Toxic Comment Classification Challenge (Kaggle)

In [34]:
import re
import string

import pandas as pd, numpy as np

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import TfidfVectorizer

import config as cfg

In [13]:
# Load the training split; the CSV location is taken from the project config module.
train_csv_path = cfg.RESOURCE.toxic_comment_classification_train
train = pd.read_csv(train_csv_path)
train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [22]:
# The six label columns used to decide whether a comment counts as toxic.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Keep only comments flagged with at least one label. `.copy()` detaches the
# subset from `train`, so the assignment below cannot touch the full frame.
train_toxic = train[train[label_cols].max(axis=1) >= 1].copy()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object, emits a
# FutureWarning on pandas 2.x, and leaves the frame unchanged under Copy-on-Write.
train_toxic['comment_text'] = train_toxic['comment_text'].fillna("unknown")
train_toxic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16225 entries, 6 to 159554
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             16225 non-null  object
 1   comment_text   16225 non-null  object
 2   toxic          16225 non-null  int64 
 3   severe_toxic   16225 non-null  int64 
 4   obscene        16225 non-null  int64 
 5   threat         16225 non-null  int64 
 6   insult         16225 non-null  int64 
 7   identity_hate  16225 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 1.1+ MB


In [6]:
# Read the test comments and their separately stored labels, then join the two
# on the shared `id` column; the inner join keeps only ids present in both files.
test = pd.read_csv(cfg.RESOURCE.toxic_comment_classification_test)
test_labels = pd.read_csv(cfg.RESOURCE.toxic_comment_classification_test_labels)
test = pd.merge(test, test_labels, on='id', how='inner')

# Preview the merged frame so that re-running the cell reproduces the table
# saved as its output.
test.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1


In [25]:
# Keep only test comments flagged with at least one label. Rows whose labels are
# all below 1 (such as the -1 rows visible in the preview) fail the `>= 1` check.
# `.copy()` detaches the subset from `test` before it is modified.
test_toxic = test[test[label_cols].max(axis=1) >= 1].copy()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object, emits a
# FutureWarning on pandas 2.x, and leaves the frame unchanged under Copy-on-Write.
test_toxic['comment_text'] = test_toxic['comment_text'].fillna("unknown")
test_toxic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6243 entries, 21 to 153155
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             6243 non-null   object
 1   comment_text   6243 non-null   object
 2   toxic          6243 non-null   int64 
 3   severe_toxic   6243 non-null   int64 
 4   obscene        6243 non-null   int64 
 5   threat         6243 non-null   int64 
 6   insult         6243 non-null   int64 
 7   identity_hate  6243 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 439.0+ KB


In [31]:
# Stack the toxic-only train and test rows into a single corpus. The original
# row labels are kept, so index values coming from the two splits may repeat.
all_doc = pd.concat(objs=(train_toxic, test_toxic), axis=0)
all_doc.info()
all_doc.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22468 entries, 6 to 153155
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             22468 non-null  object
 1   comment_text   22468 non-null  object
 2   toxic          22468 non-null  int64 
 3   severe_toxic   22468 non-null  int64 
 4   obscene        22468 non-null  int64 
 5   threat         22468 non-null  int64 
 6   insult         22468 non-null  int64 
 7   identity_hate  22468 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 1.5+ MB


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


In [52]:
# Character class of every ASCII punctuation mark plus a few typographic symbols.
# re.escape keeps `\`, `]`, `^` and `-` literal inside the brackets; interpolating
# string.punctuation unescaped turned `\]` into an escaped bracket and left the
# backslash itself unmatched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into tokens, treating every punctuation mark as its own token.

    Each character matched by ``re_tok`` is padded with spaces, then the string
    is split on whitespace.

    Args:
        s: Text to tokenize.

    Returns:
        List of token strings; an empty list for empty or whitespace-only input.
    """
    return re_tok.sub(r' \1 ', s).split()

# Number of documents in the combined corpus.
n = all_doc.shape[0]

# TF-IDF over word bigrams only, using the vectorizer's built-in tokenization
# (tokenizer=None) and English stop-word list.
# The idf/tf switches are real booleans: recent scikit-learn releases validate
# parameter types and reject the integer 1 for boolean options.
vec = TfidfVectorizer(
    ngram_range=(2, 2),
    tokenizer=None,
    max_features=10000,      # cap on vocabulary size
    min_df=3,                # ignore bigrams seen in fewer than 3 documents
    max_df=0.9,              # ignore bigrams seen in more than 90% of documents
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)
all_tf_idf = vec.fit_transform(all_doc['comment_text'])

In [49]:
# Pull each label column out of the combined corpus as a plain array, one
# variable per label, in the same order as the column names listed below.
(
    all_toxic,
    all_severe_toxic,
    all_obscene,
    all_threat,
    all_insult,
    all_identity_hate,
) = (
    all_doc[column].values
    for column in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
)

In [61]:
def _mi_scores(target):
    """Return the mutual-information score of every TF-IDF feature against ``target``.

    Uses the same estimator settings for every label (3 neighbours, fixed seed
    2020) so the six rankings are computed consistently.
    """
    return mutual_info_classif(all_tf_idf, target, n_neighbors=3, random_state=2020)


# One score vector per label, each aligned with the columns of `all_tf_idf`.
mi_toxic = _mi_scores(all_toxic)
mi_severe_toxic = _mi_scores(all_severe_toxic)
mi_obscene = _mi_scores(all_obscene)
mi_threat = _mi_scores(all_threat)
mi_insult = _mi_scores(all_insult)
mi_identity_hate = _mi_scores(all_identity_hate)

In [62]:
# Vocabulary terms as an array, positionally aligned with the TF-IDF columns so
# they can be indexed with the argsort results computed from the MI scores.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer its replacement and fall back only on older installations.
if hasattr(vec, "get_feature_names_out"):
    features = np.asarray(vec.get_feature_names_out())
else:
    features = np.array(vec.get_feature_names())

## (1/6) Mutual information score ranking for "Toxic" (top 100)

In [63]:
# Positions of the 100 highest "toxic" MI scores, best first, then the
# bigrams found at those positions.
top_mi_toxic = np.flip(np.argsort(mi_toxic))[:100]
features[top_mi_toxic]


array(['talk page', 'don know', 'personal attacks', 'don think',
       'user page', 'people like', 'personal attack', 'don want',
       'don care', 'just like', 'edit summary', 'wikipedia org',
       'en wikipedia', 'looks like', 'talk contribs', 'don like',
       'http en', 'user talk', 'http www', 'piece shit', 'look like',
       'feel free', 'comment added', 'unsigned comment',
       'preceding unsigned', 'edit war', 'ip address',
       'editing wikipedia', 'doesn mean', 'feel like', 'discussion page',
       'things like', 'blocked editing', 'org wiki', 'let know',
       'reliable source', 'll just', 'just don', 'does mean',
       'wikipedia just', 'wasting time', 'acting like', 'like know',
       'edit warring', 'allowed edit', 'stop making', 'people just',
       'real life', 'dont know', 'waste time', 'edit page',
       'wikipedia policy', 'just trying', 'reverting edits',
       'just thought', 've seen', 'pretty sure', 'death threats',
       'belong article', 'did 

## (2/6) Mutual information score ranking for "Severe Toxic" (top 100)

In [64]:
# Positions of the 100 highest "severe_toxic" MI scores, best first, then the
# bigrams found at those positions.
top_mi_severe_toxic = np.flip(np.argsort(mi_severe_toxic))[:100]
features[top_mi_severe_toxic]

array(['piece shit', 'fuck fuck', 'suck dick', 'talk page', 'son bitch',
       'mother fucker', 'fuck fucking', 'bitch ass', 'shut fuck',
       'fuck asshole', 'fucking faggot', 'fuck bitch', 'don know',
       'eat shit', 'shit fuck', 'bitch fuck', 'fucking cunt',
       'fucking bitch', 'fuck wikipedia', 'little bitch', 'big fat',
       'people like', 'don care', 'hope die', 'suck cock', 'year old',
       'rot hell', 'mother fucking', 'fucking shit', 'fuck stupid',
       'don fuck', 'burn hell', 'fucking loser', 'fuck mother',
       'cock sucking', 'fucking piece', 'fucking asshole', 'asshole fuck',
       'ip address', 'ass bitch', 'fuck ass', 'stupid fucking',
       'fucking life', 'fucking gay', 'cunt fuck', 'fuck little',
       'fucking cock', 'fuck ur', 'fuck cunt', 'cock sucker',
       'fucking moron', 'just like', 'gay ass', 'user page', 'bitch suck',
       'bitch fucking', 'god damn', 'don want', 'stupid cunt',
       'shit fucking', 'fucking bastard', 'like shit', 

## (3/6) Mutual information score ranking for "Obscene" (top 100)

In [65]:
# Positions of the 100 highest "obscene" MI scores, best first, then the
# bigrams found at those positions.
top_mi_obscene = np.flip(np.argsort(mi_obscene))[:100]
features[top_mi_obscene]

array(['talk page', 'don know', 'piece shit', 'people like',
       'personal attack', 'fuck fuck', 'don care', 'don like',
       'personal attacks', 'don think', 'user page', 'don want',
       'suck dick', 'just like', 'son bitch', 'year old', 'comment added',
       'wikipedia org', 'unsigned comment', 'preceding unsigned',
       'ip address', 'en wikipedia', 'look like', 'http www', 'shut fuck',
       'looks like', 'editing wikipedia', 'mother fucker', 'talk pages',
       'real life', 'org wiki', 'http en', 'fuck wikipedia', 'act like',
       'good faith', 'don understand', 'fucking idiot', 'waste time',
       've seen', 'eat shit', 'acting like', 'dont know', 'let know',
       'fuck fucking', 'user talk', 'suck cock', 'ha ha', 'doesn mean',
       'talk contribs', 'high school', 'don fuck', 'fucking cunt',
       'little bitch', 'just want', 'point view', 'bitch ass',
       'shit fuck', 'dont care', 'hope die', 'god damn', 'edit page',
       'make sure', 'just wanted', 'l

## (4/6) Mutual information score ranking for "Threat" (top 100)

In [66]:
# Positions of the 100 highest "threat" MI scores, best first, then the
# bigrams found at those positions.
top_mi_threat = np.flip(np.argsort(mi_threat))[:100]
features[top_mi_threat]

array(['hope die', 'piece shit', 'going kill', 'im going', 'talk page',
       'burn hell', 'fucking kill', 'll kill', 'shit fuck', 'die die',
       'know live', 'come house', 'gonna kill', 'son bitch',
       'deserve die', 'fuck fucking', 'don know', 'kick ass',
       'die fucking', 'eat shit', 'fuck die', 'mother fucking',
       'im gonna', 'life fucking', 'fucking die', 'mother fucker',
       'house kill', 'hunt kill', 'united states', 'kill ya',
       'fucking faggot', 'people like', 'just fucking', 'time ll',
       'bitch ass', 'painful death', 'kill fucking', 'die hell',
       'll fucking', 'kill gun', 'kill going', 'burn house', 'year old',
       'going come', 'fuckin kill', 'real life', 'fuck fuck', 'shit die',
       'worthless piece', 'fucking life', 'make sure', 'fuck bitch',
       'rot hell', 'bitch fuck', 'little shit', 'fuck ass', 'shoot head',
       'brains splatter', 'splatter ground', 'laugh brains', 'head laugh',
       'going shoot', 'shit fucking', 'just 

## (5/6) Mutual information score ranking for "Insult" (top 100)

In [67]:
# Positions of the 100 highest "insult" MI scores, best first, then the
# bigrams found at those positions.
top_mi_insult = np.flip(np.argsort(mi_insult))[:100]
features[top_mi_insult]

array(['talk page', 'don know', 'piece shit', 'people like', 'don care',
       'personal attacks', 'personal attack', 'don think', 'fuck fuck',
       'don like', 'user page', 'just like', 'don want', 'son bitch',
       'suck dick', 'year old', 'shut fuck', 'wikipedia org',
       'ip address', 'comment added', 'unsigned comment',
       'preceding unsigned', 'look like', 'en wikipedia', 'http www',
       'looks like', 'mother fucker', 'editing wikipedia',
       'fucking idiot', 'talk pages', 'org wiki', 'real life', 'http en',
       'don understand', 'act like', 'fuck fucking', 'fuck wikipedia',
       'acting like', 'eat shit', 'let know', 'good faith', 've seen',
       'waste time', 'dont know', 'user talk', 'hope die', 'bitch ass',
       'fucking cunt', 'shit fuck', 'don fuck', 'high school', 'ha ha',
       'doesn mean', 'god damn', 'suck cock', 'little bitch', 'll just',
       'talk contribs', 'just want', 'dont care', 'make sure',
       'fuck bitch', 'just wanted', 'jus

## (6/6) Mutual information score ranking for "Identity Hate" (top 100)

In [68]:
# Positions of the 100 highest "identity_hate" MI scores, best first, then the
# bigrams found at those positions.
top_mi_identity_hate = np.flip(np.argsort(mi_identity_hate))[:100]
features[top_mi_identity_hate]

array(['piece shit', 'talk page', 'fucking gay', 'don know', 'bitch ass',
       'fucking faggot', 'people like', 'just like', 'gay ass',
       'suck dick', 'son bitch', 'fuck fuck', 'black people', 'don like',
       'fuck fucking', 'mother fucker', 'eat shit', 'fuck faggot',
       'gay gay', 'fuck nigger', 'cock sucker', 'shit fuck', 'don care',
       'fucking life', 'suck cock', 'shut fuck', 'user page',
       'nigga fuck', 'don want', 'nigger nigger', 'big fat', 'year old',
       'bitch fuck', 'little bitch', 'look like', 'looks like',
       'fucking bitch', 'white trash', 'personal attack', 'gay faggot',
       'hope die', 'like gay', 'fucking nigger', 'fuck wikipedia',
       'personal attacks', 'shit fucking', 'cunt fuck', 'fucking stupid',
       'fucking cunt', 'nigger lover', 'ass bitch', 'fuck nigga',
       'ass nigga', 'fuck bitch', 'fucking idiot', 'white people',
       'fuck think', 'ur gay', 'did delete', 'life fucking',
       'fuck stupid', 'faggot ass', 'neo n