# Toxic comment analysis

---


## Load libraries


In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import nltk

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/louislecouturier/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/louislecouturier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load Dataset


In [3]:
data = pd.read_csv("datasets/train.csv")
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


### Clean the data


In [4]:
print("Number of missing values in each column :")
print(data.isnull().sum())

Number of missing values in each column :
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


We will remove the rows with missing values and remove the id column as it is not relevant.


In [5]:
data = data.dropna()
del data["id"]

Our data is now clean and ready for analysis !


In [6]:
data.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
data[data["obscene"] == 1]

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
42,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0
51,GET FUCKED UP. GET FUCKEEED UP. GOT A DRINK T...,1,0,1,0,0,0
55,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0
...,...,...,...,...,...,...,...
159411,Fat piece of shit \n\nyou obese piece of shit....,1,0,1,0,1,0
159493,FUCKING FAGGOT \n\nLOLWAT.,1,0,1,0,1,0
159494,"""\n\n our previous conversation \n\nyou fuckin...",1,0,1,0,1,1
159541,Your absurd edits \n\nYour absurd edits on gre...,1,0,1,0,1,0


### Loose analysis data

We will first remove the `toxic`, `severe_toxic`, `obscene`, `threat`, `insult`, `identity_hate` columns to simplify the problem.


In [8]:
data.drop(
    ["severe_toxic", "obscene", "threat", "insult", "identity_hate"],
    axis=1,
    inplace=True,
)
data.head()

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


### Clean the comments


In [9]:
data["comment_text"].head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [10]:
from helpers.data.text_manipulation import TextManipulation

text_manipulator = TextManipulation()

comments = data["comment_text"]
comments = comments.map(lambda x: text_manipulator.remove_contractions(x))
comments = comments.map(lambda x: text_manipulator.remove_abbreviations(x))
comments = comments.map(lambda x: text_manipulator.remove_punctuation(x))

comments

0         Explanation Why the edits made under my userna...
1         Daww He matches this background colour I am se...
2         Hey man I am really not trying to edit war It ...
3          More I cannot make any real suggestions on im...
4         You sir are my hero Any chance you remember wh...
                                ...                        
159566    And for the second time of asking when your vi...
159567    You should be ashamed of yourself That is a ho...
159568    Spitzer Umm there is no actual article for pro...
159569    And it looks like it was actually you who put ...
159570     And  I really do not think you understand I c...
Name: comment_text, Length: 159571, dtype: object

### Tokenize the comments

In [11]:
comments = comments[:5000]

tokens = text_manipulator.tokenize(comments)

In [12]:
print(tokens)



### Remove the stopwords

In [13]:
stop_words = nltk.corpus.stopwords.words("english")

In [14]:
# Remove the stop words
tokens = [text_manipulator.remove_stopwords(x, stop_words) for x in tokens]

tokens

[['Explanation',
  'Why',
  'edits',
  'made',
  'username',
  'Hardcore',
  'Metallica',
  'Fan',
  'reverted',
  'They',
  'vandalisms',
  'closure',
  'GAs',
  'I',
  'voted',
  'New',
  'York',
  'Dolls',
  'FAC',
  'And',
  'please',
  'remove',
  'template',
  'talk',
  'page',
  'since',
  'I',
  'retired',
  'now892053827'],
 ['Daww',
  'He',
  'matches',
  'background',
  'colour',
  'I',
  'seemingly',
  'stuck',
  'Thanks',
  'talk',
  '2151',
  'January',
  '11',
  '2016',
  'UTC'],
 ['Hey',
  'man',
  'I',
  'really',
  'trying',
  'edit',
  'war',
  'It',
  'guy',
  'constantly',
  'removing',
  'relevant',
  'information',
  'talking',
  'edits',
  'instead',
  'talk',
  'page',
  'He',
  'seems',
  'care',
  'formatting',
  'actual',
  'info'],
 ['More',
  'I',
  'make',
  'real',
  'suggestions',
  'improvement',
  'I',
  'wondered',
  'section',
  'statistics',
  'later',
  'subsection',
  'types',
  'accidents',
  'I',
  'think',
  'references',
  'may',
  'need',
  

## Analyse the text

### TF_IDF

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(tokens)

X_train, X_test, y_train, y_test = train_test_split(X, y)


AttributeError: 'list' object has no attribute 'lower'