**Jigsaw Rate Severity of Toxic Comments**

Raquel Alcaraz

Lucie Bertiere

# Libraries

In [1]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from collections import Counter

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
from keras import layers

from sklearn.naive_bayes import MultinomialNB
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

import lightgbm as ltb

# Importing the data

In [2]:
# Comments that have to be scored; they are cleaned and used for the prediction further down
comments_to_score = pd.read_csv(
    "/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"
)

In [3]:
# Preview the first five rows (head() defaults to 5)
comments_to_score.head()

Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...


# Cleaning the data

## Function

In [4]:
# Fetch each NLTK resource only when it is not already installed. The recorded
# output of this cell shows the unconditional downloads failing with a
# name-resolution error, so a download is attempted only when the resource is
# actually missing.
for resource_path, package_name in (
    ("tokenizers/punkt", "punkt"),
    ("corpora/stopwords", "stopwords"),
    ("corpora/wordnet", "wordnet"),
):
    try:
        # Raises LookupError when the resource is not found in the NLTK data path
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name, quiet=True)

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


False

In [5]:
# English stop words: very frequent words that are not really relevant and that
# we do not want in the clean text.
stop_words = set(stopwords.words('english'))
# Lemmatizer used to reduce words to their base form (rocks becomes rock...).
lem = WordNetLemmatizer()

# The patterns are built once here rather than inside cleaning_text, which is
# applied to every row of several dataframes.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_DIGITS_RE = re.compile(r'\d+')
_HASHTAG_RE = re.compile(r'#\w+')
_MENTION_RE = re.compile(r'@\w+')
# \d is already included in \w, so this matches exactly what [^\w\s\d] matched.
_PUNCTUATION_RE = re.compile(r"[^\w\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def cleaning_text(text):
    """Turn a raw comment into a lower-case string of lemmatized tokens.

    Steps, in order: lower-casing; removal of URLs, HTML tags, digits,
    hashtags, mentions and punctuation; collapsing of whitespace; stop-word
    removal; lemmatization.

    Parameters
    ----------
    text : str
        Raw comment. Any non-string value (for example the NaN that pandas
        uses for a missing cell) is treated as an empty comment.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces, or "" when
        nothing is left.
    """
    # Without this guard a missing value would reach text.lower() and raise
    # AttributeError, aborting the whole .apply() over the column.
    if not isinstance(text, str):
        return ""

    # to put everything in lower case (I becomes i)
    text = text.lower()

    # to remove urls and html tags
    text = _URL_RE.sub(' ', text)
    text = _HTML_TAG_RE.sub(' ', text)

    # to remove digits
    text = _DIGITS_RE.sub(' ', text)

    # to remove hashtags and mentions
    # NOTE: digits were already blanked above, so a tag such as "#2020vote" is
    # only partly removed ("vote" survives); the original order is kept on purpose.
    text = _HASHTAG_RE.sub(' ', text)
    text = _MENTION_RE.sub(' ', text)

    # to remove punctuation
    text = _PUNCTUATION_RE.sub(' ', text)

    # to remove extra white space
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # removing stop words
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # lemmatize each remaining token and rebuild a single string
    return " ".join(lem.lemmatize(token) for token in tokens)
  

## Comments to score

In [6]:
# Store the cleaned version of every comment in a new column
comments_to_score['clean_text'] = comments_to_score['text'].map(cleaning_text)

In [7]:
# Compare raw and cleaned text on the first five rows
comments_to_score.head()

Unnamed: 0,comment_id,text,clean_text
0,114890,"""\n \n\nGjalexei, you asked about whether ther...",gjalexei asked whether anti editorializing pol...
1,732895,"Looks like be have an abuser , can you please ...",look like abuser please look thanks
2,1139051,I confess to having complete (and apparently b...,confess complete apparently blissful ignorance...
3,1434512,"""\n\nFreud's ideas are certainly much discusse...",freud idea certainly much discussed today woul...
4,2084821,It is not just you. This is a laundry list of ...,laundry list stupid allegation scooped god kno...


# Idea 2

Using the dataset of a previous Jigsaw competition, the Toxic Comment Classification Challenge: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/

In [8]:
# Load the three files of the "Toxic Comment Classification Challenge" dataset
train2 = pd.read_csv(
    "/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv"
)
test_labels = pd.read_csv(
    "/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv"
)
test2 = pd.read_csv(
    "/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv"
)

In [9]:
# First ten rows; each label column is coded as 1 when the comment is toxic, severe toxic, etc.
train2.iloc[:10]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [10]:
# Rows flagged as toxic; .shape is (number of such comments, number of columns)
nb_toxic = train2.loc[train2["toxic"] == 1]
nb_toxic.shape

(15294, 8)

In [11]:
# Rows flagged as severe toxic; .shape is (number of such comments, number of columns)
nb_severe_toxic = train2.loc[train2["severe_toxic"] == 1]
nb_severe_toxic.shape

(1595, 8)

In [12]:
# Rows flagged as obscene; .shape is (number of such comments, number of columns)
nb_obscene = train2.loc[train2["obscene"] == 1]
nb_obscene.shape

(8449, 8)

In [13]:
# Rows flagged as threat; .shape is (number of such comments, number of columns)
nb_threat = train2.loc[train2["threat"] == 1]
nb_threat.shape

(478, 8)

In [14]:
# Rows flagged as insult; .shape is (number of such comments, number of columns)
nb_insult = train2.loc[train2["insult"] == 1]
nb_insult.shape

(7877, 8)

In [15]:
# Rows flagged as identity hate; .shape is (number of such comments, number of columns)
nb_id_hate = train2.loc[train2["identity_hate"] == 1]
nb_id_hate.shape

(1405, 8)

In [16]:
# Preview the labels file of the previous competition's test set
test_labels.head(5)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [17]:
# Preview the comments of the previous competition's test set
test2.head(5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


### Cleaning data

In [18]:
# Add the cleaned version of every training comment as a new column, then preview it
train2['clean_text'] = train2['comment_text'].map(cleaning_text)
train2.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation edits made username hardcore metal...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,aww match background colour seemingly stuck th...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man really trying edit war guy constantly ...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,make real suggestion improvement wondered sect...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chance remember page


In [19]:
# Keep only the observations whose severe_toxic label equals 1
train2.loc[train2["severe_toxic"] == 1]

# In the displayed rows, every severe toxic comment is also flagged as toxic (both columns equal 1)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,cocksucker piss around work
55,0020e7119b96eeeb,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0,stupid peace shit stop deleting stuff asshole ...
181,006e87872c8b370c,you are a stupid fuck \n\nand your mother's cu...,1,1,1,0,1,0,stupid fuck mother cunt stink
442,01208d2b76624130,Hi \n\nIm a fucking bitch.\n\n50.180.208.181,1,1,1,0,1,0,hi im fucking bitch
579,018663f910e0bfe6,What a motherfucking piece of crap those fuckh...,1,1,1,0,1,0,motherfucking piece crap fuckhead blocking u
...,...,...,...,...,...,...,...,...,...
159096,f871b05d4caa6f20,"You filthy, stinking crow! \n\nI am back! Dirt...",1,1,1,0,1,0,filthy stinking crow back dirty crow better de...
159099,f885a3e2d779342e,now\n\nyou fucking pathetic moron. \n\ncc: Ji...,1,1,1,0,1,0,fucking pathetic moron cc jimbo wale rootmyass
159281,fb726deec64157bd,LoL!! \n\nyou're GAY!! you will never know how...,1,1,1,0,1,1,lol gay never know good feel fuck woman as
159312,fbf20e312cd4a78d,"Walter Mercado \n\nAntonio, quite frankly, you...",1,1,1,0,1,0,walter mercado antonio quite frankly fucker co...


In [20]:
# Add the cleaned version of every test comment as a new column, then preview it
test2['clean_text'] = test2['comment_text'].map(cleaning_text)
test2.head()

Unnamed: 0,id,comment_text,clean_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,yo bitch ja rule succesful ever whats hating s...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,rfc title fine imo
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",source zawe ashton lapland
3,00017563c3f7919a,":If you have a look back at the source, the in...",look back source information updated correct f...
4,00017695ad8997eb,I don't anonymously edit articles at all.,anonymously edit article


## SVD

In [21]:
# Score = row-wise sum of the six category labels.
# skipna=False keeps the behaviour of chained "+": a missing label gives a missing score.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train2['score'] = train2[label_columns].sum(axis=1, skipna=False)

### Features and target

In [22]:
# Model input (cleaned text) and target (score) taken from the training frame
X_svd, Y_svd = train2['clean_text'], train2['score']

### TF-IDF

In [23]:
# TF-IDF over unigrams and bigrams, with min_df=5 and idf smoothing
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    smooth_idf=True,
)
# Learn the vocabulary on the training text and vectorize it in one pass
X2 = vectorizer.fit_transform(X_svd)

In [24]:
# Fit the 1000-component truncated SVD on the TF-IDF matrix (fit() returns the estimator)
SVD = TruncatedSVD(n_components=1000, random_state=23).fit(X2)

# Project the training matrix with the fitted decomposition
Xtr_SVD = SVD.transform(X2)

### LGBM

**Model**

In [25]:
# LightGBM regressor trained on the SVD features to predict the score.
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction) is only
# used when `subsample_freq` (bagging_freq) is non-zero, and no `subsample_freq` is
# passed here - confirm whether row subsampling was intended.
model_lgbm = ltb.LGBMRegressor(
    learning_rate=0.005,
    num_leaves=31,
    max_depth=22,
    colsample_bytree=0.7,
    subsample=0.6,
    reg_alpha=0.5,
    reg_lambda=10,
    n_estimators=70,
    random_state=23,
)
model_lgbm.fit(Xtr_SVD, Y_svd)

LGBMRegressor(colsample_bytree=0.7, learning_rate=0.005, max_depth=22,
              n_estimators=70, random_state=23, reg_alpha=0.5, reg_lambda=10,
              subsample=0.6)

### Prediction

In [26]:
# Reuse the already-fitted vectorizer and SVD (transform only, no refit) on the
# cleaned comments to score, so they get the same features as the training data.
Z = vectorizer.transform(comments_to_score['clean_text'])
X_SVD = SVD.transform(Z)
# Predicted score for each comment to score
pred = model_lgbm.predict(X_SVD)
pred

array([0.17309424, 0.1726319 , 0.16671379, ..., 0.19585716, 1.00665581,
       0.18848701])

### Submission

In [27]:
# Build the submission table: the comment ids plus their predicted score
results = comments_to_score[["comment_id"]].assign(score=pred)

In [28]:
# Check the first five rows of the submission table
results.head()

Unnamed: 0,comment_id,score
0,114890,0.173094
1,732895,0.172632
2,1139051,0.166714
3,1434512,0.166714
4,2084821,0.241119


In [29]:
# Write the submission file; index=False leaves the dataframe index out of the CSV
results.to_csv(path_or_buf="submission.csv", index=False)