**Jigsaw Rate Severity of Toxic Comments**

Raquel Alcaraz

Lucie Bertiere

# Libraries

In [39]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from google.colab import drive, files

import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
from keras import layers

# Importing the data

In [3]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the CSV files in the next cell are read from relative paths, not from
# /content/drive -- confirm whether this mount is still needed.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Pairwise annotations (worker, less_toxic, more_toxic); despite the file name,
# this notebook uses them as its training data
train = pd.read_csv("validation_data.csv")
# Comments for which a toxicity score has to be predicted
comments_to_score = pd.read_csv("comments_to_score.csv")

In [None]:
# Preview the first rows of the comments that have to be scored
comments_to_score.head(5)

Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...


In [None]:
# Preview the first rows of the pairwise annotations
train.head(5)

Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu..."
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist"


# Cleaning the data

## Function

In [6]:
# NLTK resources needed by the text cleaning function:
# - punkt: tokenizer models (used by word_tokenize)
# - stopwords: stop-word lists (used by stopwords.words)
# - wordnet: lexical database (used by WordNetLemmatizer)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [7]:
# English stop words: very frequent words that carry little signal and that we
# do not want to keep in the clean text
stop_words = set(stopwords.words('english'))
# Lemmatizer used to reduce every token to its base form (rocks becomes rock...)
lem = WordNetLemmatizer()

# The patterns are compiled once here instead of on every call of cleaning_text
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_HASHTAG_RE = re.compile(r'#\w+')
_MENTION_RE = re.compile(r'@\w+')
_DIGITS_RE = re.compile(r'\d+')
# `\w` also matches the underscore, so it is listed explicitly to be removed too
_PUNCTUATION_RE = re.compile(r'[^\w\s]|_')
_WHITESPACE_RE = re.compile(r'\s+')

# Order matters: hashtags and mentions are removed BEFORE digits, otherwise a tag
# such as "#2020election" is cut into "# election" and the word survives.
_REMOVAL_PATTERNS = (
    _URL_RE,
    _HTML_TAG_RE,
    _HASHTAG_RE,
    _MENTION_RE,
    _DIGITS_RE,
    _PUNCTUATION_RE,
)


def cleaning_text(text):
    """Normalize a raw comment into a space-separated string of lemmatized tokens.

    Steps, in order: lower-casing; removal of URLs, HTML tags, hashtags,
    mentions, digits and punctuation (underscore included); whitespace
    collapsing; tokenization; removal of English stop words; lemmatization.

    Parameters
    ----------
    text : str
        The raw comment. Any non-string value (for example the None/NaN that
        pandas uses for missing text) is treated as an empty comment.

    Returns
    -------
    str
        The cleaned comment; an empty string when nothing is left.
    """
    # Missing values are not strings and would crash on `.lower()`
    if not isinstance(text, str):
        return ""

    # to put everything in lower case (I becomes i)
    text = text.lower()

    # every removed span is replaced by a space so neighbouring words stay separated
    for pattern in _REMOVAL_PATTERNS:
        text = pattern.sub(' ', text)

    # to remove extra white space
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # removing stop words
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # lemmatize
    return " ".join(lem.lemmatize(token) for token in tokens)
  

## Comments to score

In [8]:
# Add a cleaned version of every comment to score in a new 'clean_text' column
comments_to_score['clean_text'] = comments_to_score['text'].apply(cleaning_text)

In [None]:
# Compare the raw text with its cleaned version on the first rows
comments_to_score.head(5)

Unnamed: 0,comment_id,text,clean_text
0,114890,"""\n \n\nGjalexei, you asked about whether ther...",gjalexei asked whether anti editorializing pol...
1,732895,"Looks like be have an abuser , can you please ...",look like abuser please look thanks
2,1139051,I confess to having complete (and apparently b...,confess complete apparently blissful ignorance...
3,1434512,"""\n\nFreud's ideas are certainly much discusse...",freud idea certainly much discussed today woul...
4,2084821,It is not just you. This is a laundry list of ...,laundry list stupid allegation scooped god kno...


# Idea 1 

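For each comment, compute the fraction of pairwise comparisons in which a worker judged it to be the more toxic of the two comments, and use that fraction as its toxicity score.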


## Train dataset

### Putting all the comments into the same column

**Less toxic data**

In [9]:
# Less toxic side of every comparison: drop the other comment and rename the
# column to "text" so that both sides can be concatenated afterwards
less_toxic_data = (
    train
    .drop(columns=['more_toxic'])
    .rename(columns={"less_toxic": "text"})
)
# toxicity = 0: the worker judged this comment to be the less toxic one of the pair
less_toxic_data['toxicity'] = 0

In [10]:
# Preview the less toxic comments with their 0 label
less_toxic_data.head(5)

Unnamed: 0,worker,text,toxicity
0,313,This article sucks \n\nwoo woo wooooooo,0
1,188,"""And yes, people should recognize that but the...",0
2,82,"Western Media?\n\nYup, because every crime in...",0
3,347,And you removed it! You numbskull! I don't car...,0
4,539,smelly vagina \n\nBluerasberry why don't you ...,0


**More toxic data**

In [11]:
# More toxic side of every comparison: drop the other comment and rename the
# column to "text" so that both sides can be concatenated afterwards
more_toxic_data = (
    train
    .drop(columns=['less_toxic'])
    .rename(columns={"more_toxic": "text"})
)
# toxicity = 1: the worker judged this comment to be the more toxic one of the pair
more_toxic_data['toxicity'] = 1

In [12]:
# Preview the more toxic comments with their 1 label
more_toxic_data.head(5)

Unnamed: 0,worker,text,toxicity
0,313,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,1
1,188,Daphne Guinness \n\nTop of the mornin' my fav...,1
2,82,"""Atom you don't believe actual photos of mastu...",1
3,347,You seem to have sand in your vagina.\n\nMight...,1
4,539,"hey \n\nway to support nazis, you racist",1


**Concatenate**

In [13]:
# Number of rows the concatenated frame is expected to have
len(more_toxic_data) + len(less_toxic_data)

60216

In [14]:
# Stack the two frames on top of each other (row-wise, the default axis);
# the original row labels are kept, so every label appears twice
workers_comments = pd.concat([less_toxic_data, more_toxic_data])
workers_comments.head()

Unnamed: 0,worker,text,toxicity
0,313,This article sucks \n\nwoo woo wooooooo,0
1,188,"""And yes, people should recognize that but the...",0
2,82,"Western Media?\n\nYup, because every crime in...",0
3,347,And you removed it! You numbskull! I don't car...,0
4,539,smelly vagina \n\nBluerasberry why don't you ...,0


In [15]:
# The row count must match the expected total computed before the concatenation
len(workers_comments)

60216

### Aggregation by comment

In [16]:
# One row per distinct comment text; its toxicity is the mean of its 0/1 labels,
# i.e. the share of comparisons in which it was judged the more toxic comment
aggregated_comments = (
    workers_comments
    .groupby('text')
    .agg({'toxicity': 'mean'})
    .reset_index()
)
aggregated_comments.head()

Unnamed: 0,text,toxicity
0,\n\nThe comment directly above this one are fr...,0.666667
1,\n\nwhy should people have to read crap posted...,0.888889
2,\nGo F yourself you cottonheadednittymuggins.,0.666667
3,"\nU POUR ADMIN, U UPDATE VANDAL COUNT, WHILE R...",0.0
4,\n\n koreans claim \n\ni see youve seen the ...,0.333333


In [17]:
# In total we have 14251 distinct comments (one row per unique text)
aggregated_comments.shape

(14251, 2)

In [18]:
# In total we have 39 degrees of toxicity: each value is the mean of the 0/1 labels
# a comment received, so it always lies between 0 and 1
np.unique(aggregated_comments['toxicity'])

array([0.        , 0.08333333, 0.1       , 0.11111111, 0.13333333,
       0.14285714, 0.16666667, 0.2       , 0.22222222, 0.25      ,
       0.26666667, 0.27777778, 0.28571429, 0.33333333, 0.38888889,
       0.4       , 0.41666667, 0.42857143, 0.44444444, 0.46153846,
       0.46666667, 0.5       , 0.53333333, 0.55555556, 0.57142857,
       0.58333333, 0.6       , 0.66666667, 0.71428571, 0.73333333,
       0.75      , 0.77777778, 0.8       , 0.83333333, 0.85714286,
       0.88888889, 0.91666667, 0.93333333, 1.        ])

### Cleaning the data

In [23]:
# Add a cleaned version of every aggregated comment in a new 'clean_text' column.
# NOTE(review): the saved output of this cell shows a `score` column that no visible
# cell creates (execution counts jump from 18 to 23) -- it presumably comes from a
# deleted cell; restart the kernel and re-run to refresh the output.
aggregated_comments['clean_text'] = aggregated_comments['text'].apply(cleaning_text)
aggregated_comments.head(5)

Unnamed: 0,text,toxicity,score,clean_text
0,\n\nThe comment directly above this one are fr...,0.666667,1,comment directly one strange detractor mine cp...
1,\n\nwhy should people have to read crap posted...,0.888889,1,people read crap posted idiot like
2,\nGo F yourself you cottonheadednittymuggins.,0.666667,1,go f cottonheadednittymuggins
3,"\nU POUR ADMIN, U UPDATE VANDAL COUNT, WHILE R...",0.0,0,u pour admin u update vandal count real admin ...
4,\n\n koreans claim \n\ni see youve seen the ...,0.333333,0,korean claim see youve seen crap claim also


## Training on the train dataset

**Some preparation**

In [29]:
# Hold out 20% of the aggregated comments to evaluate the model fitted on the
# remaining 80%; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    aggregated_comments['clean_text'],
    aggregated_comments['toxicity'],
    test_size=0.2,
    random_state=1,
)

In [30]:
# Wrap every part of the split in a DataFrame: the following cells select the
# columns by name (X_train['clean_text'], y_train['toxicity'], ...)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [32]:
# min_df=5: ignore the words that appear in fewer than 5 comments (document frequency)
# max_features=1000: of the remaining words, keep the 1000 with the highest term
# frequency across the whole corpus (the ranking is by frequency, not by TF-IDF weight)
vectorizer = TfidfVectorizer(min_df = 5, max_features=1000)

# The vocabulary and the IDF weights are learned on the training comments only
X_train_tfidf = vectorizer.fit_transform(X_train['clean_text'])

**Building the model**

In [112]:
# Ridge regression on the TF-IDF features
model = Ridge(
    alpha=10,          # regularization strength
    positive=True,     # force non-negative coefficients (only the 'lbfgs' solver supports this)
    solver='lbfgs',
    tol=1e-5,
    random_state=1,
)
model.fit(X_train_tfidf, y_train['toxicity'])

Ridge(alpha=10, positive=True, random_state=1, solver='lbfgs', tol=1e-05)

In [113]:
# Predicting on the test set
# `transform` (not `fit_transform`) reuses the vocabulary fitted on the training comments
X_test_tfidf = vectorizer.transform(X_test['clean_text'])
pred = model.predict(X_test_tfidf)

In [114]:
# Root mean squared error on the hold-out set.
# The `squared=False` argument of mean_squared_error is deprecated since scikit-learn 1.4
# and removed in 1.6, so the square root is taken explicitly; this works on every version.
rms = mean_squared_error(y_test['toxicity'], pred) ** 0.5
print(rms)

0.3402362507167969


**Prediction**

In [43]:
# Doing it for the whole training dataset
# NOTE(review): this refits the SAME `vectorizer` and `model` objects in place, so after
# this cell they no longer match X_train_tfidf / X_test_tfidf; re-running the evaluation
# cells afterwards would score a model that has already seen the hold-out comments.
X_tfidf = vectorizer.fit_transform(aggregated_comments['clean_text'])
# Fitting the model
model.fit(X_tfidf, aggregated_comments['toxicity'])
# Predicting on the comments to score
test = vectorizer.transform(comments_to_score['clean_text'])
# `pred` is overwritten here: it now holds the scores of the comments to score,
# no longer the hold-out predictions
pred = model.predict(test)

In [44]:
# Inspect the predicted scores
pred

array([0.33505521, 0.20593712, 0.37425886, ..., 0.33445126, 0.69285848,
       0.48514273])

**Submission file**

In [45]:
# Exporting the submission file
results = pd.DataFrame(comments_to_score["comment_id"])
results["score"] = pred

In [46]:
# Preview the submission table
results.head(5)

Unnamed: 0,comment_id,score
0,114890,0.335055
1,732895,0.205937
2,1139051,0.374259
3,1434512,0.471982
4,2084821,0.595697


In [47]:
# Write the submission file; index=False keeps the row index out of the CSV
results.to_csv("submission.csv", index=False)