**Jigsaw Rate Severity of Toxic Comments**

Raquel Alcaraz

Lucie Bertiere

## Libraries

In [None]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from collections import Counter


from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer

## Importing the data

In [None]:
# Load the comments that have to be scored for the submission
comments_to_score = pd.read_csv(
    filepath_or_buffer="/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"
)

In [None]:
# Preview the first five rows (head() defaults to n=5)
comments_to_score.head()

## Cleaning the data

### Function

In [None]:
# Fetch the NLTK resources needed by the cleaning step below.
nltk.download('punkt')  # tokenizer models; presumably what word_tokenize relies on
nltk.download('stopwords')  # corpus read by stopwords.words('english')
nltk.download('wordnet')  # presumably the lexical database behind WordNetLemmatizer

In [None]:
# English stop words: very frequent words that carry little information and
# are therefore dropped from the cleaned text. Stored as a set so that the
# membership test done for every token in cleaning_text is fast.
stop_words = set(stopwords.words('english'))
# Lemmatizer used to reduce each token to its base form (e.g. "rocks" -> "rock").
lem = WordNetLemmatizer()


# Patterns are compiled once, at definition time, instead of on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_HASHTAG_RE = re.compile(r'#\w+')
_MENTION_RE = re.compile(r'@\w+')
_DIGITS_RE = re.compile(r'\d+')
# \d is already covered by \w, so it does not need to be listed separately.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Order matters: hashtags and mentions must be removed BEFORE digits,
# otherwise "#2pac" is first turned into "# pac" and the tag text survives.
_NOISE_PATTERNS = (
    _URL_RE,
    _HTML_TAG_RE,
    _HASHTAG_RE,
    _MENTION_RE,
    _DIGITS_RE,
    _PUNCTUATION_RE,
)


def cleaning_text(text):
    """Normalize a raw comment into a space-separated string of lemmas.

    Steps: lower-casing, removal of URLs, HTML tags, hashtags, mentions,
    digits and punctuation, whitespace collapsing, tokenization, stop-word
    removal and lemmatization.

    Parameters
    ----------
    text : str
        Raw comment. Any non-string value (e.g. ``None`` or the ``NaN`` that
        pandas produces for an empty cell) is treated as an empty comment.

    Returns
    -------
    str
        The cleaned comment; ``""`` when nothing is left after cleaning.
    """
    # Missing values would otherwise crash on .lower()
    if not isinstance(text, str):
        return ""

    # to put everything in lower case (I becomes i)
    text = text.lower()

    # each noisy span is replaced by a space so neighbouring words stay apart
    for pattern in _NOISE_PATTERNS:
        text = pattern.sub(' ', text)

    # to remove extra white space
    text = _WHITESPACE_RE.sub(' ', text).strip()
    if not text:
        # nothing to tokenize
        return ""

    # tokenize, drop stop words, then lemmatize what remains
    tokens = word_tokenize(text)
    return " ".join(lem.lemmatize(tok) for tok in tokens if tok not in stop_words)

### Comments to score

In [None]:
# Clean every comment to score with the shared cleaning function
comments_to_score['clean_text'] = comments_to_score['text'].map(cleaning_text)

In [None]:
# Check the new clean_text column on the first five rows
comments_to_score.head()

## Idea 2

Using the data from a previous Jigsaw competition: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/

In [None]:
# Load the three files of the "Toxic Comment Classification Challenge":
# labelled training comments, test labels and test comments.
train2 = pd.read_csv(
    filepath_or_buffer="/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv"
)
test_labels = pd.read_csv(
    filepath_or_buffer="/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv"
)
test2 = pd.read_csv(
    filepath_or_buffer="/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv"
)

In [None]:
# Preview the first ten training rows
train2.iloc[:10]

# Each label column (toxic, severe toxic, etc.) is coded as 1 when the comment belongs to that category

In [None]:
# Rows labelled toxic; the first element of .shape is the number of such comments
nb_toxic = train2.loc[train2['toxic'] == 1]
nb_toxic.shape

In [None]:
# Rows labelled severe toxic; the first element of .shape is the number of such comments
nb_severe_toxic = train2.loc[train2['severe_toxic'] == 1]
nb_severe_toxic.shape

In [None]:
# Rows labelled obscene; the first element of .shape is the number of such comments
nb_obscene = train2.loc[train2['obscene'] == 1]
nb_obscene.shape

In [None]:
# Rows labelled threat; the first element of .shape is the number of such comments
nb_threat = train2.loc[train2['threat'] == 1]
nb_threat.shape

In [None]:
# Rows labelled insult; the first element of .shape is the number of such comments
nb_insult = train2.loc[train2['insult'] == 1]
nb_insult.shape

In [None]:
# Rows labelled identity hate; the first element of .shape is the number of such comments
nb_id_hate = train2.loc[train2['identity_hate'] == 1]
nb_id_hate.shape

In [None]:
# Preview the first five rows of the test labels
test_labels.head(5)

In [None]:
# Preview the first five rows of the test comments
test2.head(5)

Cleaning data

In [None]:
# Clean every training comment with the shared cleaning function
train2['clean_text'] = train2['comment_text'].map(cleaning_text)
train2.head()

In [None]:
# Display the observations whose severe_toxic label equals 1
train2.loc[train2['severe_toxic'] == 1]

# We observe that every severe toxic comment is also labelled toxic: both columns are equal to 1 for these rows

In [None]:
# Clean every test comment with the shared cleaning function
test2['clean_text'] = test2['comment_text'].map(cleaning_text)
test2.head()

### LR

In [None]:
# New variable: the severity score, a weighted sum of the six category labels.
# Severe toxicity, threats and identity hate count three times as much as
# the other categories.
train2['score'] = (
    train2['toxic']
    + 3 * train2['severe_toxic']
    + train2['obscene']
    + 3 * train2['threat']
    + train2['insult']
    + 3 * train2['identity_hate']
)
train2.head()

In [None]:
# Keep only the model input (clean text) and the target (score)
train2 = train2.loc[:, ["clean_text", "score"]]
train2

In [None]:
# Number of training comments
train2.shape[0]

In [None]:
# Features (cleaned text) and target (severity score)
X, Y = train2['clean_text'], train2['score']

## TF-IDF

In [None]:
# Character-level TF-IDF: with analyzer='char_wb', ngram_range=(1, 2) selects
# character unigrams and bigrams built inside word boundaries (not word n-grams).
# min_df=5 ignores n-grams that appear in fewer than 5 comments.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 2),
    min_df=5,
    smooth_idf=True,
)

# Learn the vocabulary on the training comments and vectorize them
X_train_tfidf = vectorizer.fit_transform(X)

### Model

In [None]:
# Linear regression of the severity score on the TF-IDF features.
# Y is the same Series as train2['score'] (defined together with X).
model = LinearRegression()
model.fit(X_train_tfidf, Y)

In [None]:
# Vectorize the comments to score with the vectorizer fitted on the
# training comments, then predict their severity score.
comments_to_score_tfidf = vectorizer.transform(comments_to_score.loc[:, 'clean_text'])
pred = model.predict(X=comments_to_score_tfidf)

print(pred)

### Submission

In [None]:
# Build the submission table: one row per comment id with its predicted score
results = comments_to_score[["comment_id"]].assign(score=pred)

In [None]:
# Preview the first five rows of the submission
results.head()

In [None]:
# Write the submission file without the index column
results.to_csv(path_or_buf="submission.csv", index=False)