## Here we are treating it as a Classification problem.

## The final score is the probability predicted by the Model.

In [1]:
# Importing libraries

import math
import os
import random
import numpy as np
import pandas as pd
import re
import unidecode

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.under_sampling import RandomUnderSampler

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, GRU, Embedding, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Defining constants

voc_size = 50000
max_sequence_length = 250
embedding_dim = 100
Batch_size = 16

train_prev_comp = "jigsaw-toxic-comment-train.csv"
test_cur_comp = "comments_to_score.csv"


def seed_everything():
    np.random.seed(123)
    random.seed(123)
    tf.random.set_seed(123)
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'
    os.environ['PYTHONHASHSEED'] = str(123)

seed_everything()

In [3]:
# Function for cleaning comments

def clean_data(data):
    final = []
    for sent in data:
        sent = sent.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ').replace('. com', '.com')
        soup = BeautifulSoup(sent, "html.parser")
        sent = soup.get_text(separator=" ")
        remove_https = re.sub(r'http\S+', '', sent)
        sent = re.sub(r"\ [A-Za-z]*\.com", " ", remove_https)
        sent = unidecode.unidecode(sent)
        sent = sent.lower()
        sent = re.sub(r"[^a-zA-Z0-9:$-,()%.?!]+", ' ', sent) 
        sent = re.sub(r"[:$-,()%.?!]+", ' ',sent)
        stoplist = stopwords.words("english")
        sent = [word for word in word_tokenize(sent) if word not in stoplist]
        sent = " ".join(sent)
        final.append(sent)
    
    return final

In [4]:
# Reading train file from previous competition

df = pd.read_csv(train_prev_comp)


df["y"] = (df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].sum(axis=1) > 0).astype(int)
df.drop(["id","toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"], axis=1, inplace = True)
df.head()

Unnamed: 0,comment_text,y
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [5]:
# Seeing that dataset is imbalanced

df["y"].value_counts()

0    201081
1     22468
Name: y, dtype: int64

In [6]:
# Balacing dataset

X = np.array(df["comment_text"].values)
X = X.reshape(-1,1)
y = np.array(df["y"].values)
rus = RandomUnderSampler(random_state=0)
x, y = rus.fit_resample(X, y)

x = x.flatten()
df = pd.DataFrame()
df["text"] = x
df["target"] = y


# Now its balanced

df["target"].value_counts()

0    22468
1    22468
Name: target, dtype: int64

In [7]:
# Creating column clean_text for cleaned comments

df["text"] = clean_data(df["text"])
df.head()



Unnamed: 0,text,target
0,tyrrell head nutcase,0
1,sockpuppet template give comment made laugh pu...,0
2,similar arguments made local cultural traditio...,0
3,invitation take part study wikipedian studying...,0
4,hier hast du du bloder affe,0


In [8]:
# Defining keras Model with GRU units

class GRU_model(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.Embedding = Embedding(voc_size, embedding_dim, input_length = max_sequence_length)
        self.GRU1 = Bidirectional(GRU(128, return_sequences=True))
        self.Dropout1 = Dropout(0.25)
        self.GRU2 = Bidirectional(GRU(64, return_sequences = False))
        self.Dropout2 = Dropout(0.25)
        self.Dense1 = Dense(64, activation="relu")
        self.Dropout3 = Dropout(0.2)
        self.Dense2 = Dense(1, activation="sigmoid")
    
    def call(self, inputs):
        x = self.Embedding(inputs)
        x = self.GRU1(x)
        x = self.Dropout1(x)
        x = self.GRU2(x)
        x = self.Dropout2(x)
        x = self.Dense1(x)
        x = self.Dropout3(x)
        x = self.Dense2(x)
        
        return x

In [9]:
# Using early_stopping as callback function 
# It takes the weigths of epoch with the best val_accuracy

early_stopping = EarlyStopping(patience = 5,restore_best_weights = True)

In [10]:
# Tokenizing the comments from train dataset

tokenizer = Tokenizer(num_words = voc_size)
tokenizer.fit_on_texts(df["text"].values)
X = tokenizer.texts_to_sequences(df["text"].values)
X = pad_sequences(X, maxlen = max_sequence_length)

In [11]:
model = GRU_model()
model.compile(
        loss = tf.keras.losses.BinaryCrossentropy(),
        optimizer = "Adam",
        metrics = ["accuracy"]
    )


model.fit(
        X, 
        df.target, 
        epochs = 10, 
        validation_split = 0.2,
        batch_size = Batch_size, 
        callbacks = [early_stopping]
    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.callbacks.History at 0x1d2850af670>

In [12]:
# Reading given test dataset 

test = pd.read_csv(test_cur_comp)

test["text"] = clean_data(test["text"])
x_test = tokenizer.texts_to_sequences(test["text"].values)
x_test = pad_sequences(x_test, maxlen = max_sequence_length)

pred = model.predict(x_test)





In [13]:
# Making submission file

final = pd.DataFrame()
final["comment_id"] = test["comment_id"]
final["score"] = pred
final.to_csv("submission.csv", index=False)

In [14]:
final.head()

Unnamed: 0,comment_id,score
0,114890,0.007382
1,732895,0.020859
2,1139051,0.018533
3,1434512,0.176521
4,2084821,0.84216
