# Imports and Data Reading

In [12]:
import numpy as np
import pandas as pd
import os

import torch
import torchtext
from torchtext.legacy import data
from torch.utils.data import DataLoader

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data.utils import ngrams_iterator

import spacy # I'll be making use of spacy for text preprocessing
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import string, re

from torch.autograd import Variable
import time
import copy
from torch.optim import lr_scheduler
from torch.utils.data.dataset import random_split

from sklearn.model_selection import train_test_split
from torchtext.vocab import Vectors, GloVe
from matplotlib.pyplot import plot, hist, xlabel, legend
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns

from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

import logging
import warnings
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt='%H:%M:%S', level=logging.INFO)
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('/content/dataset/train.csv')
test = pd.read_csv('/content/dataset/test.csv')
sub = pd.read_csv('/content/dataset/submission.csv')

train.drop_duplicates(subset=['comment_text'], inplace=True)
print('Training Set Shape = {}'.format(train.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(train.memory_usage().sum() / 1024**2))
print('Test Set Shape = {}'.format(test.shape))
print('Test Set Memory Usage = {:.2f} MB'.format(test.memory_usage().sum() / 1024**2))
print('\n\tFirst five rows of our data:\n')
print(train.head())

Training Set Shape = (34581, 3)
Training Set Memory Usage = 1.06 MB
Test Set Shape = (9194, 2)
Test Set Memory Usage = 0.14 MB

	First five rows of our data:

   id                                       comment_text  toxicity
0   0                 fuck you you self righteous creep          3
1   1   stop stop the goddam vandalism or there ll be...         2
2   2  i agree rt does have a few shortcomings  but i...         0
3   3  if you would like verfiability here is the lin...         0
4   4  do you think there s consensus for me to be on...         0


### Missing Values

In [3]:
train = train.dropna()

In [4]:
train = train.rename(columns={"comment_text": "text", "toxicity": "target"})
test = test.rename(columns={"comment_text": "text"})

In [5]:
test

Unnamed: 0,id,text
0,34647,oh that great repository of free cultural work...
1,34648,my rfa with apologies for the impersonal awb ...
2,34649,it looks like a number of articles you created...
3,34650,oh but i see you ve been block for other s...
4,34651,accord of the discussion in mariah carey compo...
...,...,...
9189,43836,atat rk you cannot escape atat rk s racial s...
9190,43837,irresponsible dumheads each and every image h...
9191,43838,i agrre with above and i checked and in shia s...
9192,43839,i think there should be some form of screening...


# Model Building 

In [6]:
df_train = train['text']
df_test = test['text']

In [7]:
# Text preprocessing with spacy

def clean_text(text):
    return text.strip().lower()

punctuations = string.punctuation
nlp = spacy.load('en')
stop_words = STOP_WORDS
parser = English()

def spacy_preprocessor(sentence):
    """Tokenize, Lemmatize, Remove Stopwords"""
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    result = ' '.join(mytokens)
    return result

df_train = df_train.apply(lambda x: spacy_preprocessor(x))
df_test = df_test.apply(lambda x: spacy_preprocessor(x))

In [8]:
train['text'] = df_train
test['text'] = df_test

In [9]:
X = train[[col for col in train.columns if not col == 'target']]
y = train['target']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=20)

train_data = pd.concat([X_train, y_train], axis=1)
valid_data = pd.concat([X_valid, y_valid], axis=1)

# save them
!mkdir preprocessed_data
train_data.to_csv('preprocessed_data/train.csv', index=False)
valid_data.to_csv('preprocessed_data/valid.csv', index=False)
test.to_csv('preprocessed_data/test.csv', index=False)

In [10]:
is_cuda = torch.cuda.is_available()
print(f"Cuda Status on the system is {is_cuda}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Cuda Status on the system is True


In [13]:
# choose a fixed length to process the string
fix_length = 17
text = data.Field(tokenize="spacy",
                  pad_first=True)

# define train, validation and test sets
train_data = data.TabularDataset(path="preprocessed_data/train.csv",
                                 format="csv",
                                 fields=[
                                         ('id', data.Field()),
                                         ('text', text),
                                         ('target', data.Field())],
                                 skip_header=True)

valid_data = data.TabularDataset(path="preprocessed_data/valid.csv",
                                 format="csv",
                                 fields=[
                                         ('id', data.Field()),
                                         ('text', text),
                                         ('target', data.Field())],
                                 skip_header=True)

test_data = data.TabularDataset(path="preprocessed_data/test.csv",
                                format="csv",
                                fields=[
                                        ('id', data.Field()),
                                        ('text', text),
                                        ],
                                skip_header=True)

text.build_vocab(train_data, valid_data)

In [14]:
VOCAB_SIZE = len(text.vocab)
NGRAMS = 2
BATCH_SIZE = 8
EMBED_DIM = 32
NUM_CLASS = 6

In [15]:
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        # initialize the weights
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [16]:
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)

In [17]:
def generate_batch(batch):
    label = torch.tensor([int(entry.target[0]) for entry in batch])
    _text = []
    for entry in batch:
        _entry = []
        for t in entry.text:
            _entry.append(text.vocab.stoi[t])
        _text.append(torch.tensor(_entry,dtype=torch.long))
    offsets = [0] + [len(entry) for entry in _text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    _text = torch.cat(_text)
    return _text, offsets, label

In [18]:
def train_func(sub_train_):
    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()
    # Adjust the learning rate
    scheduler.step()
    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()
    return loss / len(data_), acc / len(data_)

In [19]:
N_EPOCHS = 20
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(train_data) * 0.95)
sub_train_, sub_valid_ = random_split(train_data, [train_len, len(train_data) - train_len])

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)
    
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60
    
    print(f'Epoch: {epoch + 1}, | time in {mins} minutes and {secs} seconds')
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1, | time in 0.08333333333333333 minutes and 5 seconds
	Loss: 0.1280(train)	|	Acc: 59.7%(train)
	Loss: 0.0013(valid)	|	Acc: 63.3%(valid)
Epoch: 2, | time in 0.08333333333333333 minutes and 5 seconds
	Loss: 0.1090(train)	|	Acc: 65.6%(train)
	Loss: 0.0010(valid)	|	Acc: 64.7%(valid)
Epoch: 3, | time in 0.08333333333333333 minutes and 5 seconds
	Loss: 0.1002(train)	|	Acc: 68.6%(train)
	Loss: 0.0010(valid)	|	Acc: 63.8%(valid)
Epoch: 4, | time in 0.08333333333333333 minutes and 5 seconds
	Loss: 0.0931(train)	|	Acc: 71.1%(train)
	Loss: 0.0013(valid)	|	Acc: 61.8%(valid)
Epoch: 5, | time in 0.08333333333333333 minutes and 5 seconds
	Loss: 0.0868(train)	|	Acc: 73.4%(train)
	Loss: 0.0012(valid)	|	Acc: 63.9%(valid)
Epoch: 6, | time in 0.08333333333333333 minutes and 5 seconds
	Loss: 0.0815(train)	|	Acc: 75.9%(train)
	Loss: 0.0011(valid)	|	Acc: 63.7%(valid)
Epoch: 7, | time in 0.08333333333333333 minutes and 5 seconds
	Loss: 0.0764(train)	|	Acc: 77.6%(train)
	Loss: 0.0009(valid)	|	Acc: 62.9%

In [20]:
def predict(_text, model, vocab, ngrams):
    if len(_text) == 0:
        return 0
    with torch.no_grad():
        _text = [vocab.stoi[token] for token in ngrams_iterator(_text, ngrams)]
        output = model(torch.tensor(_text), torch.tensor([0]))
        return output.argmax(1).item()

model = model.to('cpu')
predictions = [predict(entry.text, model, text.vocab, NGRAMS) for entry in test_data]
tweet_id = [entry.id[0] for entry in test_data]

In [21]:
output = pd.DataFrame({'id': tweet_id, 'target': predictions})
output.to_csv('my_submission.csv', index=False)