### [Kaggle Toxic Comment Classification](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

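Identify and classify toxic online comments. Each comment can carry any combination of six labels: `toxic`, `severe_toxic`, `obscene`, `threat`, `insult`, and `identity_hate`.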

In [51]:
import random, re, string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import spacy
from torchtext import data
from torchtext.vocab import GloVe

# Seed Python, NumPy and PyTorch with the same value so runs are repeatable.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

%matplotlib inline

# Data Preprocessing

In [3]:
# # Read Training and Test Data
# train = pd.read_csv('./toxic-comment/raw_train.csv')
# test = pd.read_csv('./toxic-comment/raw_test.csv')

# train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [9]:
# # concat individual labels into single label
# def process_label(df):
#     label = []
#     for name in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
#         label.append(df[name])
        
#     return np.array(label)

# train['label'] = train.apply(process_label, axis=1)
# test['label'] = test.apply(process_label, axis=1)

# train = train[['comment_text', 'label']]
# test = test[['comment_text', 'label']]

# train.head()

Unnamed: 0,comment_text,label
0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


In [10]:
# # save to disk
# # NOTE(review): the dataset cell further down loads raw_train.csv / raw_test.csv,
# # not the train.csv / test.csv written here -- confirm whether these files are still needed.
# train.to_csv('./toxic-comment/train.csv', index=False)
# test.to_csv('./toxic-comment/test.csv', index=False)

# Create Dataset and Dataloader

In [122]:
# define custom tokenizer
# ref: https://towardsdatascience.com/use-torchtext-to-load-nlp-datasets-part-i-5da6f1c89d84
# ref: https://zhuanlan.zhihu.com/p/34722385
# Load the spaCy English pipeline once; tokenizer() below only uses NLP.tokenizer.
# NOTE(review): 'en' is a spaCy 2.x shortcut name -- confirm the installed spaCy version
# still resolves it (newer releases expect a full package name such as 'en_core_web_sm').
NLP = spacy.load('en')

# Characters treated as noise: every match is replaced by a single space.
_NOISE_CHARS = re.compile(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;.#$%&;@%!,?]")
# Runs of spaces left behind by the substitution above.
_SPACE_RUNS = re.compile(r"[ ]+")


def tokenizer(comment):
    """Clean a raw comment and split it into spaCy tokens.

    Punctuation and other noise characters are replaced by spaces, runs of
    spaces are collapsed, and the result is passed to the module-level
    ``NLP.tokenizer``.

    Args:
        comment: Raw comment text. Non-string values are coerced with ``str()``.

    Returns:
        list[str]: Token texts in order, with whitespace-only tokens removed.
    """
    # "!", "," and "?" are already part of the noise class, so one pass is enough;
    # the separate substitutions for them could never match anything.
    cleaned = _NOISE_CHARS.sub(" ", str(comment))
    cleaned = _SPACE_RUNS.sub(" ", cleaned)

    # Drop every whitespace-only token (tabs, carriage returns, ...), not just a
    # literal single space, so they do not end up as vocabulary entries.
    return [tok.text for tok in NLP.tokenizer(cleaned) if not tok.is_space]

In [123]:
# define data fields
# Comment text: split by the custom tokenizer, lower-cased, padded at the front,
# and batched with the batch dimension first.
TEXT = data.Field(
    sequential=True,
    tokenize=tokenizer,
    lower=True,
    pad_first=True,
    batch_first=True,
    is_target=False,
)

# Label columns: single non-sequential values used without a vocabulary lookup.
LABELS = data.Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
    is_target=True,
)

In [124]:
# create dataset
# Column layout of the raw CSVs: 'id' is mapped to None (not loaded), the comment
# goes through TEXT, and each of the six toxicity flags shares the LABELS field.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
fields = [('id', None), ('comment_text', TEXT)]
fields += [(column, LABELS) for column in label_columns]

# Load the labelled data, then keep 80% for training and the rest for validation.
full_train = data.TabularDataset(path='./toxic-comment/raw_train.csv', format='CSV',
                                 fields=fields, skip_header=True)
train, val = full_train.split(split_ratio=0.8)

test = data.TabularDataset(path='./toxic-comment/raw_test.csv', format='CSV',
                           fields=fields, skip_header=True)

In [125]:
# build Vocab
# The vocabulary is collected from all three splits and paired with
# 100-dimensional GloVe 6B vectors.
# NOTE(review): val/test tokens end up in the vocabulary too -- confirm this is intended.
glove_vectors = GloVe(name='6B', dim=100)
TEXT.build_vocab(train, val, test, vectors=glove_vectors)

In [126]:
# build iterator
def _comment_length(example):
    """Sort key used for bucketing: length of the example's comment_text."""
    return len(example.comment_text)


# Train/val iterators: batches of 64, bucketed by comment length, shuffled,
# and not sorted inside each batch.
train_iter, val_iter = data.BucketIterator.splits(
    (train, val),
    batch_sizes=(64, 64),
    sort_key=_comment_length,
    shuffle=True,
    sort_within_batch=False,
    repeat=False,
)

# Test iterator: batches of 64 with shuffling and sorting both switched off.
test_iter = data.Iterator(
    test,
    batch_size=64,
    train=False,
    shuffle=False,
    sort=False,
    repeat=False,
)

In [127]:
# Pull a single batch to inspect its structure. next(iter(...)) raises on an empty
# iterator, whereas a for/break loop would silently leave `vals` unset (or stale
# from an earlier run of this cell).
vals = next(iter(train_iter))

In [134]:
# LABELS is a non-sequential field, so each label column of a 64-example batch is a 1-D tensor.
vals.toxic.shape

torch.Size([64])

# Build Model

# Model Training and Evaluation