In [390]:
import youtokentome as yttm

import torch
from torch import nn

import torch.nn.functional as F

import pandas as pd

In [391]:
with open('train2.csv', 'r') as data_file:
    with open('train.csv', 'a') as output_file:
        for line in data_file:
            output_file.write(" ".join(line.split()).lower())

In [392]:
train_data_path = "train.csv"
model_path = "example.model"

# Training model
a = yttm.BPE.train(data=train_data_path, vocab_size=5000, model=model_path)

# Loading model
bpe = yttm.BPE(model=model_path)

Training parameters
  input: train.csv
  model: example.model
  vocab_size: 5000
  n_threads: 8
  character_coverage: 1
  pad: 0
  unk: 1
  bos: 2
  eos: 3

reading file...
learning bpe...
number of unique characters in the training data: 61
number of deleted characters: 0
number of unique characters left: 61
id: 1000=455+174              freq: 608         subword: ▁silent=▁sil+ent
id: 2000=95+528               freq: 208         subword: asure=as+ure
id: 3000=77+2363              freq: 112         subword: ▁funn=▁f+unn
id: 4000=389+20               freq: 72          subword: ▁unm=▁un+m
model saved to: example.model


In [393]:
df = pd.read_csv("train_toxicity.csv")
df.head()

Unnamed: 0,id,comment_text,toxic
0,e617e2489abe9bca,"""\r\n\r\n A barnstar for you! \r\n\r\n The De...",0
1,9250cf637294e09d,"""\r\n\r\nThis seems unbalanced. whatever I ha...",0
2,ce1aa4592d5240ca,"Marya Dzmitruk was born in Minsk, Belarus in M...",0
3,48105766ff7f075b,"""\r\n\r\nTalkback\r\n\r\n Dear Celestia... """,0
4,0543d4f82e5470b6,New Categories \r\n\r\nI honestly think that w...,0


In [399]:
toxic_count = 0
for condition in df["toxic"]:
    toxic_count += condition
print("Dataset size %.3d \nToxic comments amount %.3d" % (len(df), toxic_count))

Dataset size 5000 
Toxic comments amount 437


In [400]:
from sklearn.model_selection import train_test_split

train, valid = train_test_split(df, test_size=0.2)
train, test = train_test_split(train, test_size=0.25)

In [560]:
N = 100

def encode(q):
    text = bpe.encode(q, output_type=yttm.OutputType.ID)
    
    enc = np.zeros(N, dtype=np.int32)
    l = min(N, len(text))
    enc[N-l:] = text[:l]
    text = enc[:]
        
    return np.array(text)

In [561]:
import numpy as np
from torch.utils.data import Dataset


class QuoraDataset(Dataset):
    def __init__(self, df):
        self.text = [encode(" ".join(q.split()).lower()) for q in df["comment_text"]]
        self.y = list(df["toxic"])
        
    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        x = self.text[idx]
        y = self.y[idx]
        return x, np.array([y])

In [562]:
def collate():
    return torch.Tensor

In [563]:
train_ds = QuoraDataset(train)
valid_ds = QuoraDataset(valid)
test_ds = QuoraDataset(test)

In [564]:
from torch.utils.data import DataLoader

batch_size = 64
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=True)

In [739]:
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(TextClassificationModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=2)
        self.linear = nn.Linear(2*hidden_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x_emb = self.embeddings(x)
        x_drop = self.dropout(x_emb)
        out_pack, (ht, ct) = self.lstm(x_drop)
        ht = torch.cat((ht[-2, :, :], ht[-1, :, :]), dim=1)
        out = self.linear(ht)
        out = self.dropout(out)
        return self.linear2(F.relu(out))

In [740]:
def val_metrics(model, dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, y in dl:
        y = y.float()
        y_model = model(x)
        loss = F.binary_cross_entropy_with_logits(y_model, y)
        y_pred = y_model > 0
        correct += (y_pred.float() == y).float().sum()
        total += batch_size
        sum_loss += loss.item()
    return sum_loss/total, correct/total

In [741]:
def train_epocs(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y in train_dl:
            y = y.float()
            y_pred = model(x)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()
            total += batch_size
        val_loss, val_acc = val_metrics(model, valid_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [742]:
model = TextClassificationModel(5000, 50, 50)

In [743]:
train_epocs(model, epochs=50, lr=0.01)

train loss 0.004 val loss 0.004 and val accuracy 0.890
train loss 0.002 val loss 0.005 and val accuracy 0.903
train loss 0.001 val loss 0.009 and val accuracy 0.910
train loss 0.001 val loss 0.006 and val accuracy 0.908
train loss 0.001 val loss 0.008 and val accuracy 0.910
train loss 0.000 val loss 0.010 and val accuracy 0.914
train loss 0.000 val loss 0.010 and val accuracy 0.913
train loss 0.000 val loss 0.007 and val accuracy 0.907
train loss 0.000 val loss 0.009 and val accuracy 0.911
train loss 0.000 val loss 0.010 and val accuracy 0.914


In [744]:
test_loss, test_acc = val_metrics(model, test_dl)
print("test loss %.3f and test accuracy %.3f" % (test_loss, test_acc))

test loss 0.009 and test accuracy 0.922
