In [189]:
from nltk.corpus import stopwords
import nltk
import pandas as pd
import numpy as np
import itertools
from nltk.stem import WordNetLemmatizer
import torch
from torch import optim
from torch import nn
from tqdm import tqdm, trange
import gensim.downloader
from gensim.models import Word2Vec
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split 
import re
from sklearn.metrics import accuracy_score

In [190]:
class Dataset(torch.utils.data.Dataset):
  # Characterizes a dataset for PyTorch
  def __init__(self, list_IDs, labels):
        'Initialization'
        self.labels = labels
        self.list_IDs = list_IDs

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_IDs)

  def __getitem__(self, index):
        'Generates one sample of data'
        
        X = self.list_IDs[index]
        y = self.labels[index]

        return X.float(), y.float()
    
def predict(dataloader, model):
    model.eval()
    predictions = np.array([])
    for x_batch, _ in dataloader:
        
        preds = model(x_batch)
        predictions = np.hstack((predictions, preds.detach().numpy().flatten()))
    return predictions.flatten()

In [191]:
stop_words = set(stopwords.words('english'))

In [192]:
df = pd.read_csv("sentiment.csv", index_col="id")
df

Unnamed: 0_level_0,comment_text,toxicity
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,fuck you you self righteous creep,3
1,stop stop the goddam vandalism or there ll be...,2
2,i agree rt does have a few shortcomings but i...,0
3,if you would like verfiability here is the lin...,0
4,do you think there s consensus for me to be on...,0
...,...,...
34642,huy i am kyle robbins i think albert pujols is...,4
34643,unlike the ancient greeks the idiot who bloc...,4
34644,no it isn t so if you wish to delete my accou...,1
34645,you haven t been paying attention i don t c...,1


In [193]:
np.mean([len(str(df["comment_text"][i]).split()) for i in range(0, len(df))])

59.62501803907986

In [194]:
def data_preporation_and_corpus(data: np.array, max_legth: int):
    data_prepeared = []
    lemmatizer = WordNetLemmatizer()
    for index in trange(0, len(data)):
        line = data[index]
        line = str(line)
        line = line.lower()
        splited = re.split(" ", line)
        splited = np.array([i for i in splited if (i not in stop_words) and (i not in '')])
        for element in range(0, len(splited)):
            splited[element] = lemmatizer.lemmatize(splited[element])
        
        data_prepeared.append(splited)
        
    data_prepeared_pd = pd.DataFrame(data_prepeared)
    data_prepeared_pd.fillna('@None', inplace=True)
    data_prepeared_pd = data_prepeared_pd.iloc[:, :max_legth]
    data_prepeared_pd = data_prepeared_pd.to_numpy()
    return data_prepeared_pd
 
def word2vec_tranform(data, model):
    data_null = np.zeros((data.shape[0], data.shape[1], 100))
    
    for column in trange(0, data.shape[0]):
        for index in range(0, data.shape[1]):
            try: word_embedding = model.get_vector(data[column][index])
            except: word_embedding = model.get_vector('@None')
            for element in range(0, word_embedding.shape[0]):
                
                data_null[column][index][element] = word_embedding[element]  

    return data_null

def to_torch_transformation(data:np.array):
    data = torch.from_numpy(data)
    return data
    
def create_vocab(data):
    main = []
    for line in data:
        main.append(line.tolist())
        
    return list(set(list(itertools.chain.from_iterable(main))))

In [195]:
data_prepeared = data_preporation_and_corpus(df['comment_text'].to_numpy(), 60)
data_prepeared

100%|███████████████████████████████████████████████| 34647/34647 [00:03<00:00, 10276.12it/s]


array([['fuck', 'self', 'righteous', ..., '@None', '@None', '@None'],
       ['stop', 'stop', 'goddam', ..., '@None', '@None', '@None'],
       ['agree', 'rt', 'shortcoming', ..., 'choice', 'avoiding',
        'nonsense'],
       ...,
       ['wish', 'delete', 'account', ..., '@None', '@None', '@None'],
       ['paying', 'attention', 'compromise', ..., '@None', '@None',
        '@None'],
       ['racist', 'piece', 'dirt', ..., '@None', '@None', '@None']],
      dtype=object)

In [196]:
vocab = create_vocab(data_prepeared)

In [197]:
# glove_vectors = gensim.downloader.load('glove-twitter-25')

In [198]:
w2v_model = Word2Vec(sentences=data_prepeared.tolist(),
                     min_count=1,
                     window=4,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)

In [199]:
%%time

w2v_model.train(data_prepeared.tolist(), total_examples=len(vocab), epochs=30, report_delay=1)

CPU times: user 59.4 s, sys: 329 ms, total: 59.8 s
Wall time: 20.2 s


(17146716, 62364600)

In [200]:
data_prepeared_embedded = word2vec_tranform(data_prepeared, w2v_model.wv)

100%|█████████████████████████████████████████████████| 34647/34647 [00:52<00:00, 656.43it/s]


In [201]:
data_prepeared_embedded.shape

(34647, 60, 100)

In [202]:
data_prepeared_embedded = np.swapaxes(data_prepeared_embedded, 1, 2)

In [41]:
X_train, X_val, y_train, y_val = train_test_split(data_prepeared_embedded, df['toxicity'], test_size=0.0001, random_state=42)

In [42]:
X_train = to_torch_transformation(X_train)
X_val = to_torch_transformation(X_val)
y_train = to_torch_transformation(y_train.to_numpy())
y_val = to_torch_transformation(y_val.to_numpy())

In [43]:
X_train.shape

torch.Size([34643, 100, 60])

In [44]:
training_set = Dataset(X_train, y_train)
val_set = Dataset(X_val, y_val)

In [46]:
batch_size = 64
train_generator = DataLoader(training_set, batch_size)

In [59]:
class CNN1D(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.conv_1 = nn.Conv1d(100, 200, 3, stride=1)
        self.conv_2 = nn.Conv1d(200, 400, 3, stride=1)
        self.conv_3 = nn.Conv1d(400, 800, 3, stride=1)
        
        self.fc1 = nn.Linear(800, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 64)
        self.fc4 = nn.Linear(64, 6)
        
        self.relu = nn.LeakyReLU()
        self.soft = nn.Softmax()
        self.max_pool = nn.AvgPool1d(3)
        
    def forward(self, x):
        
        x = self.max_pool(self.relu(self.conv_1(x)))
        x = self.max_pool(self.relu(self.conv_2(x)))
        x = self.max_pool(self.relu(self.conv_3(x))).flatten(start_dim=1)

        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        x = self.soft(x)
        
        return x.float()
    

In [60]:
model = CNN1D()

lr = 0.0001
EPOCH = 10

In [61]:
optimizer = optim.Adam(model.parameters(), lr)
criterion = nn.CrossEntropyLoss()

In [62]:
for epoch in range(EPOCH):
    
    running_loss = 0.0
    for step, (x, y) in enumerate(train_generator):
        
        optimizer.zero_grad()
        outputs = model(x)

        loss = criterion(outputs, y.type(torch.LongTensor))
        loss.backward()
        optimizer.step()
        
        
        running_loss += loss.item()
        if step % 100 == 99:   
            print(f'[{epoch + 1}, {step + 1:5d}] loss: {running_loss / 200:.3f}')
            running_loss = 0.0


  x = self.soft(x)


[1,   100] loss: 0.805
[1,   200] loss: 0.788
[1,   300] loss: 0.786
[1,   400] loss: 0.788


KeyboardInterrupt: 

In [51]:
output = model(X_val.float())
output = torch.argmax(output, dim=1)
output

  x = self.soft(x)


tensor([0, 1, 1, 0])

In [52]:
accuracy_score(output.numpy(), y_val)

0.5

In [53]:
df_test = pd.read_csv("test.csv", index_col="id")
df_test

Unnamed: 0_level_0,comment_text
id,Unnamed: 1_level_1
34647,oh that great repository of free cultural work...
34648,my rfa with apologies for the impersonal awb ...
34649,it looks like a number of articles you created...
34650,oh but i see you ve been block for other s...
34651,accord of the discussion in mariah carey compo...
...,...
43836,atat rk you cannot escape atat rk s racial s...
43837,irresponsible dumheads each and every image h...
43838,i agrre with above and i checked and in shia s...
43839,i think there should be some form of screening...


In [54]:
test_prepeared = data_preporation_and_corpus(df_test['comment_text'].to_numpy(), 60)
test_prepeared

100%|█████████████████████████████████████████████████| 9194/9194 [00:00<00:00, 12438.38it/s]


array([['oh', 'great', 'repository', ..., '@None', '@None', '@None'],
       ['rfa', 'apology', 'impersonal', ..., 'categorize', 'also',
        'successfully'],
       ['look', 'like', 'number', ..., '@None', '@None', '@None'],
       ...,
       ['agrre', 'checked', 'shia', ..., '@None', '@None', '@None'],
       ['think', 'form', 'screening', ..., '@None', '@None', '@None'],
       ['suck', 'ed', 'admin', ..., '@None', '@None', '@None']],
      dtype=object)

In [55]:
data_test_embedded = word2vec_tranform(test_prepeared, w2v_model.wv)
data_test_embedded = np.swapaxes(data_test_embedded, 1, 2)

100%|███████████████████████████████████████████████████| 9194/9194 [00:13<00:00, 656.86it/s]


In [56]:
X_test = to_torch_transformation(data_test_embedded)

In [57]:
output_test = model(X_test.float())
output_test = torch.argmax(output_test, dim=1)
output_test

  x = self.soft(x)


tensor([0, 0, 0,  ..., 1, 1, 2])

In [58]:
submission = pd.read_csv('submission.csv', index_col='id')

submission['prediction'] = output_test.numpy()

submission.to_csv('my_submission.csv', index_label='id')