In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

import copy
import traceback
import datetime
import joblib
import re
import os
import random
import string
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torchtext import data
from torchtext import datasets
from torchtext import vocab
from torchtext.vocab import Vectors, GloVe

from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pymorphy2
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
class ConfigExperiment:
    seed = 42
    positive_file = "../data/positive.csv"
    negative_file = "../data/negative.csv"
    test_size = 0.3
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embed_dim = 300
    max_vocab_size = 100_000
    batch_size = 64
    num_epochs = 30
    lr = 1e-2
    num_workers = 0
    patience = 3
    early_stopping_delta = 1e-4
    save_dirname = "models"
    
config = ConfigExperiment()

In [4]:
def init_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic=True
    
init_random_seed(config.seed)

In [5]:
# Формирование train test valid данных

df = pd.read_csv("../data/preprocessed_text_v1.csv", index_col=False)
df.columns = ['text', 'target']
df = df.drop(df[df['text'].map(str) == 'nan'].index)

train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
train.to_csv("../data/train_processed_data.csv", index=False)
validate.to_csv("../data/validate_processed_data.csv", index=False)
test.to_csv("../data/test_processed_data.csv", index=False)

train.shape, validate.shape, test.shape

((135615, 2), (45205, 2), (45205, 2))

In [6]:
tokenize = lambda x: str(x).split()

TEXT = data.Field(sequential=True, tokenize=tokenize,  include_lengths=True)
LABEL = data.LabelField(dtype=torch.float)

fields = [('text',TEXT), ('label', LABEL)]

In [7]:

train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path="../data/",
                                        train="train_processed_data.csv",
                                        validation="validate_processed_data.csv",
                                        test="test_processed_data.csv",
                                        format="csv",
                                        fields=fields,
                                        skip_header=True
)

In [8]:
print(vars(train_data[0]))

{'text': ['блин', 'улица', 'плюсовой', 'температура', 'зря', 'шуба', 'доставать'], 'label': '0'}


In [9]:
print(vars(valid_data[0]))

{'text': ['час', 'спасть'], 'label': '0'}


In [10]:
print(vars(test_data[0]))

{'text': ['шикарный', 'сибирский', 'кот', 'симб', 'справиться', 'быть', 'выпускать', 'жаль'], 'label': '0'}


In [11]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 135615
Number of validation examples: 45205
Number of testing examples: 45205


In [12]:
TEXT.build_vocab(train_data, max_size=config.max_vocab_size)
LABEL.build_vocab(train_data)

In [13]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 72864
Unique tokens in LABEL vocabulary: 2


In [14]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key = lambda x: x.text,
    batch_size=config.batch_size,
    device=config.device)

In [15]:
# print('Train:')
# for batch in train_iterator:
#     print(batch)
    
# print('Valid:')
# for batch in valid_iterator:
#     print(batch)
    
# print('Test:')
# for batch in test_iterator:
#     print(batch)

In [16]:
print(TEXT.vocab.freqs.most_common(20))

[('хотеть', 7002), ('весь', 6334), ('день', 6002), ('мочь', 5554), ('такой', 5356), ('сегодня', 5297), ('очень', 4601), ('быть', 4362), ('ты', 4251), ('один', 4175), ('просто', 4154), ('год', 4065), ('мой', 4025), ('хороший', 3979), ('человек', 3688), ('знать', 3554), ('любить', 3349), ('завтра', 3189), ('свой', 2902), ('вообще', 2901)]


In [17]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'хотеть', 'весь', 'день', 'мочь', 'такой', 'сегодня', 'очень', 'быть']


In [18]:
print(LABEL.vocab.stoi)

defaultdict(None, {'1': 0, '0': 1})


In [19]:
import zipfile
import gensim
import wget

model_url = 'http://vectors.nlpl.eu/repository/11/187.zip'
# wget.download(model_url)
w2v_model = gensim.models.KeyedVectors.load('187/model.model')
numpy_embeddings = np.zeros(shape=[config.max_vocab_size, config.embed_dim],dtype=np.float32)

for word in TEXT.vocab.itos:
    vector = w2v_model.get_vector(word)
    index  = TEXT.vocab.stoi[word]
    numpy_embeddings[index] = vector
    
pretrained_embeddings = torch.Tensor(numpy_embeddings).float()
pretrained_embeddings.shape

torch.Size([100000, 300])

In [20]:
type(pretrained_embeddings)

torch.Tensor

In [21]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx, pretrained_embeddings):
        super().__init__()
#         self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=pad_idx, freeze=False)
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, enforce_sorted=False)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden)

In [22]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = config.embed_dim
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX,
            pretrained_embeddings
           )

In [23]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 32,720,257 trainable parameters


In [24]:
pretrained_embeddings

tensor([[ 0.0306, -0.3523, -0.1600,  ..., -0.6755,  0.1621, -0.6484],
        [ 0.5594,  0.7161, -0.0522,  ..., -0.3110,  1.0994, -0.7446],
        [ 1.3671,  0.1885, -1.3516,  ...,  0.9948, -1.3568, -1.2967],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [25]:
print(pretrained_embeddings.shape)

torch.Size([100000, 300])


In [26]:
# model.embedding.weight.data.copy_(pretrained_embeddings)

In [27]:
# UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
# model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# print(model.embedding.weight.data)

In [28]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [29]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(config.device)
criterion = criterion.to(config.device)

In [30]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [31]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        
        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [32]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [33]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [34]:
N_EPOCHS = config.num_epochs

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 2s
	Train Loss: 0.595 | Train Acc: 67.69%
	 Val. Loss: 0.561 |  Val. Acc: 70.95%
Epoch: 02 | Epoch Time: 0m 57s
	Train Loss: 0.557 | Train Acc: 71.01%
	 Val. Loss: 0.556 |  Val. Acc: 71.85%
Epoch: 03 | Epoch Time: 1m 0s
	Train Loss: 0.531 | Train Acc: 73.09%
	 Val. Loss: 0.557 |  Val. Acc: 72.32%
Epoch: 04 | Epoch Time: 1m 1s
	Train Loss: 0.509 | Train Acc: 74.71%
	 Val. Loss: 0.549 |  Val. Acc: 72.81%
Epoch: 05 | Epoch Time: 1m 1s
	Train Loss: 0.490 | Train Acc: 76.03%
	 Val. Loss: 0.588 |  Val. Acc: 72.52%
Epoch: 06 | Epoch Time: 1m 2s
	Train Loss: 0.473 | Train Acc: 77.17%
	 Val. Loss: 0.586 |  Val. Acc: 72.53%
Epoch: 07 | Epoch Time: 1m 0s
	Train Loss: 0.457 | Train Acc: 78.06%
	 Val. Loss: 0.608 |  Val. Acc: 72.10%
Epoch: 08 | Epoch Time: 1m 0s
	Train Loss: 0.445 | Train Acc: 78.88%
	 Val. Loss: 0.602 |  Val. Acc: 72.06%
Epoch: 09 | Epoch Time: 1m 2s
	Train Loss: 0.434 | Train Acc: 79.55%
	 Val. Loss: 0.626 |  Val. Acc: 72.06%
Epoch: 10 | Epoch Time: 1m 

KeyboardInterrupt: 

In [35]:
model.load_state_dict(torch.load('tut2-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.545 | Test Acc: 72.83%
