In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

import copy
import traceback
import datetime
import joblib
import re
import os
import random
import string
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torchtext import data
from torchtext import datasets
from torchtext import vocab
from torchtext.vocab import Vectors, GloVe

from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pymorphy2
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
class ConfigExperiment:
    seed = 42
    positive_file = "../data/positive.csv"
    negative_file = "../data/negative.csv"
    test_size = 0.3
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embed_dim = 200
    max_vocab_size = 25_000
    batch_size = 64
    num_epochs = 50
    lr = 1e-2
    num_workers = 0
    patience = 3
    early_stopping_delta = 1e-4
    save_dirname = "models"
    
config = ConfigExperiment()

In [4]:
def init_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic=True
    
init_random_seed(config.seed)

In [5]:
# Формирование train test valid данных

df = pd.read_csv("../data/preprocessed_text_v1.csv", index_col=False)
df.columns = ['text', 'target']
df = df.drop(df[df['text'].map(str) == 'nan'].index)

train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
train.to_csv("../data/train_processed_data.csv", index=False)
validate.to_csv("../data/validate_processed_data.csv", index=False)
test.to_csv("../data/test_processed_data.csv", index=False)

train.shape, validate.shape, test.shape

((135615, 2), (45205, 2), (45205, 2))

In [6]:
tokenize = lambda x: str(x).split()

TEXT = data.Field(sequential=True, tokenize=tokenize)
LABEL = data.LabelField(dtype=torch.float)

fields = [('text',TEXT), ('label', LABEL)]

In [7]:

train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path="../data/",
                                        train="train_processed_data.csv",
                                        validation="validate_processed_data.csv",
                                        test="test_processed_data.csv",
                                        format="csv",
                                        fields=fields,
                                        skip_header=True
)

In [8]:
print(vars(train_data[0]))

{'text': ['блин', 'улица', 'плюсовой', 'температура', 'зря', 'шуба', 'доставать'], 'label': '0'}


In [9]:
print(vars(valid_data[0]))

{'text': ['час', 'спасть'], 'label': '0'}


In [10]:
print(vars(test_data[0]))

{'text': ['шикарный', 'сибирский', 'кот', 'симб', 'справиться', 'быть', 'выпускать', 'жаль'], 'label': '0'}


In [11]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 135615
Number of validation examples: 45205
Number of testing examples: 45205


In [12]:
TEXT.build_vocab(train_data, max_size=config.max_vocab_size)
LABEL.build_vocab(train_data)

In [13]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [14]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key = lambda x: x.text,
    batch_size=config.batch_size,
    device=config.device)

In [15]:
print('Train:')
for batch in train_iterator:
    print(batch)
    
print('Valid:')
for batch in valid_iterator:
    print(batch)
    
print('Test:')
for batch in test_iterator:
    print(batch)

Train:

[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.cuda.LongTensor of size 17x64 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]

[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.cuda.LongTensor of size 15x64 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]

[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.cuda.LongTensor of size 14x64 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]

[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.cuda.LongTensor of size 14x64 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]

[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.cuda.LongTensor of size 17x64 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]

[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.cuda.LongTensor of size 15x64 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]

[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.cuda.LongTensor of size 14x64 (GPU 0

In [16]:
print(TEXT.vocab.freqs.most_common(20))

[('хотеть', 7002), ('весь', 6334), ('день', 6002), ('мочь', 5554), ('такой', 5356), ('сегодня', 5297), ('очень', 4601), ('быть', 4362), ('ты', 4251), ('один', 4175), ('просто', 4154), ('год', 4065), ('мой', 4025), ('хороший', 3979), ('человек', 3688), ('знать', 3554), ('любить', 3349), ('завтра', 3189), ('свой', 2902), ('вообще', 2901)]


In [17]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'хотеть', 'весь', 'день', 'мочь', 'такой', 'сегодня', 'очень', 'быть']


In [18]:
print(LABEL.vocab.stoi)

defaultdict(None, {'1': 0, '0': 1})


In [19]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        #text = [sent len, batch size]
        embedded = self.embedding(text)
        #embedded = [sent len, batch size, emb dim]
        output, hidden = self.rnn(embedded)
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        return self.fc(hidden.squeeze(0))

In [20]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = config.embed_dim
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [21]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,117,905 trainable parameters


In [22]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [23]:
criterion = nn.BCEWithLogitsLoss()

In [24]:
model = model.to(config.device)
criterion = criterion.to(config.device)

In [25]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [26]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [27]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [28]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [29]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 11s
	Train Loss: 0.694 | Train Acc: 50.34%
	 Val. Loss: 0.694 |  Val. Acc: 49.88%
Epoch: 02 | Epoch Time: 0m 11s
	Train Loss: 0.694 | Train Acc: 50.23%
	 Val. Loss: 0.694 |  Val. Acc: 50.53%
Epoch: 03 | Epoch Time: 0m 10s
	Train Loss: 0.693 | Train Acc: 50.69%
	 Val. Loss: 0.693 |  Val. Acc: 50.23%
Epoch: 04 | Epoch Time: 0m 8s
	Train Loss: 0.693 | Train Acc: 50.57%
	 Val. Loss: 0.693 |  Val. Acc: 50.31%
Epoch: 05 | Epoch Time: 0m 10s
	Train Loss: 0.693 | Train Acc: 50.75%
	 Val. Loss: 0.693 |  Val. Acc: 50.38%
Epoch: 06 | Epoch Time: 0m 10s
	Train Loss: 0.693 | Train Acc: 50.51%
	 Val. Loss: 0.693 |  Val. Acc: 50.37%
Epoch: 07 | Epoch Time: 0m 10s
	Train Loss: 0.693 | Train Acc: 50.81%
	 Val. Loss: 0.693 |  Val. Acc: 50.35%
Epoch: 08 | Epoch Time: 0m 10s
	Train Loss: 0.693 | Train Acc: 50.73%
	 Val. Loss: 0.693 |  Val. Acc: 52.39%
Epoch: 09 | Epoch Time: 0m 10s
	Train Loss: 0.693 | Train Acc: 50.86%
	 Val. Loss: 0.693 |  Val. Acc: 50.37%
Epoch: 10 | Epoch Ti

In [30]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.693 | Test Acc: 50.47%
