In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

import copy
import traceback
import datetime
import joblib
import re
import os
import random
import string
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torchtext import data
from torchtext import vocab
from torchtext.vocab import Vectors, GloVe

from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pymorphy2
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
class ConfigExperiment:
    seed = 42
    positive_file = "../data/positive.csv"
    negative_file = "../data/negative.csv"
    russian_stop_words = "../data/russian_stop_words.txt"
    english_stop_words = "../data/english_stop_words.txt"
    test_size = 0.3
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embed_dim = 100
    batch_size = 256
    num_epochs = 50
    lr = 1e-2
    num_workers = 0
    patience = 3
    early_stopping_delta = 1e-4
    save_dirname = "models"
    
config = ConfigExperiment()

In [4]:
def init_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic=True
    
init_random_seed(config.seed)

In [11]:
df = pd.read_csv("../data/preprocessed_text_v1.csv", index_col=False)
df.columns = ['text', 'target']

In [12]:
df.head()

Unnamed: 0,text,target
0,работа полный пиддес каждый закрытие месяц сви...,0
1,коллега сидеть рубиться urban terror долбать в...,0
2,говорят обещаной год ждать,0
3,желать хороший полёт удачный посадка быть очен...,0
4,обновить какой леший surf работать простоплеер,0


In [15]:
tokenize = lambda x: x.split()
t = df[df['text'].map(str) == 'nan']
t

Unnamed: 0,text,target
118,,0
1303,,0
1370,,0
2152,,0
2209,,0
...,...,...
225060,,1
225062,,1
225227,,1
225274,,1


In [16]:
df = df.drop(t.index)

In [17]:
df['text'].describe()

count                                                226025
unique                                               207965
top       офигенный день день позитив бегать идиот целый...
freq                                                    156
Name: text, dtype: object

In [20]:
train_df, test_df = train_test_split(df, random_state=config.seed, test_size=config.test_size)
train_df.to_csv("../data/train_processed_data.csv", index=False)
test_df.to_csv("../data/test_processed_data.csv", index=False)

In [21]:
tokenize = lambda x: str(x).split()
TEXT = data.Field(sequential=True, tokenize=tokenize, include_lengths=True, batch_first=True)
LABEL = data.LabelField(batch_first=True, dtype=torch.float)

fields = [('text',TEXT), ('label', LABEL)]

# train_data, valid_data = data.TabularDataset.splits(path='../data', 
#                                             format='csv', 
#                                             train='train_processed_data.csv', 
#                                             validation='test_processed_data.csv', 
#                                             fields=fields, 
#                                             skip_header=True)

# TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
# LABEL.build_vocab(train_data)


In [22]:
class DataFrameDataset(data.Dataset):

    def __init__(self, df, fields, is_test=False, **kwargs):
        examples = []
        for i, row in df.iterrows():
            label = row.target if not is_test else None
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [23]:
%%time

train_ds, test_ds = DataFrameDataset.splits(fields, train_df=train_df, val_df=test_df)

CPU times: user 31.8 s, sys: 168 ms, total: 32 s
Wall time: 32 s


In [24]:
type(train_ds)

__main__.DataFrameDataset

In [25]:
len(train_ds), len(test_ds)

(158217, 67808)

In [26]:
train_ds.fields.items()

dict_items([('text', <torchtext.data.field.Field object at 0x7fc94d55f100>), ('label', <torchtext.data.field.LabelField object at 0x7fc94d55fc10>)])

In [27]:
ex = train_ds[0]
type(ex)

torchtext.data.example.Example

In [28]:
ex.text

['жаль', 'какой', 'город']

In [29]:
ex.label

0

In [30]:
# Lets look at a random example
print(vars(train_ds[15]))

# Check the type 
print(type(train_ds[15]))

{'text': ['ахахааххаа', 'хороший', 'дело', 'самый', 'любимый', 'щекси', 'дэнс', 'мащина'], 'label': 1}
<class 'torchtext.data.example.Example'>


In [31]:
TEXT.build_vocab(train_ds.text)

In [32]:
LABEL.build_vocab(train_ds)

In [33]:
vocab_size = len(TEXT.vocab)
vocab_size

79462

In [34]:
TEXT.vocab.itos[:20]

['<unk>',
 '<pad>',
 'хотеть',
 'весь',
 'день',
 'мочь',
 'такой',
 'сегодня',
 'очень',
 'быть',
 'ты',
 'один',
 'мой',
 'просто',
 'год',
 'хороший',
 'человек',
 'знать',
 'любить',
 'завтра']

In [35]:
print(TEXT.vocab.freqs.most_common(10)) 

[('хотеть', 8295), ('весь', 7411), ('день', 7012), ('мочь', 6436), ('такой', 6282), ('сегодня', 6124), ('очень', 5371), ('быть', 5167), ('ты', 4976), ('один', 4846)]


In [36]:
print(list(TEXT.vocab.stoi.keys())[:20]) 

['<unk>', '<pad>', 'хотеть', 'весь', 'день', 'мочь', 'такой', 'сегодня', 'очень', 'быть', 'ты', 'один', 'мой', 'просто', 'год', 'хороший', 'человек', 'знать', 'любить', 'завтра']


In [37]:
print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
print("Label Length: " + str(len(LABEL.vocab)))

Length of Text Vocabulary: 79462
Label Length: 2


In [38]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_ds, test_ds), 
    sort_key=lambda x: len(x.text),
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

In [39]:
# Hyperparameters
num_epochs = 25
learning_rate = 0.0001

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.4
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # padding

In [40]:
class LSTM_net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        # text = [sent len, batch size]
        embedded = self.embedding(text)
        # embedded = [sent len, batch size, emb dim]

        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
#         packed_output, (hidden, cell) = self.rnn(embedded)
        
        #unpack sequence
        # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        # output = [sent len, batch size, hid dim * num directions]
        # output over padding tokens are zero tensors
        
        # hidden = [num layers * num directions, batch size, hid dim]
        # cell = [num layers * num directions, batch size, hid dim]
        
        # concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        # and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        output = self.fc1(hidden)
        output = self.dropout(self.fc2(output))
        
        output = self.sigmoid(output)
        
        #hidden = [batch size, hid dim * num directions]
        return output

In [41]:
model = LSTM_net(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [42]:
#  to initiaise padded to zeros
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 1.9269,  1.4873,  0.9007,  ...,  0.2539,  0.9364,  0.7122],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4982, -1.2000,  0.1271,  ..., -0.3867,  0.9578, -0.8225],
        ...,
        [-0.5829,  0.5939, -0.3095,  ..., -2.1681,  2.1054,  0.9226],
        [-1.0517, -0.5362, -0.1695,  ...,  2.3034, -0.9964,  0.2445],
        [ 0.0448, -1.0326, -0.6188,  ...,  0.6355,  0.1746,  0.9967]])


In [43]:
model.to(device) #CNN to GPU


# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [44]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [45]:
def train(model, iterator):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        text, text_lengths = batch.text
        
        optimizer.zero_grad()
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)

        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [46]:
def evaluate(model, iterator):
    
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)
            acc = binary_accuracy(predictions, batch.label)
            
            epoch_acc += acc.item()
        
    return epoch_acc / len(iterator)

In [47]:
t = time.time()
loss=[]
acc=[]
val_acc=[]

for epoch in range(num_epochs):
    
    train_loss, train_acc = train(model, train_iterator)
    valid_acc = evaluate(model, valid_iterator)
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Acc: {valid_acc*100:.2f}%')
    
    loss.append(train_loss)
    acc.append(train_acc)
    val_acc.append(valid_acc)
    
print(f'time:{time.time()-t:.3f}')

	Train Loss: 0.693 | Train Acc: 50.93%
	 Val. Acc: 49.12%
	Train Loss: 0.681 | Train Acc: 51.84%
	 Val. Acc: 49.11%
	Train Loss: 0.676 | Train Acc: 52.94%
	 Val. Acc: 49.56%
	Train Loss: 0.673 | Train Acc: 54.86%
	 Val. Acc: 50.50%
	Train Loss: 0.670 | Train Acc: 56.24%
	 Val. Acc: 51.54%
	Train Loss: 0.668 | Train Acc: 57.20%
	 Val. Acc: 54.54%
	Train Loss: 0.666 | Train Acc: 58.63%
	 Val. Acc: 56.46%
	Train Loss: 0.664 | Train Acc: 59.15%
	 Val. Acc: 56.04%
	Train Loss: 0.662 | Train Acc: 59.80%
	 Val. Acc: 59.33%
	Train Loss: 0.662 | Train Acc: 60.65%
	 Val. Acc: 58.83%
	Train Loss: 0.660 | Train Acc: 61.23%
	 Val. Acc: 61.24%
	Train Loss: 0.659 | Train Acc: 61.24%
	 Val. Acc: 61.61%
	Train Loss: 0.658 | Train Acc: 61.96%
	 Val. Acc: 63.19%
	Train Loss: 0.657 | Train Acc: 62.53%
	 Val. Acc: 63.87%
	Train Loss: 0.655 | Train Acc: 62.89%
	 Val. Acc: 64.60%
	Train Loss: 0.655 | Train Acc: 63.52%
	 Val. Acc: 65.63%
	Train Loss: 0.655 | Train Acc: 63.80%
	 Val. Acc: 66.20%
	Train Loss: 0

In [48]:
# model = LogisticRegression(tfidf_train.shape[1], 1)
# model.load_state_dict(torch.load("models/best_model.pth"))
# torch.save(model.state_dict(), "models/torch_baseline_logistic_regression.pth")
# model.load_state_dict(torch.load("models/torch_baseline_logistic_regression.pth"))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

LSTM_net(
  (embedding): Embedding(79462, 300, padding_idx=1)
  (rnn): LSTM(300, 256, num_layers=2, dropout=0.4, bidirectional=True)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.4, inplace=False)
  (sigmoid): Sigmoid()
)

In [49]:
ground_truth = []
results_by_batch = []
labels = []
with torch.no_grad():
    for batch in tqdm(valid_iterator, total=len(valid_iterator) / config.batch_size):
        text, text_lengths = batch.text
        batch_pred = model(text, text_lengths).squeeze(1)
        results_by_batch.append(batch_pred.detach().cpu().numpy())
        ground_truth.append(batch.label.cpu().numpy())
y_test = np.concatenate(ground_truth, 0)
y_preds = np.concatenate(results_by_batch, 0)

HBox(children=(FloatProgress(value=0.0, max=2.0703125), HTML(value='')))




In [50]:
print('accuracy score: ',accuracy_score(y_test, y_preds.round()))
print('\n')
print('confusion matrix: \n',confusion_matrix(y_test,y_preds.round()))
print('\n')
print(classification_report(y_test, y_preds.round()))

accuracy score:  0.6969384143463898


confusion matrix: 
 [[29223  5316]
 [15234 18035]]


              precision    recall  f1-score   support

         0.0       0.66      0.85      0.74     34539
         1.0       0.77      0.54      0.64     33269

    accuracy                           0.70     67808
   macro avg       0.71      0.69      0.69     67808
weighted avg       0.71      0.70      0.69     67808



In [51]:
f1_score(y_test, y_preds.round(), average="macro")

0.6884571452978254

In [None]:
tokenizer = lambda x: str(x).split()

In [47]:
TEXT = data.Field(sequential=True, 
                       tokenize=tokenizer, 
                       include_lengths=True, 
                       use_vocab=True)

LABEL = data.Field(sequential=False, 
                         use_vocab=False, 
                         pad_token=None, 
                         unk_token=None)

fields = [('text',TEXT), ('label', LABEL)]

trainds, valds = data.TabularDataset.splits(path='../data', 
                                            format='csv', 
                                            train='train_processed_data.csv', 
                                            validation='test_processed_data.csv', 
                                            fields=fields, 
                                            skip_header=True)

In [48]:
type(trainds)

torchtext.data.dataset.TabularDataset

In [49]:
len(trainds), len(valds)

(158783, 68051)

In [50]:
trainds.fields.items()

dict_items([('text', <torchtext.data.field.Field object at 0x7fec3c3f4040>), ('label', <torchtext.data.field.Field object at 0x7fec3c3f4430>)])

In [51]:
ex = trainds[0]
type(ex)

torchtext.data.example.Example

In [52]:
ex.text

['luna', 'самый', 'самый', 'любимый', 'рождественский', 'песенка', 'год']

In [53]:
ex.label

'1'

In [54]:
TEXT.build_vocab([trainds.text, valds.text], min_freq=3)
LABEL.build_vocab(trainds.label)

In [55]:
vocab_size = len(TEXT.vocab)
vocab_size

29027

In [56]:
TEXT.vocab.itos[:20]

['<unk>',
 '<pad>',
 'хотеть',
 'весь',
 'день',
 'мочь',
 'такой',
 'сегодня',
 'очень',
 'быть',
 'ты',
 'один',
 'просто',
 'мой',
 'год',
 'хороший',
 'человек',
 'знать',
 'любить',
 'завтра']

In [57]:
print(TEXT.vocab.freqs.most_common(10)) 

[('хотеть', 11807), ('весь', 10570), ('день', 9982), ('мочь', 9172), ('такой', 8928), ('сегодня', 8786), ('очень', 7653), ('быть', 7331), ('ты', 7083), ('один', 6912)]


In [58]:
print(list(TEXT.vocab.stoi.keys())[:20]) 

['<unk>', '<pad>', 'хотеть', 'весь', 'день', 'мочь', 'такой', 'сегодня', 'очень', 'быть', 'ты', 'один', 'просто', 'мой', 'год', 'хороший', 'человек', 'знать', 'любить', 'завтра']


In [59]:
print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
print("Label Length: " + str(len(LABEL.vocab)))

Length of Text Vocabulary: 29027
Label Length: 2


In [60]:
train_iterator, valid_iterator = data.BucketIterator.splits(
    datasets=(trainds, valds), # specify train and validation Tabulardataset
    batch_size=config.batch_size,  # batch size of train and validation
    sort_key=lambda x: len(x.text), # on what attribute the text should be sorted
    device=config.device, # -1 mean cpu and 0 or None mean gpu
    sort_within_batch=True, 
    repeat=False
)

In [61]:
print(len(train_iterator), len(valid_iterator))

621 266


In [62]:
batch = next(iter(train_iterator)) # BucketIterator return a batch object
print(type(batch))

<class 'torchtext.data.batch.Batch'>


In [63]:
print(batch.text)

(tensor([[ 5126,   703,  5329,  ...,   673,   293, 27600],
        [ 1912, 22628,    21,  ...,     7,   676,     0],
        [ 1206,  1792,   413,  ...,     8,   217,     0],
        [  245,  4096,    29,  ...,   545,   272,     0],
        [   13,   440,   302,  ..., 14406,    18,     0],
        [  767,   490,  9641,  ...,   744,   552,     0]], device='cuda:0'), tensor([6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6

In [64]:
print(batch.label)

tensor([1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1,
        0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
        1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
        0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
        1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
        0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
        0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
        1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1], device='cuda:0')


In [65]:
print(batch.dataset.fields)

{'text': <torchtext.data.field.Field object at 0x7fec3c3f4040>, 'label': <torchtext.data.field.Field object at 0x7fec3c3f4430>}


In [66]:
from torchtext.data import Iterator, BucketIterator

In [67]:
class TextLinearClassifier(nn.Module):
    
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextLinearClassifier, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.linear = nn.Linear(embed_dim, num_class)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, text_len):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_len)
        outputs = self.linear(embedded)
        y_pred = self.sigmoid(outputs.view(1, -1).squeeze(0))
        return y_pred
    
model = TextLinearClassifier(vocab_size, config.embed_dim, 1)
model.to(config.device)

TextLinearClassifier(
  (embedding): EmbeddingBag(29027, 100, mode=mean)
  (linear): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [68]:
# Подсчет количества тренеруемых параметров модели
sum(p.numel() for p in model.parameters() if p.requires_grad)

2902801

In [69]:
criterion = nn.BCELoss()
optimizer = optim.SparseAdam(model.parameters(), lr=config.lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True, mode="max", factor=0.3)

In [70]:
class Trainer:
    def __init__(self, model, train_dataloader: DataLoader, valid_dataloader: DataLoader, criterion, optimizer, scheduler, config: ConfigExperiment):
        self.model = model
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = config.device
        self.config = config
        self.max_train_iterations = 40
#         self.max_train_iterations = len(self.train_dataloader)
        self.max_valid_iterations = 10
#         self.max_valid_iterations = len(self.valid_dataloader)
        self.train_metrics = {
            'avg_loss': [],
            'accuracy': [],
            'f1': [],
        }
        self.valid_metrics = {
            'avg_loss': [],
            'accuracy': [],
            'f1': [],
        }
        self.counter = 0
        self.delta = config.early_stopping_delta
      
    def run(self):
        self.model.to(self.device)
        best_valid_loss = float('inf')
        best_target_metric = 0

        try:
            for i_epoch in tqdm(range(self.config.num_epochs), desc='Epochs', total=config.num_epochs, position=1, leave=True):
                start_time = time.time()

                train_loss, train_outputs, train_targets = self._train()
                valid_loss, valid_outputs, valid_targets = self._evaluate()
                    
                self.train_metrics["avg_loss"].append(train_loss)
                self.train_metrics["accuracy"].append(accuracy_score(train_targets, train_outputs.round()))
                self.train_metrics["f1"].append(f1_score(train_targets, train_outputs.round(), average="macro"))
                
                self.valid_metrics["avg_loss"].append(valid_loss)
                self.valid_metrics["accuracy"].append(accuracy_score(valid_targets, valid_outputs.round()))
                self.valid_metrics["f1"].append(f1_score(valid_targets, valid_outputs.round(), average="macro"))
                
                end_time = time.time()
                epoch_mins, epoch_secs = self._epoch_time(start_time, end_time)
                self.print_progress(i_epoch, epoch_mins, epoch_secs)
                
                self.scheduler.step(self.valid_metrics["f1"][-1])
                
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    torch.save(model.state_dict(), f"{config.save_dirname}/best_model.pth")
                    
                if self.valid_metrics["f1"][-1] > best_target_metric:
                    self.counter = 0
                    best_target_metric = self.valid_metrics["f1"][-1]
                    torch.save(model.state_dict(), f"{config.save_dirname}/best_model.pth")
                else:
                    self.counter += 1
                    
                if self.counter > self.config.patience:
                    print("EarlyStopping")
                    break
        except KeyboardInterrupt:
            pass
        
        return self.train_metrics, self.valid_metrics
        
    def _train(self):
        model.train()
        epoch_loss = 0
        epoch_output = None
        epoch_target = None
        for i, batch in tqdm(enumerate(self.train_dataloader), desc='Train', total=self.max_train_iterations, position=2, leave=True):
            if i >= self.max_train_iterations:
                break
                
            loss_iten, outputs = self._train_process(batch)
            epoch_loss += loss_iten 

            if epoch_output is None:
                epoch_output = outputs.cpu().data
            else:
                epoch_output = torch.cat((epoch_output, outputs.cpu().data))

            if epoch_target is None:
                epoch_target = labels.cpu().data
            else:
                epoch_target = torch.cat((epoch_target, labels.cpu().data))
            
        return epoch_loss / len(self.train_dataloader), epoch_output, epoch_target
    
    def _train_process(self, batch):
        text, text_len = batch.text
        labels = batch.label
        
#         print(text)
#         print(text_len)
#         print(labels)
#         print(text.shape, labels.shape)
        
        
#         text, labels = text.to(self.device), labels.to(self.device)
        
        self.optimizer.zero_grad()
        outputs = self.model(text, text_len)
        loss = self.criterion(outputs, labels)
        loss.backward()
        self.optimizer.step()
        return loss.item(), outputs
            
    def _evaluate(self):
        model.eval()
        epoch_loss = 0
        epoch_output = None
        epoch_target = None
        with torch.no_grad():
            for i, batch in tqdm(enumerate(self.valid_dataloader), desc='Valid', total=self.max_valid_iterations, position=3, leave=True):
                if i >= self.max_valid_iterations:
                    break
                
                text, text_len = batch.text
                labels = batch.label
#                 text, text_lengths = batch.text
#                 text, text_lengths = text.to(self.device), text_lengths.to(self.device)
#                 labels = batch.label.to(self.device)
                outputs = model(text, text_len)
                loss = criterion(outputs, labels)
                epoch_loss += loss.item()
                
                if epoch_output is None:
                    epoch_output = outputs.cpu().data
                else:
                    epoch_output = torch.cat((epoch_output, outputs.cpu().data))

                if epoch_target is None:
                    epoch_target = labels.cpu().data
                else:
                    epoch_target = torch.cat((epoch_target, labels.cpu().data))

        return epoch_loss / len(self.valid_dataloader), epoch_output, epoch_target
 
    def _epoch_time(self, start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def print_progress(self, i_epoch, epoch_mins, epoch_secs):
        i_epoch = i_epoch + 1
        print(f"Epoch: {i_epoch:02} | Time: {epoch_mins}m {epoch_secs}s")
        print("Training Results - Average Loss: {:.4f} | accuracy: {:.4f} | f1: {:.4f}"
            .format(
                self.train_metrics['avg_loss'][-1], 
                self.train_metrics['accuracy'][-1],
                self.train_metrics['f1'][-1],
            ))
        print("Evaluating Results - Average Loss: {:.4f} | accuracy: {:.4f} | f1: {:.4f}"
            .format( 
                self.valid_metrics['avg_loss'][-1],
                self.valid_metrics['accuracy'][-1],
                self.valid_metrics['f1'][-1],
            ))
        print()



In [71]:
trainer = Trainer(model, train_iterator, valid_iterator, criterion, optimizer, scheduler, config)
trainer.run();

HBox(children=(FloatProgress(value=0.0, description='Epochs', max=50.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…





RuntimeError: Expected `len(lengths)` to be equal to batch_size, but got 256 (batch_size=100)

In [None]:
model = LogisticRegression(tfidf_train.shape[1], 1)
model.load_state_dict(torch.load("models/best_model.pth"))
torch.save(model.state_dict(), "models/torch_linear_model.pth")
model.load_state_dict(torch.load("models/torch_linear_model.pth"))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

In [None]:
results_by_batch = []
labels = []
with torch.no_grad():
    for batch_x, batch_y in tqdm(valid_dataloader, total=len(valid_dataset) / config.batch_size):
        batch_x = batch_x.to(config.device)
        batch_pred = model(batch_x)
        results_by_batch.append(batch_pred.detach().cpu().numpy())
        
y_preds = np.concatenate(results_by_batch, 0)

In [None]:
print('accuracy score: ',accuracy_score(y_test, y_preds.round()))
print('\n')
print('confusion matrix: \n',confusion_matrix(y_test,y_preds.round()))
print('\n')
print(classification_report(y_test, y_preds.round()))

In [None]:
f1_score(y_test, y_preds.round(), average="macro")