In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

import copy
import traceback
import datetime
import joblib
import re
import os
import random
import string
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torchtext import data
from torchtext import datasets
from torchtext import vocab
from torchtext.vocab import Vectors, GloVe

from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pymorphy2
from sklearn.base import BaseEstimator, TransformerMixin

import gensim

In [3]:
class ConfigExperiment:
    seed = 42
    positive_file = "../data/positive.csv"
    negative_file = "../data/negative.csv"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embed_dim = 300
    max_vocab_size = 50_000
    batch_size = 2048
    num_epochs = 50
    lr = 1e-2
    num_workers = 0
    patience = 5
    early_stopping_delta = 2
    save_dirname = "models"
    
config = ConfigExperiment()

In [4]:
def init_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic=True
    
init_random_seed(config.seed)

### Подготовка данных

In [5]:
train = pd.read_csv("../data/train_processed_data_regression.csv", index_col=False)
validate = pd.read_csv("../data/validate_processed_data_regression.csv", index_col=False)
test = pd.read_csv("../data/test_processed_data_regression.csv", index_col=False)


tokenize = lambda x: str(x).split()
TEXT = data.Field(sequential=True, tokenize=tokenize, include_lengths=True)
# число всех сообщений пользователя в сети twitter;
TSTCOUNT = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
# количество фоловеров пользователя (тех людей, которые читают пользователя);
TFOLL = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
# количество друзей пользователя (те люди, которых читает пользователь);
TFRIEN = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
# количество листов-подписок в которые добавлен твиттер-пользователь.
LISTCOUNT = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
# число ретвитов
TARGET = data.Field(sequential=False, use_vocab=False, dtype=torch.float)

fields = [('text',TEXT), ('tstcount', TSTCOUNT), ('tfoll', TFOLL), ('tfrien', TFRIEN), ('listcount', LISTCOUNT), ('target', TARGET)]

train_data, valid_data, test_data = data.TabularDataset.splits(
    path="../data/",
    train="train_processed_data_regression.csv",
    validation="validate_processed_data_regression.csv",
    test="test_processed_data_regression.csv",
    format="csv",
    fields=fields,
    skip_header=True)

TEXT.build_vocab(train_data, min_freq=2)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key = lambda x: x.text,
    batch_size=config.batch_size,
    device=config.device)

### Скачать и рапаковать предобученные веса

In [6]:
# from zipfile import ZipFile
# import wget

# model_url = 'http://vectors.nlpl.eu/repository/11/187.zip'
# wget.download(model_url)
# with ZipFile('187.zip', 'r') as zipObj:
#    # Extract all the contents of zip file in different directory
#    zipObj.extractall('187')

### Составить матрицу предобученных весов для словаря 

In [7]:
w2v_model = gensim.models.KeyedVectors.load('187/model.model')
numpy_embeddings = np.zeros(shape=[len(TEXT.vocab), config.embed_dim],dtype=np.float32)

for word in TEXT.vocab.itos:
    vector = w2v_model.get_vector(word)
    index  = TEXT.vocab.stoi[word]
    numpy_embeddings[index] = vector
    
pretrained_embeddings = torch.Tensor(numpy_embeddings).float()
pretrained_embeddings.shape

torch.Size([33041, 300])

### Создание модели

In [8]:
class MultymodalNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, hidden, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=pad_idx, freeze=False)
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        self.fc_text = nn.Linear(hidden_dim * 2, hidden)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden + 4, output_dim)

    def forward(self, text, text_lengths, tstcount, tfoll, tfrien, listcount):
        
        tstcount, tfoll, tfrien, listcount = tstcount.unsqueeze(1), tfoll.unsqueeze(1), tfrien.unsqueeze(1), listcount.unsqueeze(1)
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, enforce_sorted=False)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)

        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        
        text_features = self.dropout(self.fc_text(hidden))
        
        cat = torch.cat([text_features, tstcount, tfoll, tfrien, listcount], dim=1)

        result = self.fc(cat)
            
        return result


In [9]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = config.embed_dim
HIDDEN_DIM = 256
HIDDEN = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

In [10]:
model = MultymodalNet(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, HIDDEN, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 12,763,633 trainable parameters


### Копирование предобученных весов в модель

In [12]:
model.embedding.weight.data.copy_(pretrained_embeddings)

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [13]:
optimizer = optim.Adam(model.parameters())
criterion = nn.MSELoss()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, verbose=True, mode="max", factor=0.3)

model = model.to(config.device)
criterion = criterion.to(config.device)

In [14]:
class Trainer:
    def __init__(self, model, train_dataloader: DataLoader, valid_dataloader: DataLoader, 
                 criterion, optimizer, scheduler, config: ConfigExperiment, model_name: str):
        self.model = model
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = config.device
        self.config = config
        self.threshold = 0.5
        self.model_name = model_name
        self.train_metrics = {
            'avg_loss': [],
            'mean_squared_error': [],
            'mean_absolute_error': [],
        }
        self.valid_metrics = {
            'avg_loss': [],
            'mean_squared_error': [],
            'mean_absolute_error': [],
        }
        self.counter = 0
        self.delta = config.early_stopping_delta
      
    def run(self):
        self.model.to(self.device)
        best_valid_loss = float('inf')
        best_target_metric = float('inf')

        try:
            for i_epoch in tqdm(range(self.config.num_epochs), desc='Epochs', total=config.num_epochs, position=1, leave=True):
                start_time = time.time()

                train_loss, train_outputs, train_targets = self._train()
                valid_loss, valid_outputs, valid_targets = self._evaluate()
                    
                self.train_metrics["avg_loss"].append(train_loss)
                self.train_metrics["mean_squared_error"].append(mean_squared_error(train_targets, train_outputs.round()))
                self.train_metrics["mean_absolute_error"].append(mean_absolute_error(train_targets, train_outputs.round()))
                
                self.valid_metrics["avg_loss"].append(valid_loss)
                self.valid_metrics["mean_squared_error"].append(mean_squared_error(valid_targets, valid_outputs.round()))
                self.valid_metrics["mean_absolute_error"].append(mean_absolute_error(valid_targets, valid_outputs.round()))
                
                end_time = time.time()
                epoch_mins, epoch_secs = self._epoch_time(start_time, end_time)
                self.print_progress(i_epoch, epoch_mins, epoch_secs)
                
                self.scheduler.step(self.valid_metrics["mean_squared_error"][-1])
                
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    torch.save(model.state_dict(), f"{config.save_dirname}/{self.model_name}.pth")
                    
                if self.valid_metrics["mean_squared_error"][-1] < best_target_metric:
                    self.counter = 0
                    best_target_metric = self.valid_metrics["mean_squared_error"][-1]
                    torch.save(model.state_dict(), f"{config.save_dirname}/{self.model_name}.pth")
                else:
                    self.counter += 1
                    
                if self.counter > self.config.patience:
                    print("EarlyStopping")
                    break
        except KeyboardInterrupt:
            pass
        
        return self.train_metrics, self.valid_metrics
        
    def _train(self):
        model.train()
        epoch_loss = 0
        epoch_output = None
        epoch_target = None
        for i, batch in tqdm(enumerate(self.train_dataloader), desc='Train', total=len(self.train_dataloader), position=2, leave=True):
            loss_iten, outputs = self._train_process(batch)
            epoch_loss += loss_iten 

            if epoch_output is None:
                epoch_output = outputs.cpu().data
            else:
                epoch_output = torch.cat((epoch_output, outputs.cpu().data))

            if epoch_target is None:
                epoch_target = batch.target.cpu().data
            else:
                epoch_target = torch.cat((epoch_target, batch.target.cpu().data))
            
        return epoch_loss / len(self.train_dataloader), epoch_output, epoch_target
    
    def _train_process(self, batch):      
        self.optimizer.zero_grad()
        text, text_lengths = batch.text
        outputs = self.model(text, text_lengths, batch.tstcount, batch.tfoll, batch.tfrien, batch.listcount).squeeze(1)
        loss = self.criterion(outputs, batch.target)
        loss.backward()
        self.optimizer.step()
        return loss.item(), outputs
            
    def _evaluate(self):
        self.model.eval()
        epoch_loss = 0
        epoch_output = None
        epoch_target = None
        with torch.no_grad():
            for i, batch in tqdm(enumerate(self.valid_dataloader), desc='Valid', total=len(self.valid_dataloader), position=3, leave=True):
                text, text_lengths = batch.text
                outputs = self.model(text, text_lengths, batch.tstcount, batch.tfoll, batch.tfrien, batch.listcount).squeeze(1)
                loss = criterion(outputs, batch.target)
                epoch_loss += loss.item()
                
                if epoch_output is None:
                    epoch_output = outputs.cpu().data
                else:
                    epoch_output = torch.cat((epoch_output, outputs.cpu().data))

                if epoch_target is None:
                    epoch_target = batch.target.cpu().data
                else:
                    epoch_target = torch.cat((epoch_target, batch.target.cpu().data))

        return epoch_loss / len(self.valid_dataloader), epoch_output, epoch_target
 
    def _epoch_time(self, start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def print_progress(self, i_epoch, epoch_mins, epoch_secs):
        if type(i_epoch) != str:
            i_epoch = i_epoch + 1
            print(f"Epoch: {i_epoch:02} | Time: {epoch_mins}m {epoch_secs}s")
            print("Training Results - Average Loss: {:.4f} | MSE: {:.4f} | MAE: {:.4f}"
                .format(
                    self.train_metrics['avg_loss'][-1], 
                    self.train_metrics['mean_squared_error'][-1],
                    self.train_metrics['mean_absolute_error'][-1],
                ))      
        else:
            print(f"Epoch: {i_epoch} | Time: {epoch_mins}m {epoch_secs}s")
        print("Evaluating Results - Average Loss: {:.4f} | MSE: {:.4f} | MAE: {:.4f}"
            .format( 
                self.valid_metrics['avg_loss'][-1],
                self.valid_metrics['mean_squared_error'][-1],
                self.valid_metrics['mean_absolute_error'][-1],
            ))
        print()

    def set_model(self, model: nn.Module):
        self.model = model
        
    def evaluate(self, dataloader: DataLoader):
        self.valid_dataloader = dataloader
        self.model.to(self.device)
        start_time = time.time()

        valid_loss, valid_outputs, valid_targets = self._evaluate()

        self.valid_metrics["avg_loss"].append(valid_loss)
        self.valid_metrics["mean_squared_error"].append(mean_squared_error(valid_targets, valid_outputs.round()))
        self.valid_metrics["mean_absolute_error"].append(mean_absolute_error(valid_targets, valid_outputs.round()))

        end_time = time.time()
        epoch_mins, epoch_secs = self._epoch_time(start_time, end_time)
        self.print_progress("evaluate", epoch_mins, epoch_secs)


In [15]:
trainer = Trainer(model, train_iterator, valid_iterator, criterion, optimizer, scheduler, config, "09_retweet_regression_from_lstm")
trainer.run();

HBox(children=(FloatProgress(value=0.0, description='Epochs', max=50.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 01 | Time: 0m 17s
Training Results - Average Loss: 191131.2943 | MSE: 192589.9062 | MAE: 145.5068
Evaluating Results - Average Loss: 20824.4263 | MSE: 20132.6699 | MAE: 35.9160



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 02 | Time: 0m 17s
Training Results - Average Loss: 13496.8538 | MSE: 13462.2617 | MAE: 20.4265
Evaluating Results - Average Loss: 8851.8665 | MSE: 8620.8945 | MAE: 14.4837



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 03 | Time: 0m 18s
Training Results - Average Loss: 9138.5313 | MSE: 9189.9736 | MAE: 11.8706
Evaluating Results - Average Loss: 6804.5360 | MSE: 6658.3750 | MAE: 10.4966



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 04 | Time: 0m 17s
Training Results - Average Loss: 8042.3037 | MSE: 8076.2544 | MAE: 10.3542
Evaluating Results - Average Loss: 6258.4759 | MSE: 6133.1641 | MAE: 11.2283

Epoch     4: reducing learning rate of group 0 to 3.0000e-04.


HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 05 | Time: 0m 17s
Training Results - Average Loss: 7753.8559 | MSE: 7787.4126 | MAE: 10.5595
Evaluating Results - Average Loss: 6227.2277 | MSE: 6095.3945 | MAE: 9.6863



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 06 | Time: 0m 17s
Training Results - Average Loss: 7731.4968 | MSE: 7766.4009 | MAE: 10.0158
Evaluating Results - Average Loss: 6165.6816 | MSE: 6016.4668 | MAE: 10.0082



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 07 | Time: 0m 18s
Training Results - Average Loss: 7905.4926 | MSE: 7860.8970 | MAE: 10.3486
Evaluating Results - Average Loss: 6158.8091 | MSE: 6015.3579 | MAE: 9.7378

Epoch     7: reducing learning rate of group 0 to 9.0000e-05.


HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 08 | Time: 0m 17s
Training Results - Average Loss: 7587.6795 | MSE: 7642.5435 | MAE: 10.0852
Evaluating Results - Average Loss: 6155.9701 | MSE: 6016.8867 | MAE: 10.0422



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 09 | Time: 0m 17s
Training Results - Average Loss: 7768.7930 | MSE: 7669.6001 | MAE: 10.3495
Evaluating Results - Average Loss: 6157.2638 | MSE: 6024.6021 | MAE: 10.0224



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 10 | Time: 0m 17s
Training Results - Average Loss: 7666.7607 | MSE: 7680.7188 | MAE: 10.2846
Evaluating Results - Average Loss: 6150.9420 | MSE: 6017.4902 | MAE: 9.8839

Epoch    10: reducing learning rate of group 0 to 2.7000e-05.


HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 11 | Time: 0m 18s
Training Results - Average Loss: 7607.2610 | MSE: 7632.9512 | MAE: 10.1436
Evaluating Results - Average Loss: 6144.9344 | MSE: 6010.1450 | MAE: 9.9278



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 12 | Time: 0m 18s
Training Results - Average Loss: 7572.4367 | MSE: 7626.5547 | MAE: 10.1855
Evaluating Results - Average Loss: 6147.5089 | MSE: 6015.6753 | MAE: 9.9952



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 13 | Time: 0m 17s
Training Results - Average Loss: 7576.8971 | MSE: 7631.7192 | MAE: 10.2139
Evaluating Results - Average Loss: 6147.0620 | MSE: 6015.8545 | MAE: 10.0415

Epoch    13: reducing learning rate of group 0 to 8.1000e-06.


HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 14 | Time: 0m 17s
Training Results - Average Loss: 9027.4975 | MSE: 7622.4229 | MAE: 10.2467
Evaluating Results - Average Loss: 6147.9808 | MSE: 6018.9053 | MAE: 10.0805



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 15 | Time: 0m 17s
Training Results - Average Loss: 7569.6504 | MSE: 7623.2920 | MAE: 10.2884
Evaluating Results - Average Loss: 6149.0961 | MSE: 6020.7729 | MAE: 10.0884



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 16 | Time: 0m 17s
Training Results - Average Loss: 7561.2696 | MSE: 7621.2017 | MAE: 10.2893
Evaluating Results - Average Loss: 6148.3574 | MSE: 6020.0684 | MAE: 10.0836

Epoch    16: reducing learning rate of group 0 to 2.4300e-06.


HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 17 | Time: 0m 18s
Training Results - Average Loss: 7570.6969 | MSE: 7620.4917 | MAE: 10.2742
Evaluating Results - Average Loss: 6148.8165 | MSE: 6020.2842 | MAE: 10.0782

EarlyStopping



In [16]:
trainer = Trainer(model, train_iterator, valid_iterator, criterion, optimizer, scheduler, config, "09_retweet_regression_from_lstm")
model.load_state_dict(torch.load(f'{config.save_dirname}/09_retweet_regression_from_lstm.pth'))
trainer.set_model(model)
trainer.evaluate(test_iterator)

HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: evaluate | Time: 0m 2s
Evaluating Results - Average Loss: 12428.4574 | MSE: 12895.6846 | MAE: 10.9168

