In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

import copy
import traceback
import datetime
import joblib
import re
import os
import random
import string
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torchtext import data
from torchtext import datasets
from torchtext import vocab
from torchtext.vocab import Vectors, GloVe

from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pymorphy2
from sklearn.base import BaseEstimator, TransformerMixin

import gensim

In [3]:
class ConfigExperiment:
    seed = 42
    positive_file = "../data/positive.csv"
    negative_file = "../data/negative.csv"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embed_dim = 300
    max_vocab_size = 50_000
    batch_size = 2048
    num_epochs = 50
    lr = 1e-2
    num_workers = 0
    patience = 5
    early_stopping_delta = 1
    save_dirname = "models"
    
config = ConfigExperiment()

In [4]:
def init_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic=True
    
init_random_seed(config.seed)

### Подготовка данных

In [5]:
train = pd.read_csv("../data/train_processed_data_regression.csv", index_col=False)
validate = pd.read_csv("../data/validate_processed_data_regression.csv", index_col=False)
test = pd.read_csv("../data/test_processed_data_regression.csv", index_col=False)


tokenize = lambda x: str(x).split()
TEXT = data.Field(sequential=True, tokenize=tokenize, batch_first=True)
# число всех сообщений пользователя в сети twitter;
TSTCOUNT = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
# количество фоловеров пользователя (тех людей, которые читают пользователя);
TFOLL = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
# количество друзей пользователя (те люди, которых читает пользователь);
TFRIEN = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
# количество листов-подписок в которые добавлен твиттер-пользователь.
LISTCOUNT = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
# число ретвитов
TARGET = data.Field(sequential=False, use_vocab=False, dtype=torch.float)

fields = [('text',TEXT), ('tstcount', TSTCOUNT), ('tfoll', TFOLL), ('tfrien', TFRIEN), ('listcount', LISTCOUNT), ('target', TARGET)]

train_data, valid_data, test_data = data.TabularDataset.splits(
    path="../data/",
    train="train_processed_data_regression.csv",
    validation="validate_processed_data_regression.csv",
    test="test_processed_data_regression.csv",
    format="csv",
    fields=fields,
    skip_header=True)

TEXT.build_vocab(train_data, min_freq=2)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key = lambda x: x.text,
    batch_size=config.batch_size,
    device=config.device)

### Скачать и рапаковать предобученные веса

In [6]:
# from zipfile import ZipFile
# import wget

# model_url = 'http://vectors.nlpl.eu/repository/11/187.zip'
# wget.download(model_url)
# with ZipFile('187.zip', 'r') as zipObj:
#    # Extract all the contents of zip file in different directory
#    zipObj.extractall('187')

### Составить матрицу предобученных весов для словаря 

In [7]:
w2v_model = gensim.models.KeyedVectors.load('187/model.model')
numpy_embeddings = np.zeros(shape=[len(TEXT.vocab), config.embed_dim],dtype=np.float32)

for word in TEXT.vocab.itos:
    vector = w2v_model.get_vector(word)
    index  = TEXT.vocab.stoi[word]
    numpy_embeddings[index] = vector
    
pretrained_embeddings = torch.Tensor(numpy_embeddings).float()
pretrained_embeddings.shape

torch.Size([33041, 300])

### Создание модели

In [8]:
class MultymodalNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, hidden, output_dim, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size = fs)
                                    for fs in filter_sizes
                                    ])
        
        self.fc_text = nn.Linear(len(filter_sizes) * n_filters, hidden)
        
        self.bn1 = nn.BatchNorm1d(hidden)
        
        self.dropout = nn.Dropout(dropout)
        
        self.fc = nn.Linear(hidden + 4, output_dim)

    def forward(self, text, tstcount, tfoll, tfrien, listcount):
        
        tstcount, tfoll, tfrien, listcount = tstcount.unsqueeze(1), tfoll.unsqueeze(1), tfrien.unsqueeze(1), listcount.unsqueeze(1)
        
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)  
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.permute(0, 2, 1)
        #embedded = [batch size, emb dim, sent len]
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        #cat = [batch size, n_filters * len(filter_sizes)]
        
        text_features = self.fc_text(cat)
        
        cat = torch.cat([text_features, tstcount, tfoll, tfrien, listcount], dim = 1)

        result = self.fc(cat)
            
        return result


In [9]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = config.embed_dim
N_FILTERS = 128
FILTER_SIZES = [2, 3]
HIDDEN = 256
OUTPUT_DIM = 1
DROPOUT = 0.5

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

In [10]:
model = MultymodalNet(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, HIDDEN, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 10,170,609 trainable parameters


### Копирование предобученных весов в модель

In [12]:
model.embedding.weight.data.copy_(pretrained_embeddings)

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [13]:
optimizer = optim.Adam(model.parameters())
criterion = nn.MSELoss()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, verbose=True, mode="max", factor=0.3)

model = model.to(config.device)
criterion = criterion.to(config.device)

In [14]:
class Trainer:
    def __init__(self, model, train_dataloader: DataLoader, valid_dataloader: DataLoader, 
                 criterion, optimizer, scheduler, config: ConfigExperiment, model_name: str):
        self.model = model
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = config.device
        self.config = config
        self.threshold = 0.5
        self.model_name = model_name
        self.train_metrics = {
            'avg_loss': [],
            'mean_squared_error': [],
            'mean_absolute_error': [],
        }
        self.valid_metrics = {
            'avg_loss': [],
            'mean_squared_error': [],
            'mean_absolute_error': [],
        }
        self.counter = 0
        self.delta = config.early_stopping_delta
      
    def run(self):
        self.model.to(self.device)
        best_valid_loss = float('inf')
        best_target_metric = float('inf')

        try:
            for i_epoch in tqdm(range(self.config.num_epochs), desc='Epochs', total=config.num_epochs, position=1, leave=True):
                start_time = time.time()

                train_loss, train_outputs, train_targets = self._train()
                valid_loss, valid_outputs, valid_targets = self._evaluate()
                    
                self.train_metrics["avg_loss"].append(train_loss)
                self.train_metrics["mean_squared_error"].append(mean_squared_error(train_targets, train_outputs.round()))
                self.train_metrics["mean_absolute_error"].append(mean_absolute_error(train_targets, train_outputs.round()))
                
                self.valid_metrics["avg_loss"].append(valid_loss)
                self.valid_metrics["mean_squared_error"].append(mean_squared_error(valid_targets, valid_outputs.round()))
                self.valid_metrics["mean_absolute_error"].append(mean_absolute_error(valid_targets, valid_outputs.round()))
                
                end_time = time.time()
                epoch_mins, epoch_secs = self._epoch_time(start_time, end_time)
                self.print_progress(i_epoch, epoch_mins, epoch_secs)
                
                self.scheduler.step(self.valid_metrics["mean_squared_error"][-1])
                
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    torch.save(model.state_dict(), f"{config.save_dirname}/{self.model_name}.pth")
                    
                if self.valid_metrics["mean_squared_error"][-1] < best_target_metric:
                    self.counter = 0
                    best_target_metric = self.valid_metrics["mean_squared_error"][-1]
                    torch.save(model.state_dict(), f"{config.save_dirname}/{self.model_name}.pth")
                else:
                    self.counter += 1
                    
                if self.counter > self.config.patience:
                    print("EarlyStopping")
                    break
        except KeyboardInterrupt:
            pass
        
        return self.train_metrics, self.valid_metrics
        
    def _train(self):
        model.train()
        epoch_loss = 0
        epoch_output = None
        epoch_target = None
        for i, batch in tqdm(enumerate(self.train_dataloader), desc='Train', total=len(self.train_dataloader), position=2, leave=True):
            loss_iten, outputs = self._train_process(batch)
            epoch_loss += loss_iten 

            if epoch_output is None:
                epoch_output = outputs.cpu().data
            else:
                epoch_output = torch.cat((epoch_output, outputs.cpu().data))

            if epoch_target is None:
                epoch_target = batch.target.cpu().data
            else:
                epoch_target = torch.cat((epoch_target, batch.target.cpu().data))
            
        return epoch_loss / len(self.train_dataloader), epoch_output, epoch_target
    
    def _train_process(self, batch):      
        self.optimizer.zero_grad()
        outputs = self.model(batch.text, batch.tstcount, batch.tfoll, batch.tfrien, batch.listcount).squeeze(1)
        loss = self.criterion(outputs, batch.target)
        loss.backward()
        self.optimizer.step()
        return loss.item(), outputs
            
    def _evaluate(self):
        self.model.eval()
        epoch_loss = 0
        epoch_output = None
        epoch_target = None
        with torch.no_grad():
            for i, batch in tqdm(enumerate(self.valid_dataloader), desc='Valid', total=len(self.valid_dataloader), position=3, leave=True):
                outputs = self.model(batch.text, batch.tstcount, batch.tfoll, batch.tfrien, batch.listcount).squeeze(1)
                loss = criterion(outputs, batch.target)
                epoch_loss += loss.item()
                
                if epoch_output is None:
                    epoch_output = outputs.cpu().data
                else:
                    epoch_output = torch.cat((epoch_output, outputs.cpu().data))

                if epoch_target is None:
                    epoch_target = batch.target.cpu().data
                else:
                    epoch_target = torch.cat((epoch_target, batch.target.cpu().data))

        return epoch_loss / len(self.valid_dataloader), epoch_output, epoch_target
 
    def _epoch_time(self, start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def print_progress(self, i_epoch, epoch_mins, epoch_secs):
        if type(i_epoch) != str:
            i_epoch = i_epoch + 1
            print(f"Epoch: {i_epoch:02} | Time: {epoch_mins}m {epoch_secs}s")
            print("Training Results - Average Loss: {:.4f} | MSE: {:.4f} | MAE: {:.4f}"
                .format(
                    self.train_metrics['avg_loss'][-1], 
                    self.train_metrics['mean_squared_error'][-1],
                    self.train_metrics['mean_absolute_error'][-1],
                ))      
        else:
            print(f"Epoch: {i_epoch} | Time: {epoch_mins}m {epoch_secs}s")
        print("Evaluating Results - Average Loss: {:.4f} | MSE: {:.4f} | MAE: {:.4f}"
            .format( 
                self.valid_metrics['avg_loss'][-1],
                self.valid_metrics['mean_squared_error'][-1],
                self.valid_metrics['mean_absolute_error'][-1],
            ))
        print()

    def set_model(self, model: nn.Module):
        self.model = model
        
    def evaluate(self, dataloader: DataLoader):
        self.valid_dataloader = dataloader
        self.model.to(self.device)
        start_time = time.time()

        valid_loss, valid_outputs, valid_targets = self._evaluate()

        self.valid_metrics["avg_loss"].append(valid_loss)
        self.valid_metrics["mean_squared_error"].append(mean_squared_error(valid_targets, valid_outputs.round()))
        self.valid_metrics["mean_absolute_error"].append(mean_absolute_error(valid_targets, valid_outputs.round()))

        end_time = time.time()
        epoch_mins, epoch_secs = self._epoch_time(start_time, end_time)
        self.print_progress("evaluate", epoch_mins, epoch_secs)


In [15]:
trainer = Trainer(model, train_iterator, valid_iterator, criterion, optimizer, scheduler, config, "08-multymodal-reg")
trainer.run();

HBox(children=(FloatProgress(value=0.0, description='Epochs', max=50.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 01 | Time: 0m 10s
Training Results - Average Loss: 288469.8288 | MSE: 290596.4688 | MAE: 199.6941
Evaluating Results - Average Loss: 32072.4339 | MSE: 31263.7383 | MAE: 70.2725



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 02 | Time: 0m 10s
Training Results - Average Loss: 22531.0055 | MSE: 22524.1758 | MAE: 35.9627
Evaluating Results - Average Loss: 12842.5804 | MSE: 12453.0439 | MAE: 19.6777



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 03 | Time: 0m 12s
Training Results - Average Loss: 11956.0953 | MSE: 12019.2764 | MAE: 16.1119
Evaluating Results - Average Loss: 8137.8046 | MSE: 7924.3320 | MAE: 11.7711



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 04 | Time: 0m 12s
Training Results - Average Loss: 8891.9603 | MSE: 8928.1777 | MAE: 10.7760
Evaluating Results - Average Loss: 6638.1710 | MSE: 6473.6597 | MAE: 11.6387

Epoch     4: reducing learning rate of group 0 to 3.0000e-04.


HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 05 | Time: 0m 11s
Training Results - Average Loss: 7997.4179 | MSE: 8030.5894 | MAE: 10.8807
Evaluating Results - Average Loss: 6484.9968 | MSE: 6321.9268 | MAE: 10.0076



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 06 | Time: 0m 11s
Training Results - Average Loss: 7875.3441 | MSE: 7911.5928 | MAE: 9.9642
Evaluating Results - Average Loss: 6342.5184 | MSE: 6168.1055 | MAE: 10.1175



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 07 | Time: 0m 12s
Training Results - Average Loss: 8009.7163 | MSE: 7990.1860 | MAE: 10.3109
Evaluating Results - Average Loss: 6270.2331 | MSE: 6109.5000 | MAE: 9.8203

Epoch     7: reducing learning rate of group 0 to 9.0000e-05.


HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 08 | Time: 0m 10s
Training Results - Average Loss: 7628.4672 | MSE: 7683.3716 | MAE: 9.9551
Evaluating Results - Average Loss: 6256.3482 | MSE: 6101.6665 | MAE: 10.0127



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 09 | Time: 0m 11s
Training Results - Average Loss: 7812.0286 | MSE: 7708.9785 | MAE: 10.1901
Evaluating Results - Average Loss: 6244.2009 | MSE: 6098.5850 | MAE: 10.0574



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 10 | Time: 0m 11s
Training Results - Average Loss: 7699.7818 | MSE: 7711.5449 | MAE: 10.1669
Evaluating Results - Average Loss: 6226.5573 | MSE: 6080.8730 | MAE: 9.9568

Epoch    10: reducing learning rate of group 0 to 2.7000e-05.


HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 11 | Time: 0m 12s
Training Results - Average Loss: 7629.6264 | MSE: 7654.8965 | MAE: 10.0376
Evaluating Results - Average Loss: 6217.6678 | MSE: 6070.0161 | MAE: 9.9825



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 12 | Time: 0m 11s
Training Results - Average Loss: 7592.0667 | MSE: 7646.8628 | MAE: 10.0799
Evaluating Results - Average Loss: 6217.2080 | MSE: 6073.6304 | MAE: 10.0388



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 13 | Time: 0m 11s
Training Results - Average Loss: 7594.9610 | MSE: 7649.6030 | MAE: 10.1038
Evaluating Results - Average Loss: 6213.7713 | MSE: 6071.5845 | MAE: 10.0737

Epoch    13: reducing learning rate of group 0 to 8.1000e-06.


HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 14 | Time: 0m 11s
Training Results - Average Loss: 9047.3572 | MSE: 7639.0806 | MAE: 10.1394
Evaluating Results - Average Loss: 6213.8863 | MSE: 6073.2300 | MAE: 10.1040



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 15 | Time: 0m 11s
Training Results - Average Loss: 7585.2035 | MSE: 7639.1748 | MAE: 10.1694
Evaluating Results - Average Loss: 6214.0435 | MSE: 6074.6450 | MAE: 10.1110



HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 16 | Time: 0m 12s
Training Results - Average Loss: 7577.0973 | MSE: 7637.3984 | MAE: 10.1740
Evaluating Results - Average Loss: 6212.3140 | MSE: 6073.0132 | MAE: 10.1096

Epoch    16: reducing learning rate of group 0 to 2.4300e-06.


HBox(children=(FloatProgress(value=0.0, description='Train', max=67.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=23.0, style=ProgressStyle(description_width='…


Epoch: 17 | Time: 0m 11s
Training Results - Average Loss: 7585.5283 | MSE: 7635.5894 | MAE: 10.1635
Evaluating Results - Average Loss: 6212.4488 | MSE: 6073.2061 | MAE: 10.1072

EarlyStopping



In [16]:
trainer = Trainer(model, train_iterator, valid_iterator, criterion, optimizer, scheduler, config, "08-multymodal-reg")
model.load_state_dict(torch.load(f'{config.save_dirname}/08-multymodal-reg.pth'))
trainer.set_model(model)
trainer.evaluate(train_iterator)

HBox(children=(FloatProgress(value=0.0, description='Valid', max=67.0, style=ProgressStyle(description_width='…


Epoch: evaluate | Time: 0m 4s
Evaluating Results - Average Loss: 7602.1846 | MSE: 7633.9058 | MAE: 10.1243

