In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

import copy
import traceback
import datetime
import joblib
import re
import os
import random
import string
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torchtext import data
from torchtext import datasets
from torchtext import vocab
from torchtext.vocab import Vectors, GloVe

from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pymorphy2
from sklearn.base import BaseEstimator, TransformerMixin

import gensim

In [3]:
class ConfigExperiment:
    seed = 42
    positive_file = "../data/positive.csv"
    negative_file = "../data/negative.csv"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embed_dim = 300
    max_vocab_size = 50_000
    batch_size = 256
    num_epochs = 30
    lr = 1e-2
    num_workers = 0
    patience = 5
    early_stopping_delta = 1e-4
    save_dirname = "models"
    
config = ConfigExperiment()

In [4]:
def init_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic=True
    
init_random_seed(config.seed)

In [5]:
train = pd.read_csv("../data/train_processed_data.csv", index_col=False)
validate = pd.read_csv("../data/validate_processed_data.csv", index_col=False)
test = pd.read_csv("../data/test_processed_data.csv", index_col=False)

tokenize = lambda x: str(x).split()
TEXT = data.Field(sequential=True, tokenize=tokenize, batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False, dtype=torch.float)

fields = [('text',TEXT), ('label', LABEL)]

train_data, valid_data, test_data = data.TabularDataset.splits(
    path="../data/",
    train="train_processed_data.csv",
    validation="validate_processed_data.csv",
    test="test_processed_data.csv",
    format="csv",
    fields=fields,
    skip_header=True)

TEXT.build_vocab(train_data, min_freq=2)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key = lambda x: x.text,
    batch_size=config.batch_size,
    device=config.device)

### Скачать и рапаковать предобученные веса

In [6]:
# from zipfile import ZipFile
# import wget

# model_url = 'http://vectors.nlpl.eu/repository/11/187.zip'
# wget.download(model_url)
# with ZipFile('187.zip', 'r') as zipObj:
#    # Extract all the contents of zip file in different directory
#    zipObj.extractall('187')

### Составить матрицу предобученных весов для словаря 

In [7]:
w2v_model = gensim.models.KeyedVectors.load('187/model.model')
numpy_embeddings = np.zeros(shape=[len(TEXT.vocab), config.embed_dim],dtype=np.float32)

for word in TEXT.vocab.itos:
    vector = w2v_model.get_vector(word)
    index  = TEXT.vocab.stoi[word]
    numpy_embeddings[index] = vector
    
pretrained_embeddings = torch.Tensor(numpy_embeddings).float()
pretrained_embeddings.shape

torch.Size([33041, 300])

### Создание модели

In [8]:
class CNN1d(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels = embedding_dim, 
                      out_channels = n_filters, 
                      kernel_size = fs)
            for fs in filter_sizes
        ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.permute(0, 2, 1)
        
        #embedded = [batch size, emb dim, sent len]
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [9]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = config.embed_dim
N_FILTERS = 128
FILTER_SIZES = [2, 3]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

In [10]:
model = CNN1d(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 10,104,813 trainable parameters


### Копирование предобученных весов в модель

In [12]:
model.embedding.weight.data.copy_(pretrained_embeddings)

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [13]:
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, verbose=True, mode="max", factor=0.3)

model = model.to(config.device)
criterion = criterion.to(config.device)

In [35]:
class Trainer:
    def __init__(self, model, train_dataloader: DataLoader, valid_dataloader: DataLoader, 
                 criterion, optimizer, scheduler, config: ConfigExperiment, model_name: str):
        self.model = model
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = config.device
        self.config = config
        self.threshold = 0.5
        self.model_name = model_name
        self.train_metrics = {
            'avg_loss': [],
            'accuracy': [],
            'f1': [],
        }
        self.valid_metrics = {
            'avg_loss': [],
            'accuracy': [],
            'f1': [],
        }
        self.counter = 0
        self.delta = config.early_stopping_delta
      
    def run(self):
        self.model.to(self.device)
        best_valid_loss = float('inf')
        best_target_metric = 0

        try:
            for i_epoch in tqdm(range(self.config.num_epochs), desc='Epochs', total=config.num_epochs, position=1, leave=True):
                start_time = time.time()

                train_loss, train_outputs, train_targets = self._train()
                valid_loss, valid_outputs, valid_targets = self._evaluate()
                    
                self.train_metrics["avg_loss"].append(train_loss)
                self.train_metrics["accuracy"].append(accuracy_score(train_targets, train_outputs.round() > self.threshold))
                self.train_metrics["f1"].append(f1_score(train_targets, train_outputs.round() > self.threshold, average="macro"))
                
                self.valid_metrics["avg_loss"].append(valid_loss)
                self.valid_metrics["accuracy"].append(accuracy_score(valid_targets, valid_outputs.round() > self.threshold))
                self.valid_metrics["f1"].append(f1_score(valid_targets, valid_outputs.round() > self.threshold, average="macro"))
                
                end_time = time.time()
                epoch_mins, epoch_secs = self._epoch_time(start_time, end_time)
                self.print_progress(i_epoch, epoch_mins, epoch_secs)
                
                self.scheduler.step(self.valid_metrics["f1"][-1])
                
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    torch.save(model.state_dict(), f"{config.save_dirname}/{self.model_name}.pth")
                    
                if self.valid_metrics["f1"][-1] > best_target_metric:
                    self.counter = 0
                    best_target_metric = self.valid_metrics["f1"][-1]
                    torch.save(model.state_dict(), f"{config.save_dirname}/{self.model_name}.pth")
                else:
                    self.counter += 1
                    
                if self.counter > self.config.patience:
                    print("EarlyStopping")
                    break
        except KeyboardInterrupt:
            pass
        
        return self.train_metrics, self.valid_metrics
        
    def _train(self):
        model.train()
        epoch_loss = 0
        epoch_output = None
        epoch_target = None
        for i, batch in tqdm(enumerate(self.train_dataloader), desc='Train', total=len(self.train_dataloader), position=2, leave=True):
            loss_iten, outputs = self._train_process(batch)
            epoch_loss += loss_iten 

            if epoch_output is None:
                epoch_output = outputs.cpu().data
            else:
                epoch_output = torch.cat((epoch_output, outputs.cpu().data))

            if epoch_target is None:
                epoch_target = batch.label.cpu().data
            else:
                epoch_target = torch.cat((epoch_target, batch.label.cpu().data))
            
        return epoch_loss / len(self.train_dataloader), epoch_output, epoch_target
    
    def _train_process(self, batch):      
        self.optimizer.zero_grad()
        outputs = self.model(batch.text).squeeze(1)
        loss = self.criterion(outputs, batch.label)
        loss.backward()
        self.optimizer.step()
        return loss.item(), outputs
            
    def _evaluate(self):
        self.model.eval()
        epoch_loss = 0
        epoch_output = None
        epoch_target = None
        with torch.no_grad():
            for i, batch in tqdm(enumerate(self.valid_dataloader), desc='Valid', total=len(self.valid_dataloader), position=3, leave=True):
                outputs = self.model(batch.text).squeeze(1)
                loss = criterion(outputs, batch.label)
                epoch_loss += loss.item()
                
                if epoch_output is None:
                    epoch_output = outputs.cpu().data
                else:
                    epoch_output = torch.cat((epoch_output, outputs.cpu().data))

                if epoch_target is None:
                    epoch_target = batch.label.cpu().data
                else:
                    epoch_target = torch.cat((epoch_target, batch.label.cpu().data))

        return epoch_loss / len(self.valid_dataloader), epoch_output, epoch_target
 
    def _epoch_time(self, start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def print_progress(self, i_epoch, epoch_mins, epoch_secs):
        if type(i_epoch) != str:
            i_epoch = i_epoch + 1
            print(f"Epoch: {i_epoch:02} | Time: {epoch_mins}m {epoch_secs}s")
            print("Training Results - Average Loss: {:.4f} | accuracy: {:.4f} | f1: {:.4f}"
                .format(
                    self.train_metrics['avg_loss'][-1], 
                    self.train_metrics['accuracy'][-1],
                    self.train_metrics['f1'][-1],
                ))      
        else:
            print(f"Epoch: {i_epoch} | Time: {epoch_mins}m {epoch_secs}s")
        print("Evaluating Results - Average Loss: {:.4f} | accuracy: {:.4f} | f1: {:.4f}"
            .format( 
                self.valid_metrics['avg_loss'][-1],
                self.valid_metrics['accuracy'][-1],
                self.valid_metrics['f1'][-1],
            ))
        print()

    def set_model(self, model: nn.Module):
        self.model = model
        
    def evaluate(self, dataloader: DataLoader):
        self.valid_dataloader = dataloader
        self.model.to(self.device)
        start_time = time.time()

        valid_loss, valid_outputs, valid_targets = self._evaluate()

        self.valid_metrics["avg_loss"].append(valid_loss)
        self.valid_metrics["accuracy"].append(accuracy_score(valid_targets, valid_outputs.round() > self.threshold))
        self.valid_metrics["f1"].append(f1_score(valid_targets, valid_outputs.round() > self.threshold, average="macro"))

        end_time = time.time()
        epoch_mins, epoch_secs = self._epoch_time(start_time, end_time)
        self.print_progress("evaluate", epoch_mins, epoch_secs)


In [15]:
trainer = Trainer(model, train_iterator, valid_iterator, criterion, optimizer, scheduler, config, "05-conv1d-clf")
trainer.run();

HBox(children=(FloatProgress(value=0.0, description='Epochs', max=30.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 01 | Time: 0m 10s
Training Results - Average Loss: 0.2643 | accuracy: 0.8613 | f1: 0.8611
Evaluating Results - Average Loss: 0.0206 | accuracy: 0.9913 | f1: 0.9913



HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 02 | Time: 0m 10s
Training Results - Average Loss: 0.0203 | accuracy: 0.9915 | f1: 0.9915
Evaluating Results - Average Loss: 0.0158 | accuracy: 0.9933 | f1: 0.9933



HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 03 | Time: 0m 10s
Training Results - Average Loss: 0.0147 | accuracy: 0.9936 | f1: 0.9936
Evaluating Results - Average Loss: 0.0161 | accuracy: 0.9935 | f1: 0.9935



HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 04 | Time: 0m 10s
Training Results - Average Loss: 0.0124 | accuracy: 0.9949 | f1: 0.9949
Evaluating Results - Average Loss: 0.0166 | accuracy: 0.9937 | f1: 0.9937



HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 05 | Time: 0m 10s
Training Results - Average Loss: 0.0098 | accuracy: 0.9961 | f1: 0.9961
Evaluating Results - Average Loss: 0.0232 | accuracy: 0.9927 | f1: 0.9927



HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 06 | Time: 0m 10s
Training Results - Average Loss: 0.0088 | accuracy: 0.9966 | f1: 0.9966
Evaluating Results - Average Loss: 0.0175 | accuracy: 0.9938 | f1: 0.9938



HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 07 | Time: 0m 10s
Training Results - Average Loss: 0.0073 | accuracy: 0.9972 | f1: 0.9972
Evaluating Results - Average Loss: 0.0181 | accuracy: 0.9940 | f1: 0.9940



HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 08 | Time: 0m 10s
Training Results - Average Loss: 0.0068 | accuracy: 0.9975 | f1: 0.9975
Evaluating Results - Average Loss: 0.0192 | accuracy: 0.9938 | f1: 0.9938



HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 09 | Time: 0m 10s
Training Results - Average Loss: 0.0059 | accuracy: 0.9979 | f1: 0.9979
Evaluating Results - Average Loss: 0.0213 | accuracy: 0.9941 | f1: 0.9941



HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 10 | Time: 0m 10s
Training Results - Average Loss: 0.0051 | accuracy: 0.9983 | f1: 0.9983
Evaluating Results - Average Loss: 0.0230 | accuracy: 0.9941 | f1: 0.9941

Epoch    10: reducing learning rate of group 0 to 3.0000e-04.


HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 11 | Time: 0m 10s
Training Results - Average Loss: 0.0032 | accuracy: 0.9989 | f1: 0.9989
Evaluating Results - Average Loss: 0.0229 | accuracy: 0.9939 | f1: 0.9939



HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 12 | Time: 0m 10s
Training Results - Average Loss: 0.0023 | accuracy: 0.9993 | f1: 0.9993
Evaluating Results - Average Loss: 0.0225 | accuracy: 0.9941 | f1: 0.9941



HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 13 | Time: 0m 10s
Training Results - Average Loss: 0.0019 | accuracy: 0.9994 | f1: 0.9994
Evaluating Results - Average Loss: 0.0244 | accuracy: 0.9939 | f1: 0.9939

Epoch    13: reducing learning rate of group 0 to 9.0000e-05.


HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 14 | Time: 0m 10s
Training Results - Average Loss: 0.0016 | accuracy: 0.9995 | f1: 0.9995
Evaluating Results - Average Loss: 0.0236 | accuracy: 0.9939 | f1: 0.9939



HBox(children=(FloatProgress(value=0.0, description='Train', max=532.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=178.0, style=ProgressStyle(description_width=…


Epoch: 15 | Time: 0m 10s
Training Results - Average Loss: 0.0013 | accuracy: 0.9996 | f1: 0.9996
Evaluating Results - Average Loss: 0.0243 | accuracy: 0.9939 | f1: 0.9939

EarlyStopping



In [36]:
trainer = Trainer(model, train_iterator, valid_iterator, criterion, optimizer, scheduler, config, "05-conv1d-clf")
model.load_state_dict(torch.load(f'{config.save_dirname}/05-conv1d-clf.pth'))
trainer.set_model(model)
trainer.evaluate(train_iterator)

HBox(children=(FloatProgress(value=0.0, description='Valid', max=532.0, style=ProgressStyle(description_width=…


Epoch: evaluate | Time: 0m 3s
Evaluating Results - Average Loss: 0.0020 | accuracy: 0.9993 | f1: 0.9993

