In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

import copy
import traceback
import datetime
import joblib
import re
import os
import random
import string
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pymorphy2
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
class ConfigExperiment:
    seed = 42
    positive_file = "../data/positive.csv"
    negative_file = "../data/negative.csv"
    russian_stop_words = "../data/russian_stop_words.txt"
    english_stop_words = "../data/english_stop_words.txt"
    test_size = 0.3
    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size = 2048
    num_epochs = 50
    lr = 1e-2
    num_workers = 0
    patience = 3
    early_stopping_delta = 1e-4
    save_dirname = "models"
    
config = ConfigExperiment()

In [4]:
def init_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic=True
    
init_random_seed(config.seed)

In [5]:
train = pd.read_csv("../data/train_processed_data.csv", index_col=False)
test = pd.read_csv("../data/test_processed_data.csv", index_col=False)

In [6]:
X_train, X_test = train['text'].values.tolist(), test['text'].values.tolist()
y_train, y_test = train['target'].values.tolist(), test['target'].values.tolist()

In [7]:
X_train[:10]

['оставаться самый нужный и самый близкие ) весь остальной уходить ) и я только рада ) потому что я никогда сам не понять нужный я человек или нет )',
 'такой приятный чувство , когда ты знаешь , что подарить человек и ты на уверить , что он быть рад ! : ) ) теперь ждать новый год ! : )',
 'день начинаться с лень вообще ничто делать не хотеть . даже рука шевелить ( ( ( ничто пройти . . .',
 'at_user at_user ксюша поход вплотную там суп заняться )',
 'at_user с днём рождение at_user , творческий успех ты ! )',
 '1 вопрос , ответ . ничто лишний . защищать курсач секунда ) )',
 'весь , пора отказаться от кофе , а то иначе мой ближний будущее это жёлтый зуб ( (',
 'at_user at_user я снова чувствовать как рушиться мой гениальный план . . ( ( ( рахманинааа , виза лишний нет ? : в я срочно ! ! !',
 'at_user а он не говорить кто он быть спрашивать ну или что ? просто как бы два группа сразу ( (',
 'неделя ад начаться ; ( ( как же не хотеться вставать ( (']

In [8]:
%%time

pipe = Pipeline([
    ('count', CountVectorizer(ngram_range=(1, 2), min_df=1)),
    ('tfid', TfidfTransformer(norm="l2", smooth_idf=False, use_idf=True))
]).fit(X_train)

CPU times: user 5.53 s, sys: 107 ms, total: 5.63 s
Wall time: 5.63 s


In [9]:
%%time

tfidf_train = pipe.transform(X_train)
tfidf_test = pipe.transform(X_test)

CPU times: user 4.39 s, sys: 9.6 ms, total: 4.4 s
Wall time: 4.4 s


In [10]:
class LogisticRegression(nn.Module):
    
    def __init__(self, D_in, D_out):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(D_in, D_out)

    def forward(self, x):
        outputs = self.linear(x)
        y_pred = torch.sigmoid(outputs.view(1, -1).squeeze(0))
        return y_pred
    
model = LogisticRegression(tfidf_train.shape[1], 1)
model.to(config.device)

LogisticRegression(
  (linear): Linear(in_features=679838, out_features=1, bias=True)
)

In [11]:
# Подсчет количества тренеруемых параметров модели
sum(p.numel() for p in model.parameters() if p.requires_grad)

679839

In [12]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=config.lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True, mode="max", factor=0.3)

In [13]:
class FeatureDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, index):
        current_features = torch.from_numpy(self.features[index].toarray()).float()
        current_targets = torch.tensor(self.targets[index]).float()
        return current_features, current_targets
    
train_dataset = FeatureDataset(tfidf_train, y_train)
valid_dataset = FeatureDataset(tfidf_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers)
valid_dataloader = DataLoader(valid_dataset, batch_size=config.batch_size, shuffle=False, num_workers=config.num_workers)

In [14]:
class Trainer:
    def __init__(self, model, train_dataloader: DataLoader, valid_dataloader: DataLoader, criterion, optimizer, scheduler, config: ConfigExperiment):
        self.model = model
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = config.device
        self.config = config
        self.max_train_iterations = 40
#         self.max_train_iterations = len(self.train_dataloader)
        self.max_valid_iterations = 10
#         self.max_valid_iterations = len(self.valid_dataloader)
        self.train_metrics = {
            'avg_loss': [],
            'accuracy': [],
            'f1': [],
        }
        self.valid_metrics = {
            'avg_loss': [],
            'accuracy': [],
            'f1': [],
        }
        self.counter = 0
        self.delta = config.early_stopping_delta
      
    def run(self):
        self.model.to(self.device)
        best_valid_loss = float('inf')
        best_target_metric = 0

        try:
            for i_epoch in tqdm(range(self.config.num_epochs), desc='Epochs', total=config.num_epochs, position=1, leave=True):
                start_time = time.time()

                train_loss, train_outputs, train_targets = self._train()
                valid_loss, valid_outputs, valid_targets = self._evaluate()
                    
                self.train_metrics["avg_loss"].append(train_loss)
                self.train_metrics["accuracy"].append(accuracy_score(train_targets, train_outputs.round()))
                self.train_metrics["f1"].append(f1_score(train_targets, train_outputs.round(), average="macro"))
                
                self.valid_metrics["avg_loss"].append(valid_loss)
                self.valid_metrics["accuracy"].append(accuracy_score(valid_targets, valid_outputs.round()))
                self.valid_metrics["f1"].append(f1_score(valid_targets, valid_outputs.round(), average="macro"))
                
                end_time = time.time()
                epoch_mins, epoch_secs = self._epoch_time(start_time, end_time)
                self.print_progress(i_epoch, epoch_mins, epoch_secs)
                
                self.scheduler.step(self.valid_metrics["f1"][-1])
                
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    torch.save(model.state_dict(), f"{config.save_dirname}/best_model.pth")
                    
                if self.valid_metrics["f1"][-1] > best_target_metric:
                    self.counter = 0
                    best_target_metric = self.valid_metrics["f1"][-1]
                    torch.save(model.state_dict(), f"{config.save_dirname}/best_model.pth")
                else:
                    self.counter += 1
                    
                if self.counter > self.config.patience:
                    print("EarlyStopping")
                    break
        except KeyboardInterrupt:
            pass
        
        return self.train_metrics, self.valid_metrics
        
    def _train(self):
        model.train()
        epoch_loss = 0
        epoch_output = None
        epoch_target = None
        for i, (featurs, labels) in tqdm(enumerate(self.train_dataloader), desc='Train', total=self.max_train_iterations, position=2, leave=True):
            if i >= self.max_train_iterations:
                break
                
            loss_iten, outputs = self._train_process(featurs, labels)
            epoch_loss += loss_iten              

            if epoch_output is None:
                epoch_output = outputs.cpu().data
            else:
                epoch_output = torch.cat((epoch_output, outputs.cpu().data))

            if epoch_target is None:
                epoch_target = labels.cpu().data
            else:
                epoch_target = torch.cat((epoch_target, labels.cpu().data))
            
        return epoch_loss / len(self.train_dataloader), epoch_output, epoch_target
    
    def _train_process(self, featurs, labels):
        featurs = featurs.to(self.device)
        labels = labels.to(self.device)
        self.optimizer.zero_grad()
        outputs = self.model(featurs)
        loss = self.criterion(outputs, labels)
        loss.backward()
        self.optimizer.step()
        return loss.item(), outputs
            
    def _evaluate(self):
        model.eval()
        epoch_loss = 0
        epoch_output = None
        epoch_target = None
        with torch.no_grad():
            for i, (featurs, labels) in tqdm(enumerate(self.valid_dataloader), desc='Valid', total=self.max_valid_iterations, position=3, leave=True):
                if i >= self.max_valid_iterations:
                    break
                
                featurs = featurs.to(self.device)
                labels = labels.to(self.device)
                outputs = model(featurs)
                loss = criterion(outputs, labels)
                epoch_loss += loss.item()
                
                if epoch_output is None:
                    epoch_output = outputs.cpu().data
                else:
                    epoch_output = torch.cat((epoch_output, outputs.cpu().data))

                if epoch_target is None:
                    epoch_target = labels.cpu().data
                else:
                    epoch_target = torch.cat((epoch_target, labels.cpu().data))

        return epoch_loss / len(self.valid_dataloader), epoch_output, epoch_target
 
    def _epoch_time(self, start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def print_progress(self, i_epoch, epoch_mins, epoch_secs):
        i_epoch = i_epoch + 1
        print(f"Epoch: {i_epoch:02} | Time: {epoch_mins}m {epoch_secs}s")
        print("Training Results - Average Loss: {:.4f} | accuracy: {:.4f} | f1: {:.4f}"
            .format(
                self.train_metrics['avg_loss'][-1], 
                self.train_metrics['accuracy'][-1],
                self.train_metrics['f1'][-1],
            ))
        print("Evaluating Results - Average Loss: {:.4f} | accuracy: {:.4f} | f1: {:.4f}"
            .format( 
                self.valid_metrics['avg_loss'][-1],
                self.valid_metrics['accuracy'][-1],
                self.valid_metrics['f1'][-1],
            ))
        print()



In [15]:
trainer = Trainer(model, train_dataloader, valid_dataloader, criterion, optimizer, scheduler, config)
trainer.run();

HBox(children=(FloatProgress(value=0.0, description='Epochs', max=50.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 01 | Time: 2m 38s
Training Results - Average Loss: 0.4001 | accuracy: 0.6874 | f1: 0.6847
Evaluating Results - Average Loss: 0.2783 | accuracy: 0.7256 | f1: 0.7241



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 02 | Time: 2m 44s
Training Results - Average Loss: 0.3584 | accuracy: 0.8204 | f1: 0.8198
Evaluating Results - Average Loss: 0.2617 | accuracy: 0.7372 | f1: 0.7369



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 03 | Time: 2m 44s
Training Results - Average Loss: 0.3237 | accuracy: 0.8574 | f1: 0.8572
Evaluating Results - Average Loss: 0.2501 | accuracy: 0.7419 | f1: 0.7415



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 04 | Time: 2m 44s
Training Results - Average Loss: 0.2950 | accuracy: 0.8800 | f1: 0.8798
Evaluating Results - Average Loss: 0.2416 | accuracy: 0.7471 | f1: 0.7469



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 05 | Time: 2m 41s
Training Results - Average Loss: 0.2710 | accuracy: 0.8967 | f1: 0.8966
Evaluating Results - Average Loss: 0.2353 | accuracy: 0.7507 | f1: 0.7506



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 06 | Time: 2m 38s
Training Results - Average Loss: 0.2502 | accuracy: 0.9118 | f1: 0.9118
Evaluating Results - Average Loss: 0.2305 | accuracy: 0.7536 | f1: 0.7535



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 07 | Time: 2m 38s
Training Results - Average Loss: 0.2331 | accuracy: 0.9222 | f1: 0.9222
Evaluating Results - Average Loss: 0.2267 | accuracy: 0.7553 | f1: 0.7553



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 08 | Time: 2m 33s
Training Results - Average Loss: 0.2168 | accuracy: 0.9333 | f1: 0.9332
Evaluating Results - Average Loss: 0.2237 | accuracy: 0.7572 | f1: 0.7572



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 09 | Time: 2m 39s
Training Results - Average Loss: 0.2036 | accuracy: 0.9389 | f1: 0.9389
Evaluating Results - Average Loss: 0.2213 | accuracy: 0.7580 | f1: 0.7579



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 10 | Time: 2m 41s
Training Results - Average Loss: 0.1908 | accuracy: 0.9472 | f1: 0.9472
Evaluating Results - Average Loss: 0.2193 | accuracy: 0.7576 | f1: 0.7575



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 11 | Time: 2m 35s
Training Results - Average Loss: 0.1793 | accuracy: 0.9528 | f1: 0.9528
Evaluating Results - Average Loss: 0.2177 | accuracy: 0.7592 | f1: 0.7591



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 12 | Time: 2m 42s
Training Results - Average Loss: 0.1695 | accuracy: 0.9566 | f1: 0.9566
Evaluating Results - Average Loss: 0.2164 | accuracy: 0.7598 | f1: 0.7597



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 13 | Time: 2m 44s
Training Results - Average Loss: 0.1602 | accuracy: 0.9618 | f1: 0.9618
Evaluating Results - Average Loss: 0.2154 | accuracy: 0.7602 | f1: 0.7600



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 14 | Time: 2m 42s
Training Results - Average Loss: 0.1521 | accuracy: 0.9646 | f1: 0.9646
Evaluating Results - Average Loss: 0.2145 | accuracy: 0.7600 | f1: 0.7599



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 15 | Time: 2m 46s
Training Results - Average Loss: 0.1442 | accuracy: 0.9682 | f1: 0.9682
Evaluating Results - Average Loss: 0.2139 | accuracy: 0.7605 | f1: 0.7604



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 16 | Time: 2m 48s
Training Results - Average Loss: 0.1370 | accuracy: 0.9711 | f1: 0.9711
Evaluating Results - Average Loss: 0.2134 | accuracy: 0.7599 | f1: 0.7597



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 17 | Time: 2m 40s
Training Results - Average Loss: 0.1306 | accuracy: 0.9734 | f1: 0.9734
Evaluating Results - Average Loss: 0.2130 | accuracy: 0.7583 | f1: 0.7582



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 18 | Time: 2m 41s
Training Results - Average Loss: 0.1249 | accuracy: 0.9752 | f1: 0.9751
Evaluating Results - Average Loss: 0.2127 | accuracy: 0.7579 | f1: 0.7578



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 19 | Time: 2m 39s
Training Results - Average Loss: 0.1186 | accuracy: 0.9779 | f1: 0.9778
Evaluating Results - Average Loss: 0.2125 | accuracy: 0.7580 | f1: 0.7579

Epoch    19: reducing learning rate of group 0 to 3.0000e-03.
EarlyStopping



In [21]:
model = LogisticRegression(tfidf_train.shape[1], 1)
model.load_state_dict(torch.load("models/best_model.pth"))
torch.save(model.state_dict(), "models/03_pytorch_pipeline_linear_classifier_v2.pth")
model.load_state_dict(torch.load("models/03_pytorch_pipeline_linear_classifier_v2.pth"))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

LogisticRegression(
  (linear): Linear(in_features=650802, out_features=1, bias=True)
)

In [22]:
results_by_batch = []
labels = []
with torch.no_grad():
    for batch_x, batch_y in tqdm(valid_dataloader, total=len(valid_dataset) / config.batch_size):
        batch_x = batch_x.to(config.device)
        batch_pred = model(batch_x)
        results_by_batch.append(batch_pred.detach().cpu().numpy())
        
y_preds = np.concatenate(results_by_batch, 0)

HBox(children=(FloatProgress(value=0.0, max=33.22802734375), HTML(value='')))




In [23]:
print('accuracy score: ',accuracy_score(y_test, y_preds.round()))
print('\n')
print('confusion matrix: \n',confusion_matrix(y_test,y_preds.round()))
print('\n')
print(classification_report(y_test, y_preds.round()))

accuracy score:  0.7316424446371104


confusion matrix: 
 [[23966  9423]
 [ 8839 25823]]


              precision    recall  f1-score   support

           0       0.73      0.72      0.72     33389
           1       0.73      0.74      0.74     34662

    accuracy                           0.73     68051
   macro avg       0.73      0.73      0.73     68051
weighted avg       0.73      0.73      0.73     68051



In [24]:
f1_score(y_test, y_preds.round(), average="macro")

0.731442462073553