In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

import copy
import traceback
import datetime
import joblib
import re
import os
import random
import string
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pymorphy2
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
class ConfigExperiment:
    seed = 42
    positive_file = "../data/positive.csv"
    negative_file = "../data/negative.csv"
    russian_stop_words = "../data/russian_stop_words.txt"
    english_stop_words = "../data/english_stop_words.txt"
    test_size = 0.3
    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size = 2048
    num_epochs = 50
    lr = 1e-2
    num_workers = 0
    patience = 3
    early_stopping_delta = 1e-4
    save_dirname = "models"
    
config = ConfigExperiment()

In [4]:
def init_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic=True
    
init_random_seed(config.seed)

In [5]:
df = pd.read_csv("../data/preprocessed_text_v1.csv", index_col=False)

In [6]:
df.head()

Unnamed: 0,message,ttype
0,работа полный пиддес каждый закрытие месяц сви...,0
1,коллега сидеть рубиться urban terror долбать в...,0
2,говорят обещаной год ждать,0
3,желать хороший полёт удачный посадка быть очен...,0
4,обновить какой леший surf работать простоплеер,0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['message'].values.astype('U'), df['ttype'], random_state=config.seed, test_size=config.test_size)

In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((158783,), (68051,), (158783,), (68051,))

In [9]:
X_train = X_train.tolist()
X_test = X_test.tolist()
y_train = y_train.tolist()
y_test = y_test.tolist()

In [10]:
X_train[:10]

['luna самый самый любимый рождественский песенка год',
 'скачать симс лизин диск прийтись папка картинка поудалять пофига новый накачать',
 'появиться ощущение приближаться новое год ёлка радость поставить',
 'итак получить несколько зачёт неделя спасть усердно работать приболеть',
 'мозг кипеть спин разболеться',
 'хороший мотивация мысль стареть успеть',
 'равно мой солнышко просто разнообразие должный разный называть',
 'хороший учитель найти сложно мы ментор везти',
 'оказываться такой сладкое губа мммм forever alone',
 'дыы порнососа музыка слушать пытаться сам придумать фанфик']

In [11]:
%%time

pipe = Pipeline([
    ('count', CountVectorizer(ngram_range=(1, 2), min_df=1)),
    ('tfid', TfidfTransformer(norm="l2", smooth_idf=False, use_idf=True))
]).fit(X_train)

CPU times: user 4.55 s, sys: 165 ms, total: 4.71 s
Wall time: 4.71 s


In [12]:
%%time

tfidf_train = pipe.transform(X_train)
tfidf_test = pipe.transform(X_test)

CPU times: user 4.23 s, sys: 8.97 ms, total: 4.24 s
Wall time: 4.24 s


In [13]:
class LogisticRegression(nn.Module):
    
    def __init__(self, D_in, D_out):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(D_in, D_out)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        outputs = self.linear(x)
        y_pred = self.sigmoid(outputs.view(1, -1).squeeze(0))
        return y_pred
    
model = LogisticRegression(tfidf_train.shape[1], 1)
model.to(config.device)

LogisticRegression(
  (linear): Linear(in_features=650802, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [14]:
# Подсчет количества тренеруемых параметров модели
sum(p.numel() for p in model.parameters() if p.requires_grad)

650803

In [15]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=config.lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True, mode="max", factor=0.3)

In [16]:
class FeatureDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, index):
        current_features = torch.from_numpy(self.features[index].toarray()).float()
        current_targets = torch.tensor(self.targets[index]).float()
        return current_features, current_targets
    
train_dataset = FeatureDataset(tfidf_train, y_train)
valid_dataset = FeatureDataset(tfidf_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers)
valid_dataloader = DataLoader(valid_dataset, batch_size=config.batch_size, shuffle=False, num_workers=config.num_workers)

In [17]:
class Trainer:
    def __init__(self, model, train_dataloader: DataLoader, valid_dataloader: DataLoader, criterion, optimizer, scheduler, config: ConfigExperiment):
        self.model = model
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = config.device
        self.config = config
        self.max_train_iterations = 40
#         self.max_train_iterations = len(self.train_dataloader)
        self.max_valid_iterations = 10
#         self.max_valid_iterations = len(self.valid_dataloader)
        self.train_metrics = {
            'avg_loss': [],
            'accuracy': [],
            'f1': [],
        }
        self.valid_metrics = {
            'avg_loss': [],
            'accuracy': [],
            'f1': [],
        }
        self.counter = 0
        self.delta = config.early_stopping_delta
      
    def run(self):
        self.model.to(self.device)
        best_valid_loss = float('inf')
        best_target_metric = 0

        try:
            for i_epoch in tqdm(range(self.config.num_epochs), desc='Epochs', total=config.num_epochs, position=1, leave=True):
                start_time = time.time()

                train_loss, train_outputs, train_targets = self._train()
                valid_loss, valid_outputs, valid_targets = self._evaluate()
                    
                self.train_metrics["avg_loss"].append(train_loss)
                self.train_metrics["accuracy"].append(accuracy_score(train_targets, train_outputs.round()))
                self.train_metrics["f1"].append(f1_score(train_targets, train_outputs.round(), average="macro"))
                
                self.valid_metrics["avg_loss"].append(valid_loss)
                self.valid_metrics["accuracy"].append(accuracy_score(valid_targets, valid_outputs.round()))
                self.valid_metrics["f1"].append(f1_score(valid_targets, valid_outputs.round(), average="macro"))
                
                end_time = time.time()
                epoch_mins, epoch_secs = self._epoch_time(start_time, end_time)
                self.print_progress(i_epoch, epoch_mins, epoch_secs)
                
                self.scheduler.step(self.valid_metrics["f1"][-1])
                
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    torch.save(model.state_dict(), f"{config.save_dirname}/best_model.pth")
                    
                if self.valid_metrics["f1"][-1] > best_target_metric:
                    self.counter = 0
                    best_target_metric = self.valid_metrics["f1"][-1]
                    torch.save(model.state_dict(), f"{config.save_dirname}/best_model.pth")
                else:
                    self.counter += 1
                    
                if self.counter > self.config.patience:
                    print("EarlyStopping")
                    break
        except KeyboardInterrupt:
            pass
        
        return self.train_metrics, self.valid_metrics
        
    def _train(self):
        model.train()
        epoch_loss = 0
        epoch_output = None
        epoch_target = None
        for i, (featurs, labels) in tqdm(enumerate(self.train_dataloader), desc='Train', total=self.max_train_iterations, position=2, leave=True):
            if i >= self.max_train_iterations:
                break
                
            loss_iten, outputs = self._train_process(featurs, labels)
            epoch_loss += loss_iten              

            if epoch_output is None:
                epoch_output = outputs.cpu().data
            else:
                epoch_output = torch.cat((epoch_output, outputs.cpu().data))

            if epoch_target is None:
                epoch_target = labels.cpu().data
            else:
                epoch_target = torch.cat((epoch_target, labels.cpu().data))
            
        return epoch_loss / len(self.train_dataloader), epoch_output, epoch_target
    
    def _train_process(self, featurs, labels):
        featurs = featurs.to(self.device)
        labels = labels.to(self.device)
        self.optimizer.zero_grad()
        outputs = self.model(featurs)
        loss = self.criterion(outputs, labels)
        loss.backward()
        self.optimizer.step()
        return loss.item(), outputs
            
    def _evaluate(self):
        model.eval()
        epoch_loss = 0
        epoch_output = None
        epoch_target = None
        with torch.no_grad():
            for i, (featurs, labels) in tqdm(enumerate(self.valid_dataloader), desc='Valid', total=self.max_valid_iterations, position=3, leave=True):
                if i >= self.max_valid_iterations:
                    break
                
                featurs = featurs.to(self.device)
                labels = labels.to(self.device)
                outputs = model(featurs)
                loss = criterion(outputs, labels)
                epoch_loss += loss.item()
                
                if epoch_output is None:
                    epoch_output = outputs.cpu().data
                else:
                    epoch_output = torch.cat((epoch_output, outputs.cpu().data))

                if epoch_target is None:
                    epoch_target = labels.cpu().data
                else:
                    epoch_target = torch.cat((epoch_target, labels.cpu().data))

        return epoch_loss / len(self.valid_dataloader), epoch_output, epoch_target
 
    def _epoch_time(self, start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def print_progress(self, i_epoch, epoch_mins, epoch_secs):
        i_epoch = i_epoch + 1
        print(f"Epoch: {i_epoch:02} | Time: {epoch_mins}m {epoch_secs}s")
        print("Training Results - Average Loss: {:.4f} | accuracy: {:.4f} | f1: {:.4f}"
            .format(
                self.train_metrics['avg_loss'][-1], 
                self.train_metrics['accuracy'][-1],
                self.train_metrics['f1'][-1],
            ))
        print("Evaluating Results - Average Loss: {:.4f} | accuracy: {:.4f} | f1: {:.4f}"
            .format( 
                self.valid_metrics['avg_loss'][-1],
                self.valid_metrics['accuracy'][-1],
                self.valid_metrics['f1'][-1],
            ))
        print()



In [18]:
trainer = Trainer(model, train_dataloader, valid_dataloader, criterion, optimizer, scheduler, config)
trainer.run();

HBox(children=(FloatProgress(value=0.0, description='Epochs', max=50.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 01 | Time: 2m 21s
Training Results - Average Loss: 0.3477 | accuracy: 0.6675 | f1: 0.6625
Evaluating Results - Average Loss: 0.1922 | accuracy: 0.7094 | f1: 0.7083



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 02 | Time: 2m 21s
Training Results - Average Loss: 0.3208 | accuracy: 0.8038 | f1: 0.8033
Evaluating Results - Average Loss: 0.1832 | accuracy: 0.7188 | f1: 0.7161



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 03 | Time: 2m 16s
Training Results - Average Loss: 0.2962 | accuracy: 0.8444 | f1: 0.8441
Evaluating Results - Average Loss: 0.1762 | accuracy: 0.7271 | f1: 0.7265



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 04 | Time: 2m 12s
Training Results - Average Loss: 0.2743 | accuracy: 0.8717 | f1: 0.8716
Evaluating Results - Average Loss: 0.1712 | accuracy: 0.7309 | f1: 0.7303



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 05 | Time: 2m 10s
Training Results - Average Loss: 0.2561 | accuracy: 0.8868 | f1: 0.8867
Evaluating Results - Average Loss: 0.1674 | accuracy: 0.7321 | f1: 0.7318



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 06 | Time: 2m 16s
Training Results - Average Loss: 0.2395 | accuracy: 0.9020 | f1: 0.9019
Evaluating Results - Average Loss: 0.1644 | accuracy: 0.7327 | f1: 0.7324



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 07 | Time: 2m 17s
Training Results - Average Loss: 0.2247 | accuracy: 0.9139 | f1: 0.9139
Evaluating Results - Average Loss: 0.1620 | accuracy: 0.7347 | f1: 0.7344



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 08 | Time: 2m 17s
Training Results - Average Loss: 0.2114 | accuracy: 0.9227 | f1: 0.9227
Evaluating Results - Average Loss: 0.1601 | accuracy: 0.7360 | f1: 0.7357



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 09 | Time: 2m 17s
Training Results - Average Loss: 0.2002 | accuracy: 0.9298 | f1: 0.9298
Evaluating Results - Average Loss: 0.1586 | accuracy: 0.7358 | f1: 0.7356



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 10 | Time: 2m 15s
Training Results - Average Loss: 0.1898 | accuracy: 0.9357 | f1: 0.9357
Evaluating Results - Average Loss: 0.1574 | accuracy: 0.7368 | f1: 0.7365



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 11 | Time: 2m 14s
Training Results - Average Loss: 0.1796 | accuracy: 0.9426 | f1: 0.9425
Evaluating Results - Average Loss: 0.1564 | accuracy: 0.7371 | f1: 0.7369



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 12 | Time: 2m 11s
Training Results - Average Loss: 0.1711 | accuracy: 0.9470 | f1: 0.9470
Evaluating Results - Average Loss: 0.1556 | accuracy: 0.7372 | f1: 0.7369



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 13 | Time: 2m 16s
Training Results - Average Loss: 0.1629 | accuracy: 0.9514 | f1: 0.9514
Evaluating Results - Average Loss: 0.1550 | accuracy: 0.7374 | f1: 0.7370



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 14 | Time: 2m 14s
Training Results - Average Loss: 0.1550 | accuracy: 0.9553 | f1: 0.9553
Evaluating Results - Average Loss: 0.1546 | accuracy: 0.7372 | f1: 0.7369



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 15 | Time: 2m 16s
Training Results - Average Loss: 0.1487 | accuracy: 0.9585 | f1: 0.9585
Evaluating Results - Average Loss: 0.1542 | accuracy: 0.7375 | f1: 0.7373



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 16 | Time: 2m 17s
Training Results - Average Loss: 0.1421 | accuracy: 0.9615 | f1: 0.9615
Evaluating Results - Average Loss: 0.1540 | accuracy: 0.7365 | f1: 0.7363



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 17 | Time: 2m 16s
Training Results - Average Loss: 0.1362 | accuracy: 0.9650 | f1: 0.9650
Evaluating Results - Average Loss: 0.1538 | accuracy: 0.7365 | f1: 0.7362



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 18 | Time: 2m 29s
Training Results - Average Loss: 0.1305 | accuracy: 0.9674 | f1: 0.9674
Evaluating Results - Average Loss: 0.1537 | accuracy: 0.7357 | f1: 0.7356



HBox(children=(FloatProgress(value=0.0, description='Train', max=40.0, style=ProgressStyle(description_width='…




HBox(children=(FloatProgress(value=0.0, description='Valid', max=10.0, style=ProgressStyle(description_width='…


Epoch: 19 | Time: 2m 21s
Training Results - Average Loss: 0.1254 | accuracy: 0.9690 | f1: 0.9690
Evaluating Results - Average Loss: 0.1537 | accuracy: 0.7345 | f1: 0.7344

Epoch    19: reducing learning rate of group 0 to 3.0000e-03.
EarlyStopping



In [19]:
model = LogisticRegression(tfidf_train.shape[1], 1)
# model.load_state_dict(torch.load("models/best_model.pth"))
# torch.save(model.state_dict(), "models/torch_baseline_logistic_regression.pth")
model.load_state_dict(torch.load("models/torch_baseline_logistic_regression.pth"))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

LogisticRegression(
  (linear): Linear(in_features=650802, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [20]:
results_by_batch = []
labels = []
with torch.no_grad():
    for batch_x, batch_y in tqdm(valid_dataloader, total=len(valid_dataset) / config.batch_size):
        batch_x = batch_x.to(config.device)
        batch_pred = model(batch_x)
        results_by_batch.append(batch_pred.detach().cpu().numpy())
        
y_preds = np.concatenate(results_by_batch, 0)

HBox(children=(FloatProgress(value=0.0, max=33.22802734375), HTML(value='')))




In [21]:
print('accuracy score: ',accuracy_score(y_test, y_preds.round()))
print('\n')
print('confusion matrix: \n',confusion_matrix(y_test,y_preds.round()))
print('\n')
print(classification_report(y_test, y_preds.round()))

accuracy score:  0.7323331031138411


confusion matrix: 
 [[24168  9221]
 [ 8994 25668]]


              precision    recall  f1-score   support

           0       0.73      0.72      0.73     33389
           1       0.74      0.74      0.74     34662

    accuracy                           0.73     68051
   macro avg       0.73      0.73      0.73     68051
weighted avg       0.73      0.73      0.73     68051



In [22]:
f1_score(y_test, y_preds.round(), average="macro")

0.7322029905549916