# Начальная инициализация

In [1]:
import datetime
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

26-May-2023 18:38:09


In [2]:
# Подключение библиотек

# !pip install torch torchvision torchaudio
# !pip install lightning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
import lightning.pytorch as pl
import lightning as L

# !pip install scikit-learn
from sklearn.model_selection import ParameterGrid

import torch.optim as optim
# !pip install pandas
import pandas as pd
import numpy as np


import os
import random

In [3]:
# Подключение вычислений на видеокарту, если доступна
if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"
device = torch.device(dev)
print(device)

cuda:0


In [4]:
cpu_count = os.cpu_count()

num_workers = cpu_count if device == torch.device("cpu") else 0
num_workers, cpu_count

(0, 4)

In [5]:
random_state = 42

torch.use_deterministic_algorithms(True)

%env CUBLAS_WORKSPACE_CONFIG=:4096:8
%env PYTHONHASHSEED=42

def set_random_state(random_state):
    torch.manual_seed(random_state)
    random.seed(random_state)
    np.random.seed(random_state)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(random_state)
        torch.cuda.manual_seed(random_state)

        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        
set_random_state(random_state)

pl.seed_everything(random_state, workers=True)

env: CUBLAS_WORKSPACE_CONFIG=:4096:8
env: PYTHONHASHSEED=42


Global seed set to 42


42

In [2]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

26-May-2023 18:38:14


# Обучение нейронных сетей

In [7]:
import pickle
PATH = 'dumps/'
# with open(PATH + 'X_train_ft_pre_trained.pkl', mode='rb') as file:
    # X_train_ft_pre_trained = pickle.load(file)

# with open(PATH + 'X_train_w2v_pre_trained.pkl', mode='rb') as file:
    # X_train_w2v_pre_trained = pickle.load(file)

# with open(PATH + 'X_test_w2v_pre_trained.pkl', mode='rb') as file:
    # X_test_w2v_pre_trained = pickle.load(file)

# with open(PATH + 'X_test_w2v_self_trained.pkl', mode='rb') as file:
    # X_test_w2v_self_trained = pickle.load(file)

with open(PATH + 'WordsIdPad_train.pkl', mode='rb') as file:
    WordsIdPad_train = pickle.load(file)

# with open(PATH + 'X_test_ft_pre_trained.pkl', mode='rb') as file:
    # X_test_ft_pre_trained = pickle.load(file)

# with open(PATH + 'X_train_ft_self_trained.pkl', mode='rb') as file:
    # X_train_ft_self_trained = pickle.load(file)

with open(PATH + 'WordsIdPad_test.pkl', mode='rb') as file:
    WordsIdPad_test = pickle.load(file)

# with open(PATH + 'X_test_ft_self_trained.pkl', mode='rb') as file:
    # X_test_ft_self_trained = pickle.load(file)

# with open(PATH + 'X_train_w2v_self_trained.pkl', mode='rb') as file:
    # X_train_w2v_self_trained = pickle.load(file)

with open(PATH + 'y_train.pkl', mode='rb') as file:
    y_train = pickle.load(file)

with open(PATH + 'y_test.pkl', mode='rb') as file:
    y_test = pickle.load(file)



In [8]:
param_grid = {
    # "embeddings": ['w2v_pretrained', 'ft_pretrained', 'w2v_selftrained', 'ft_selftrained', 'torch.nn'],
    "activation_fn": [F.relu],
    "learning_rate": [0.001],
    "epochs": [5, 10, 20, 50],
    "batch_size": [1024, 2048],
    "optimizer": [optim.AdamW],
    "layers_count": [i for i in range(1, 6)],
    "type_rnn": [torch.nn.RNN, torch.nn.LSTM, torch.nn.GRU],
    "hidden_size": [100, 300],
    "bidirectional": [False, True],
}

params_list = ParameterGrid(param_grid)
len(params_list)

480

In [9]:
from sklearn.model_selection import train_test_split

class NetData(pl.LightningDataModule):
    def __init__(self, train_features=None, test_features=None, train_targets=None, test_targets=None, batch_size=None, random_state=None):
        super().__init__()
        
        self.batch_size = batch_size
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(train_features, train_targets, random_state=random_state)
        self.X_test, self.y_test = test_features, test_targets

    def setup(self, stage=None):
        features_train = torch.tensor(self.X_train, dtype=torch.int32)
        targets_train = torch.tensor(self.y_train, device=device, dtype=torch.int32)
    
        features_val = torch.tensor(self.X_val, dtype=torch.int32)
        targets_val = torch.tensor(self.y_val, device=device, dtype=torch.int32)
    
        features_test = torch.tensor(self.X_test, dtype=torch.int32)
        targets_test = torch.tensor(self.y_test, dtype=torch.int32)
    
        self.trainset = data_utils.TensorDataset(features_train, targets_train)
        self.valset = data_utils.TensorDataset(features_val, targets_val)
        self.testset = data_utils.TensorDataset(features_test, targets_test)
        
    def train_dataloader(self):
        return data_utils.DataLoader(self.trainset, batch_size=self.batch_size)
    def val_dataloader(self):
        return data_utils.DataLoader(self.valset, batch_size=self.batch_size)
    def test_dataloader(self):
        return data_utils.DataLoader(self.testset, batch_size=self.batch_size)

In [10]:
from torchmetrics.classification import F1Score

class Net(pl.LightningModule):
    def __init__(self,
                 layers_count=None, 
                 activation_function=None,
                 type_rnn=None,
                 hidden_size=None,
                 bidirectional=None,
                 optimizer=None,
                 learningRate=None
                ):
        
        super().__init__()

        
        self.embedding_size = 300
        self.num_words = 10000
        self.seq_len = 20
        self.optim = optimizer
        self.learningRate = learningRate

        self.f1 = F1Score(task='multiclass', num_classes=2, multidim_average='global', average='weighted')
        
        self.embedding = nn.Embedding(self.num_words + 1, self.embedding_size, padding_idx=0)
        
        self.fa = activation_function
        
        self.rnn = type_rnn(
                input_size = self.embedding_size,
                hidden_size = hidden_size,
                num_layers = layers_count,
                bidirectional = bidirectional,
                batch_first = True
        )
        
        self.fc = nn.Linear(hidden_size * self.seq_len * (1+bidirectional), 2)
        
        

    def forward(self, x):
        x = self.embedding(x)
        x = x.reshape(len(x), self.seq_len, self.embedding_size)
        
        x, size = self.rnn(x)
        
        x = self.fc(x.reshape(x.shape[0], -1))
        
        return F.log_softmax(x, dim=-1)
    
    def configure_optimizers(self):
        optimizer = self.optim(self.parameters(), lr=self.learningRate)
        return optimizer
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y.long())
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        pred = self(x)
        self.log("val_f1", self.f1(pred, y), on_epoch=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        pred = self(x)
        self.log("test_f1", self.f1(pred, y), on_epoch=True)

In [11]:
from IPython.display import clear_output
from tqdm import tqdm
total = []

for params in tqdm(params_list):
    
    clear_output(wait=True)  

    learningRate = params['learning_rate']
    optimizer_type = params['optimizer']
    layers_count = params['layers_count']
    batch_size = params['batch_size']
    epochs = params['epochs']
    fa = params['activation_fn']
    type_rnn = params['type_rnn']
    hidden_size = params['hidden_size']
    bidirectional = params['bidirectional']


    net = Net(
        layers_count=layers_count,
        activation_function=fa,
        type_rnn=type_rnn,
        hidden_size=hidden_size,
        bidirectional=bidirectional,
        optimizer=optimizer_type,
        learningRate=learningRate
        )
    dm = NetData(
        train_features=WordsIdPad_train,
        test_features=WordsIdPad_test,
        train_targets=y_train,
        test_targets=y_test,
        batch_size=batch_size,
        random_state=random_state
    )
    trainer = L.Trainer(logger=False, max_epochs=epochs, enable_progress_bar=True, deterministic=True, inference_mode=True, enable_checkpointing=False)
    trainer.fit(net, datamodule=dm)
    f1_val = trainer.validate(net, datamodule=dm)[-1]['val_f1']
    f1_test = trainer.test(net, datamodule=dm)[-1]['test_f1']
    
    total.append({
        'Type RNN': type_rnn.__name__,
        'hidden_size': hidden_size,
        'optimizer': optimizer_type.__name__,
        'batch_size': batch_size,
        'bidirectional': bidirectional,
        'Количество слоев': layers_count,
        'Количество эпох': epochs,
        'F1-val': round(f1_val, 4),
        'F1-test': round(f1_test, 4)
    })



GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type              | Params
------------------------------------------------
0 | f1        | MulticlassF1Score | 0     
1 | embedding | Embedding         | 3.0 M 
2 | rnn       | GRU               | 7.6 M 
3 | fc        | Linear            | 24.0 K
------------------------------------------------
10.6 M    Trainable params
0         Non-trainable params
10.6 M    Total params
42.409    Total estimated model params size (MB)


Epoch 49: 100%|██████████| 9/9 [00:04<00:00,  1.86it/s]                    

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 9/9 [00:04<00:00,  1.86it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation DataLoader 0: 100%|██████████| 3/3 [00:00<00:00,  6.97it/s]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 84.36it/s]


100%|██████████| 480/480 [5:23:02<00:00, 40.38s/it] 


In [3]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

26-May-2023 18:38:17


# Сводная таблица

In [13]:
pd.set_option('display.max_rows', None)
summary = pd.DataFrame.from_dict(total)
summary_sort = summary.sort_values(by='F1-val', ascending=False)
summary_sort

Unnamed: 0,Type RNN,hidden_size,optimizer,batch_size,bidirectional,Количество слоев,Количество эпох,F1-val,F1-test
130,LSTM,100,AdamW,1024,True,4,5,0.7312,0.713
1,LSTM,100,AdamW,1024,False,1,5,0.7312,0.7729
400,LSTM,100,AdamW,2048,True,4,10,0.7303,0.6785
10,LSTM,100,AdamW,1024,False,4,5,0.7298,0.7759
133,LSTM,100,AdamW,1024,True,5,5,0.7296,0.7186
4,LSTM,100,AdamW,1024,False,2,5,0.7294,0.7186
478,LSTM,300,AdamW,2048,True,5,50,0.7286,0.7759
127,LSTM,100,AdamW,1024,True,3,5,0.7282,0.7316
148,LSTM,300,AdamW,1024,True,5,5,0.7279,0.76
262,LSTM,300,AdamW,2048,False,3,5,0.7278,0.7204
