In [1]:
import numpy as np
from tqdm.notebook import tqdm
from glob import glob
import nltk

from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel

import torch
from torch.utils.data import Dataset, DataLoader
import torch.functional as F
from torch import nn
import torchmetrics
import pytorch_lightning as pl

from warnings import filterwarnings
filterwarnings("ignore")

## Data

In [2]:
from model import CustomDataset

In [3]:
# Load tokenizer
tokenizer_bert = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
tokenizer_lstm = nltk.RegexpTokenizer(r"[а-я]+|<unk>|<pad>")

# Load data
train_data = glob("data/augmentations/train/*.npy")
val_data = glob("data/augmentations/val/*.npy")
test_data = glob("data/augmentations/test/*.npy")
test_pseudo = glob("data/augmentations/test_pseudo/*.npy")

print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

Train: 118631, Val: 5088, Test: 50651


## Train LSTM Model

In [4]:
from model import LSTMModel

In [5]:
from sklearn.model_selection import train_test_split
train_data, val_data = train_test_split(train_data + val_data)

In [6]:
sent_size = 112
batch_size = 128

# data
dataset_train = CustomDataset(train_data, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                              train_mode=True, model_type="lstm")
dataset_val = CustomDataset(val_data, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                            train_mode=True, model_type="lstm")
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=True)

In [None]:
params = {'lr': 0.00019966166384916635, 'weight_decay': 0.021622317536040474,
          'hidden_size': 207, 'bidirectional': True, 'dropout_lstm': 0.5877457997686522,
          'dropout_linear': 0.2027970994869876, 'linear1_meta': 325, 
          'linear2_size': 739}

    
# model
model = LSTMModel(**params)

# model utils
lr_monitoring = pl.callbacks.LearningRateMonitor(logging_interval="epoch")
checkpoint = pl.callbacks.ModelCheckpoint(monitor="val_loss", mode = "min",
                                          dirpath="data/models", filename="lstm_checkpoint")
early_stop_callback = pl.callbacks.early_stopping.EarlyStopping(monitor="val_f1", min_delta=0.0001, patience=5,
                                                                verbose=False, mode="max")
logger = pl.loggers.TensorBoardLogger(save_dir="logs", name="lstm_model", version="optimized")

# train
trainer = pl.Trainer(gpus=1, max_epochs=15, logger=logger,
                     callbacks=[lr_monitoring, early_stop_callback],
                     default_root_dir="data/", weights_summary=None, num_sanity_val_steps=0)
trainer.fit(model, dataloader_train, dataloader_val)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch     4: reducing learning rate of group 0 to 1.9966e-05.


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch     7: reducing learning rate of group 0 to 1.9966e-06.


Validating: 0it [00:00, ?it/s]

In [None]:
# save model
trainer.save_checkpoint("data/models/Final_Model_lstm.ckpt", weights_only=True)

## Train LSTM Pseudo Labeling

In [9]:
from model import LSTMModel

In [10]:
from sklearn.model_selection import train_test_split
train_data, val_data = train_test_split(train_data + val_data)

In [11]:
sent_size = 112
batch_size = 128

# data
dataset_train = CustomDataset(train_data + test_pseudo, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                              train_mode=True, model_type="lstm")
dataset_val = CustomDataset(val_data, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                            train_mode=True, model_type="lstm")
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=True)

In [12]:
params = {'lr': 0.0007465276400229775, 'weight_decay': 0.06902483087263139,
          'hidden_size': 394, 'bidirectional': True, 'dropout_lstm': 0.22293407982191252,
          'dropout_linear': 0.235525995182581, 'linear1_meta': 849, 'linear2_size': 585}

    
# model
model = LSTMModel(**params)

# model utils
lr_monitoring = pl.callbacks.LearningRateMonitor(logging_interval="epoch")
checkpoint = pl.callbacks.ModelCheckpoint(monitor="val_loss", mode = "min",
                                          dirpath="data/models", filename="lstm_checkpoint")
early_stop_callback = pl.callbacks.early_stopping.EarlyStopping(monitor="val_f1", min_delta=0.0001, patience=5,
                                                                verbose=False, mode="max")
logger = pl.loggers.TensorBoardLogger(save_dir="logs", name="lstm_model", version="pseudo")

# train
trainer = pl.Trainer(gpus=1, max_epochs=15, logger=logger,
                     callbacks=[lr_monitoring, early_stop_callback],
                     default_root_dir="data/", weights_summary=None, num_sanity_val_steps=0)
trainer.fit(model, dataloader_train, dataloader_val)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch     4: reducing learning rate of group 0 to 7.4653e-05.


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch     7: reducing learning rate of group 0 to 7.4653e-06.


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch    10: reducing learning rate of group 0 to 7.4653e-07.


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch    13: reducing learning rate of group 0 to 7.4653e-08.


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [13]:
# save model
trainer.save_checkpoint("data/models/Final_Model_lstm_pseudo.ckpt", weights_only=True)

## Train Model Not Retrained

In [4]:
from model import init_RUBert, Model

In [5]:
sent_size = 112
batch_size = 8

# data
dataset_train = CustomDataset(train_data, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                              train_mode=True, model_type="bert")
dataset_val = CustomDataset(val_data, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                            train_mode=True, model_type="bert")
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=True)

In [7]:
# model
params = {"lr": 2e-5, "weight_decay": 1e-3,  "is_train": False, "linear1_meta_size": 512,
          "linear1_token_size":512, "linear2_size":1024, "dropout1_weight":0.2, "dropout2_weight":0.3}
model = Model(**params)

# model utils
checkpoint = pl.callbacks.ModelCheckpoint(monitor="val_loss", mode = "min",
                                          dirpath="data/models", filename="final_model_checkpoint")
lr_monitoring = pl.callbacks.LearningRateMonitor(logging_interval="epoch")
early_stop_callback = pl.callbacks.early_stopping.EarlyStopping(monitor="val_f1", min_delta=0.00001,
                                                                patience=5, verbose=True, mode="max")
logger = pl.loggers.TensorBoardLogger(save_dir="logs", name="final_model", version="not_retrained")

# train
trainer = pl.Trainer(gpus=1, max_epochs=15, logger=logger, accumulate_grad_batches=16,
                     callbacks=[lr_monitoring, checkpoint],
                     default_root_dir="data/", weights_summary=None, num_sanity_val_steps=0)
trainer.fit(model, dataloader_train, dataloader_val)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOC

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_f1 improved. New best score: 0.784


Validating: 0it [00:00, ?it/s]

Metric val_f1 improved by 0.009 >= min_delta = 1e-05. New best score: 0.792


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch     4: reducing learning rate of group 0 to 2.0000e-06.


Validating: 0it [00:00, ?it/s]

Metric val_f1 improved by 0.005 >= min_delta = 1e-05. New best score: 0.798


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch     7: reducing learning rate of group 0 to 2.0000e-07.


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Monitored metric val_f1 did not improve in the last 4 records. Best score: 0.798. Signaling Trainer to stop.


In [8]:
# save model
trainer.save_checkpoint("data/models/Final_Model_notretrained.ckpt", weights_only=True)

## Train Retrained Model

In [5]:
from model import init_RUBert, Model

In [6]:
sent_size = 112
batch_size = 8

# data
dataset_train = CustomDataset(train_data+test_pseudo, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                              train_mode=True, model_type="bert")
dataset_val = CustomDataset(val_data, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                            train_mode=True, model_type="bert")
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=True)

In [7]:
# model
params = {"lr": 2e-5, "weight_decay": 1e-3,  "is_train": True, "linear1_meta_size": 512,
          "linear1_token_size":512, "linear2_size":1024, "dropout1_weight":0.2, "dropout2_weight":0.3}
model = Model(**params)

# model utils
checkpoint = pl.callbacks.ModelCheckpoint(monitor="val_loss", mode = "min",
                                          dirpath="data/models", filename="final_model_checkpoint")
lr_monitoring = pl.callbacks.LearningRateMonitor(logging_interval="epoch")
early_stop_callback = pl.callbacks.early_stopping.EarlyStopping(monitor="val_f1", min_delta=0.00001,
                                                                patience=5, verbose=True, mode="max")
logger = pl.loggers.TensorBoardLogger(save_dir="logs", name="final_model", version="retrained_pseudo")

# train
trainer = pl.Trainer(gpus=1, max_epochs=15, min_epochs=7, logger=logger, accumulate_grad_batches=16,
                     callbacks=[lr_monitoring, checkpoint],
                     default_root_dir="data/", weights_summary=None)
trainer.fit(model, dataloader_train, dataloader_val)

Downloading:   0%|          | 0.00/680M [00:00<?, ?B/s]

Some weights of the model checkpoint at Skoltech/russian-sensitive-topics were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch    12: reducing learning rate of group 0 to 2.0000e-06.


Validating: 0it [00:00, ?it/s]

In [9]:
# save model
trainer.save_checkpoint("data/models/Final_Model_retrained_pseudo.ckpt", weights_only=True)