In [1]:
import numpy as np
import nltk
from tqdm.notebook import tqdm
from glob import glob

from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel

import torch
from torch.utils.data import Dataset, DataLoader
import torch.functional as F
from torch import nn
import torchmetrics
import pytorch_lightning as pl

from warnings import filterwarnings
filterwarnings("ignore")

## Data

In [2]:
from model import CustomDataset

In [3]:
# Load tokenizer
tokenizer_bert = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
tokenizer_lstm = nltk.RegexpTokenizer(r"[а-я]+|<unk>|<pad>")

# Load data
train_data = glob("data/augmentations/train/*.npy")
val_data = glob("data/augmentations/val/*.npy")
test_data = glob("data/augmentations/test/*.npy")
test_pseudo = glob("data/augmentations/test_pseudo/*.npy")

print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

Train: 118631, Val: 5088, Test: 50651


## Finetune LSTM

In [7]:
from model import init_RUBert, LSTMModel
import optuna

In [6]:
sent_size = 112
batch_size = 128

# data
dataset_train = CustomDataset(train_data, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                              train_mode=True, model_type="lstm")
dataset_val = CustomDataset(val_data, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                            train_mode=True, model_type="lstm")
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=True)

In [8]:
def objective(trial: optuna.trial.Trial):

    # params
    lr = trial.suggest_float("lr", 2e-6, 2e-4)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-1)
    hidden_size = trial.suggest_int("hidden_size", 2, 256)
    bidirectional = trial.suggest_categorical("bidirectional", [True, False])
    dropout_lstm = trial.suggest_float("dropout_lstm", 0.0, 0.6)
    dropout_linear = trial.suggest_float("dropout_linear", 0.0, 0.6)
    linear1_meta = trial.suggest_int("linear1_meta", 32, 1024)
    linear2_size = trial.suggest_int("linear2_size", 32, 1024)

    
    params = {"lr": lr, "weight_decay": weight_decay, 
              "hidden_size": hidden_size,  "bidirectional": bidirectional,
              "dropout_lstm":dropout_lstm, "dropout_linear":dropout_linear,
              "linear1_meta":linear1_meta, "linear2_size":linear2_size}
    
    # model
    model = LSTMModel(**params)
    
    # model utils
    lr_monitoring = pl.callbacks.LearningRateMonitor(logging_interval="epoch")
    early_stop_callback = pl.callbacks.early_stopping.EarlyStopping(monitor="val_f1", min_delta=0.0001, patience=3,
                                                                    verbose=False, mode="max")
    logger = pl.loggers.TensorBoardLogger(save_dir="logs", name="final_model")
    
    # train
    trainer = pl.Trainer(gpus=1, max_epochs=15, callbacks=[lr_monitoring, early_stop_callback],
                         default_root_dir="data/", weights_summary=None, num_sanity_val_steps=0)
    trainer.logger.log_hyperparams(params)
    trainer.fit(model, dataloader_train, dataloader_val)
    
    return trainer.callback_metrics["val_f1"].item()

In [39]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=6*60*60)

In [10]:
study.best_params

{'lr': 0.00019966166384916635,
 'weight_decay': 0.021622317536040474,
 'hidden_size': 207,
 'bidirectional': True,
 'dropout_lstm': 0.5877457997686522,
 'dropout_linear': 0.2027970994869876,
 'linear1_meta': 325,
 'linear2_size': 739}

## Finetune LSTM Pseudo

In [4]:
from model import init_RUBert, LSTMModel
import optuna

In [5]:
sent_size = 112
batch_size = 128

# data
dataset_train = CustomDataset(train_data+test_pseudo, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                              train_mode=True, model_type="lstm")
dataset_val = CustomDataset(val_data, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                            train_mode=True, model_type="lstm")
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=True)

In [6]:
for x in dataloader_train:
    break

In [7]:
def objective(trial: optuna.trial.Trial):

    # params
    lr = trial.suggest_float("lr", 2e-5, 2e-3)
    weight_decay = trial.suggest_float("weight_decay", 1e-3, 1e-1)
    hidden_size = trial.suggest_int("hidden_size", 124, 512)
    bidirectional = trial.suggest_categorical("bidirectional", [True, False])
    dropout_lstm = trial.suggest_float("dropout_lstm", 0.2, 0.6)
    dropout_linear = trial.suggest_float("dropout_linear", 0.2, 0.6)
    linear1_meta = trial.suggest_int("linear1_meta", 124, 1024)
    linear2_size = trial.suggest_int("linear2_size", 124, 1024)

    
    params = {"lr": lr, "weight_decay": weight_decay, 
              "hidden_size": hidden_size,  "bidirectional": bidirectional,
              "dropout_lstm":dropout_lstm, "dropout_linear":dropout_linear,
              "linear1_meta":linear1_meta, "linear2_size":linear2_size}
    
    # model
    model = LSTMModel(**params)
    
    # model utils
    lr_monitoring = pl.callbacks.LearningRateMonitor(logging_interval="epoch")
    early_stop_callback = pl.callbacks.early_stopping.EarlyStopping(monitor="val_f1", min_delta=0.0001, patience=3,
                                                                    verbose=False, mode="max")
    logger = pl.loggers.TensorBoardLogger(save_dir="logs", name="final_model")
    
    # train
    trainer = pl.Trainer(gpus=1, max_epochs=15, callbacks=[lr_monitoring, early_stop_callback],
                         default_root_dir="data/", weights_summary=None, num_sanity_val_steps=0)
    trainer.logger.log_hyperparams(params)
    trainer.fit(model, dataloader_train, dataloader_val)
    
    return trainer.callback_metrics["val_f1"].item()

In [1]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=100, timeout=6*60*60)

In [9]:
study.best_params

{'lr': 0.0007465276400229775,
 'weight_decay': 0.06902483087263139,
 'hidden_size': 394,
 'bidirectional': True,
 'dropout_lstm': 0.22293407982191252,
 'dropout_linear': 0.235525995182581,
 'linear1_meta': 849,
 'linear2_size': 585}

## Finetune threshold

In [2]:
from model import CustomDataset
from model import init_RUBert, LSTMModel

In [3]:
# Load tokenizer
tokenizer_bert = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
tokenizer_lstm = nltk.RegexpTokenizer(r"[а-я]+|<unk>|<pad>")

# Load data
val_data = glob("data/augmentations/val/*.npy")

# Dataloader
sent_size = 112
batch_size = 128

# data
dataset_val = CustomDataset(val_data, tokenizer_bert, tokenizer_lstm, sent_size=sent_size,
                            train_mode=True, model_type="lstm")
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=True)

In [10]:
# Collect labels
y_true = []

for x in dataloader_val:
    y_true.extend(x["lstm"][-1].numpy().tolist())

In [12]:
# load model
model = LSTMModel.load_from_checkpoint("data/models/Final_Model_lstm.ckpt")
trainer = pl.Trainer(gpus=1)

# preds
preds = trainer.predict(model, dataloader_val)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [35]:
import optuna
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

def objective(trial: optuna.trial.Trial):

    # params
    th1 = trial.suggest_float("th1", 1e-6, 0.999)
    th2 = trial.suggest_float("th2", 1e-6, 0.999)
    th3 = trial.suggest_float("th3", 1e-6, 0.999)
    th4 = trial.suggest_float("th4", 1e-6, 0.999)
    th5 = trial.suggest_float("th5", 1e-6, 0.999)
    th6 = trial.suggest_float("th6", 1e-6, 0.999)
    th7 = trial.suggest_float("th7", 1e-6, 0.999)
    th8 = trial.suggest_float("th8", 1e-6, 0.999)
    th9 = trial.suggest_float("th9", 1e-6, 0.999)
    
    # get preds
    y_pred = []
    thresholds = [th1, th2, th3, th4, th5, th6, th7, th8, th9]

    for pred in tqdm(preds):
        pred = (pred.numpy() > thresholds).astype(int)
        y_pred.extend(pred)
    
    score = f1_score(y_true, y_pred, average="samples")
    
    return score

In [38]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=200, show_progress_bar=True)

In [37]:
study.best_params

{'th1': 5.234075621788313e-05,
 'th2': 0.5370042055765364,
 'th3': 0.06012091756440158,
 'th4': 0.8309953474038814,
 'th5': 0.9066367495515553,
 'th6': 0.5978267628247755,
 'th7': 0.596118816153346,
 'th8': 0.21056445096106394,
 'th9': 1.0570610009207923e-05}