In [None]:
import sys
import json

import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import numpy as np

# Put the project checkout at the front of the import search path (once),
# so the project-local imports that follow can be resolved.
PROJECT_ROOT = r'G:\PythonProjects\WineRecognition2'
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)
from nn.utils import CustomDataset, train, plot_losses, generate_tag_to_ix, get_model_confidence
from nn.model import BiLSTM_CRF
from nn.mlflow_utils import log_mlflow_on_train
from data_master import DataGenerator, count_unk_foreach_tag

In [None]:
# --- Run identity -----------------------------------------------------------
# RUN_NAME, START_TIME and OUTPUT_DIR are empty here.
# NOTE(review): presumably injected by an external runner - confirm.
MODEL_NAME = 'BiLSTM_CRF'
RUN_NAME = ''
START_TIME = ''
OUTPUT_DIR = ''

# --- Input files ------------------------------------------------------------
DATASET_PATH = r'G:\PythonProjects\WineRecognition2\data\text\halliday_winesearcher_menu_gen_samplesv2\Halliday_WineSearcher_MenuGenSamples.txt'
VOCAB_PATH = 'G:/PythonProjects/WineRecognition2/data/vocabs/Words_Halliday_Wine_AU.json'
DATAINFO_PATH = 'G:/PythonProjects/WineRecognition2/data_info.json'

# --- Data preparation -------------------------------------------------------
TEST_SIZE = 0.2                 # passed to train_test_split as test_size
CASE_SENSITIVE_VOCAB = False    # forwarded to CustomDataset(case_sensitive=...)
USE_NUM2WORDS = False           # forwarded to CustomDataset(convert_nums2words=...)
BATCH_SIZE = 128

# --- Model ------------------------------------------------------------------
DEVICE = 'cuda'
EMBEDDING_DIM = 256
HIDDEN_DIM = 64

# --- Optimisation -----------------------------------------------------------
NUM_EPOCHS = 1
LEARNING_RATE = 0.01
WEIGHT_DECAY = 0.0001
SCHEDULER_FACTOR = 0.1          # ReduceLROnPlateau(factor=...)
SCHEDULER_PATIENCE = 10         # ReduceLROnPlateau(patience=...)

In [None]:
# Read the raw dataset text first, then hand its newline-separated lines to
# the project's sentence builder once the file is closed.
with open(DATASET_PATH, encoding='utf-8') as dataset_file:
    dataset_lines = dataset_file.read().split('\n')
sents = DataGenerator.generate_sents2(dataset_lines)
len(sents)

In [None]:
# Seed the shuffle so the train/validation partition is the same on every
# execution. Without a fixed random_state each run is scored on a different
# held-out set, so metrics from separate runs cannot be compared.
SPLIT_SEED = 42
train_data, val_data = train_test_split(sents, test_size=TEST_SIZE, random_state=SPLIT_SEED)
len(train_data), len(val_data)

In [None]:
# Build the tag -> index mapping from the tag list stored under
# ['keys']['all'] in the data-info JSON.
# The encoding is given explicitly: without it open() uses the platform's
# locale codec, which can mis-decode or reject a UTF-8 JSON file.
with open(DATAINFO_PATH, encoding='utf-8') as file:
    tag_to_ix = generate_tag_to_ix(json.load(file)['keys']['all'])
tag_to_ix

In [None]:
# Load the vocabulary JSON into word_to_ix and display its size.
with open(VOCAB_PATH, encoding='utf-8') as vocab_file:
    word_to_ix = json.load(vocab_file)
len(word_to_ix)

In [None]:
# Both splits are wrapped with identical preprocessing options; only the
# underlying sentences differ.
dataset_options = {
    'case_sensitive': CASE_SENSITIVE_VOCAB,
    'convert_nums2words': USE_NUM2WORDS,
}
train_dataset = CustomDataset(train_data, tag_to_ix, word_to_ix, **dataset_options)
val_dataset = CustomDataset(val_data, tag_to_ix, word_to_ix, **dataset_options)

In [None]:
# Training batches are shuffled and an incomplete final batch is dropped.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
# Validation keeps dataset order and every sample, so predictions collected
# batch by batch line up with val_dataset.raw_data().
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
dataloaders = {'train': train_loader, 'val': val_loader}

In [None]:
# Model: sized from the vocabulary and tag set, with the 'PAD' entry of the
# vocabulary used as the padding index, then moved to the target device.
vocab_size = len(word_to_ix)
pad_index = word_to_ix['PAD']
model = BiLSTM_CRF(vocab_size, len(tag_to_ix), EMBEDDING_DIM, HIDDEN_DIM, padding_idx=pad_index)
model = model.to(DEVICE)

# Optimiser plus a plateau-based learning-rate scheduler.
optimizer = Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
scheduler = ReduceLROnPlateau(
    optimizer,
    factor=SCHEDULER_FACTOR,
    patience=SCHEDULER_PATIENCE,
)

In [None]:
# Run the project's training loop; it hands back the model and the recorded
# losses, which are logged later.
training_outcome = train(
    model, optimizer, dataloaders, DEVICE, NUM_EPOCHS, OUTPUT_DIR,
    scheduler=scheduler,
    tqdm=tqdm,
)
model, losses = training_outcome

In [None]:
# Gold tag sequences: the second element of every pair yielded by
# val_dataset.raw_data().
y_val_true = [
    gold_tags
    for _words, gold_tags in val_dataset.raw_data()
]

In [None]:
# Decode through an explicit index -> tag map. Looking tags up by their
# position in list(tag_to_ix) is only correct when the dict's values happen
# to equal its insertion positions; the inverse map is correct either way.
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

# Predicted index sequences are gathered under their own name so that
# y_val_pred only ever holds tag labels.
pred_indices = []
model.eval()  # switch the module to evaluation mode before predicting
with torch.no_grad():  # no gradients are needed for inference
    for x_batch, _, mask_batch, _ in dataloaders['val']:
        x_batch = x_batch.to(DEVICE)
        mask_batch = mask_batch.to(DEVICE)
        # NOTE(review): assumes the model returns one sequence of tag indices
        # per sentence in the batch - the original decoding relied on the same.
        pred_indices.extend(model(x_batch, mask_batch))

# int() normalises plain ints and 0-d tensors alike before the dict lookup.
y_val_pred = [[ix_to_tag[int(ix)] for ix in sentence] for sentence in pred_indices]

In [None]:
# Encode every validation sentence as an int64 tensor of vocabulary indices.
val_sentences = [words for words, _gold in val_dataset.raw_data()]
X_test = [
    torch.tensor(val_dataset.sentence_to_indices(words), dtype=torch.long)
    for words in val_sentences
]

In [None]:
# Vocabulary index of the dataset's unknown-word token, then the per-tag
# unknown-word statistics computed by the project helper.
unk_index = val_dataset.word_to_ix[val_dataset.unk]
tag_names = list(tag_to_ix)
unk_foreach_tag = count_unk_foreach_tag(X_test, y_val_true, tag_names, unk_index)

In [None]:
# Model confidence over the encoded validation sentences; the mean of these
# values is what gets logged as 'models_confidence'.
# NOTE(review): presumably one confidence value per sentence - confirm against
# nn.utils.get_model_confidence.
confs = get_model_confidence(model, X_test, DEVICE)

In [None]:
# For each validation sentence, zip its items with the gold tags and the
# predicted tags found at the same position in y_val_pred.
val_pairs = list(val_dataset.raw_data())
test_eval = [
    list(zip(words, gold_tags, y_val_pred[position]))
    for position, (words, gold_tags) in enumerate(val_pairs)
]

In [None]:
# Derived values are computed up front so the dict below is a flat listing.
joined_tags = ', '.join(tag_to_ix)
mean_confidence = np.mean(confs)
unk_foreach_tag_json = json.dumps(unk_foreach_tag)

# Everything recorded with the run: identity, inputs, hyper-parameters and
# the summary diagnostics computed on the validation split.
run_params = {
    # run identity
    'model_name': MODEL_NAME,
    'run_name': RUN_NAME,
    'start_time': START_TIME,
    'output_dir': OUTPUT_DIR,
    # inputs
    'dataset_path': DATASET_PATH,
    'vocab_path': VOCAB_PATH,
    'datainfo_path': DATAINFO_PATH,
    # model / training set-up
    'device': DEVICE,
    'batch_size': BATCH_SIZE,
    'embedding_dim': EMBEDDING_DIM,
    'hidden_dim': HIDDEN_DIM,
    'vocab_size': vocab_size,
    'tags': joined_tags,
    'num_epochs': NUM_EPOCHS,
    'learning_rate': LEARNING_RATE,
    'weight_decay': WEIGHT_DECAY,
    'factor': SCHEDULER_FACTOR,
    'patience': SCHEDULER_PATIENCE,
    'case_sensitive': CASE_SENSITIVE_VOCAB,
    'test_size': TEST_SIZE,
    # validation diagnostics
    'models_confidence': mean_confidence,
    'unk_foreach_tag': unk_foreach_tag_json,
    'use_num2words': USE_NUM2WORDS,
}

In [None]:
# Gather everything the project's MLflow helper receives, then pass it on as
# keyword arguments.
mlflow_inputs = {
    'run_params': run_params,
    'model': model,
    'classes': list(tag_to_ix),
    'losses': losses,
    'y_true': y_val_true,
    'y_pred': y_val_pred,
    'test_eval': test_eval,
}
log_mlflow_on_train(**mlflow_inputs)