In [3]:
import json, torch
from LoadData import load_dataset
from Model import Model
from EmotionDataset import EmotionDataset
from train import train
from plot import plot
from predict import predict
from sklearn.model_selection import KFold
import numpy as np
from analyze import analyze

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load the run configuration. JSON text is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform's locale
# default (which is not UTF-8 everywhere).
with open("./config.json", "r", encoding="utf-8") as file:
    config = json.load(file)

# Unpack the settings into notebook-level names read by the cells below.
# A missing key raises KeyError here, before any data loading or training starts.
batch_size = config["batch_size"]        # passed to EmotionDataset
train_path = config["train_path"]        # passed to load_dataset
test_path = config["test_path"]          # passed to load_dataset
learning_rate = config["learning_rate"]  # AdamW `lr`
base_model = config["base_model"]        # passed to Model(...)
num_labels = config["num_labels"]        # passed to Model(...)
problem_type = config["problem_type"]    # passed to Model(...)
labels = config["LABELS"]                # note: upper-case key in the JSON file
num_epoch = config["num_epoch"]          # passed to train(...)
eps = config["eps"]                      # AdamW `eps`
th = config["th"]                        # passed to train(...) and analyze(...); presumably the decision threshold - confirm
random_state = config["random_state"]    # KFold shuffle seed
n_splits = config["n_splits"]            # number of KFold splits
Min_loss = config["Min_loss"]            # passed to train(...); also the ceiling below which a fold's model is saved
num_folds = config["num_fold"]           # note: singular key; the training loop stops after this many folds

In [8]:
# Load the data through the project's load_dataset helper. X and y are later
# indexed with the integer index arrays produced by the splitter and converted
# with .tolist(), so they are presumably NumPy arrays - confirm in LoadData.
X, y, X_test, test_df = load_dataset(train_path, test_path)
# Shuffled K-fold splitter; the fixed random_state makes the fold assignment
# repeatable across runs.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

In [9]:
from sklearn.metrics import f1_score
def find_threshold(model, valid_loader):
    """Sweep decision thresholds on the validation data and report F1 for each.

    The model is run over ``valid_loader`` exactly once; the sigmoid
    probabilities are cached and re-thresholded for every candidate in
    ``np.arange(0.2, 0.7, 0.01)``. For each candidate, macro F1
    (``zero_division=1``) is computed per batch and averaged over the batches,
    and one line is printed per candidate.

    Args:
        model: ``torch.nn.Module`` called as ``model(**batch)`` whose output
            exposes ``.logits``. It is expected to already live on the device
            selected below (CUDA when available, otherwise CPU).
        valid_loader: Iterable of dict batches. Every entry except ``"text"``
            is moved to the device and passed to the model; a ``"labels"``
            entry is required.

    Returns:
        float: The threshold with the highest averaged F1 score (the lowest
        such threshold when several tie).

    Raises:
        ValueError: If ``valid_loader`` yields no batches.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Evaluate deterministically (dropout off), then put the model back in
    # whatever mode the caller left it in.
    was_training = model.training
    model.eval()
    cached_batches = []
    try:
        with torch.no_grad():
            for batch in valid_loader:
                batch = {k: v.to(device) for k, v in batch.items() if k != "text"}
                logits = model(**batch).logits
                # Promote to float64 so the comparison against the float64
                # thresholds behaves the same on every NumPy version.
                probabilities = torch.sigmoid(logits).double().cpu().numpy()
                cached_batches.append((batch["labels"].cpu().numpy(), probabilities))
    finally:
        model.train(was_training)

    if not cached_batches:
        raise ValueError("valid_loader yielded no batches; cannot search for a threshold")

    best_th, best_f1 = None, -1.0
    for th in np.arange(0.2, 0.7, 0.01):
        f1score_sum = 0.0
        for label, probabilities in cached_batches:
            predictions = np.where(probabilities > th, 1, 0)
            f1score_sum += f1_score(label, predictions, average='macro', zero_division=1)

        # float() works whether f1_score returned a NumPy scalar or a plain float.
        f1score_avg = float(f1score_sum / len(cached_batches))
        print(f"threshold = {th:.3f}   -   f1-score = {f1score_avg:.5f}")

        # Strict '>' keeps the first (lowest) threshold among equal scores.
        if f1score_avg > best_f1:
            best_th, best_f1 = float(th), f1score_avg

    return best_th

In [None]:
# K-fold training driver: for each fold, build a fresh model, train it, collect
# the analyze() result, and checkpoint the model if its validation loss is
# below Min_loss.
best_models = []  # one dict per checkpointed fold (fold number, model object, loss, file path)
fold_valid_losses = []  # last-epoch validation loss of every fold that was run
wrong_predcit = []  # (sic) return value of analyze() for every fold that was run

for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
    # Slice this fold's train/validation partitions and convert them to plain lists.
    X_train, X_valid = X[train_idx].tolist(), X[val_idx].tolist()
    y_train, y_valid = y[train_idx].tolist(), y[val_idx].tolist()
    # A new model and tokenizer are built for every fold, so no weights carry
    # over from the previous fold.
    model, tokenizer = Model(base_model, num_labels, problem_type, labels).build()
    train_loader, valid_loader, test_loader = EmotionDataset(tokenizer, batch_size).build_dataset(X_train, y_train, X_valid, y_valid, X_test)

    optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate, betas = (0.9, 0.98), eps = eps, weight_decay=0.01)
    # train() returns the model together with per-epoch loss and F1 histories.
    # NOTE(review): whether the returned model is the best-epoch or the
    # last-epoch one is decided inside train() - confirm.
    model, train_losses, valid_losses, train_f1scores, valid_f1scores = train(num_epoch, model, train_loader, valid_loader, optimizer, th, Min_loss)
    wrong_predcit.append(analyze(model, valid_loader, th))
    # Optional per-fold diagnostics, currently disabled:
    #find_threshold(model, valid_loader)
    #plot(train_losses, valid_losses, "loss", "Loss training and valid curve", num_epoch)
    #plot(train_f1scores, valid_f1scores, "F1-score", "F1-score training and valid", num_epoch)
    
    # NOTE(review): despite the name, this is the LAST recorded validation
    # loss, not min(valid_losses) - confirm that this is intended.
    Min_valid_loss = valid_losses[-1]
    fold_valid_losses.append(Min_valid_loss)

    # Checkpoint only folds whose validation loss is below the configured ceiling.
    if Min_valid_loss < Min_loss:
        model_save_path = f'model_fold_{fold + 1}_loss_{Min_valid_loss:.4f}.pt'
        torch.save({
            'fold': fold + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'valid_loss': Min_valid_loss,
            'train_losses': train_losses,
            'valid_losses': valid_losses,
            'train_f1scores': train_f1scores,
            'valid_f1scores': valid_f1scores
        }, model_save_path)
        # NOTE(review): the model object itself is kept in this list, so every
        # checkpointed fold's model stays referenced for as long as
        # best_models exists.
        best_models.append({
            'fold': fold + 1,
            'model': model,
            'loss': Min_valid_loss,
            'path': model_save_path
        })
        print(f"Saved model for fold {fold + 1} with validation loss: {Min_valid_loss:.4f}\n\n\n")
    # Stop once num_folds folds have run, even if the splitter has more splits.
    if fold == num_folds-1:
        break

In [None]:
import pandas as pd
# Lift pandas' display truncation so full cell text and every row are rendered.
# set_option changes global state, so this also affects every later cell.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
# Display what analyze() returned for the first fold - presumably a DataFrame
# of misclassified validation samples; verify against analyze().
wrong_predcit[0]