# In [None]:
import os
import gc
import re
from typing import List, Tuple, Dict, Any, Optional
from itertools import product

import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from peft import LoraConfig, get_peft_model, PeftModel
from sklearn.metrics import f1_score, jaccard_score, hamming_loss

BASE_PATH = "XED/processed"
OUTPUT_ROOT_DIR = "./fi_gpt_results"
LOG_ROOT_DIR = "./logs_finnish_gpt3"
WEIGHTS_ROOT_DIR = "./weights/fi_gpt_results"
FINAL_SAVE_DIR = os.path.join(WEIGHTS_ROOT_DIR, "lora_gpt3_fi_best_final")
RESULTS_FILE = "./results_lora_finnish_gpt3_experiments.csv"

MODEL_NAME = "TurkuNLP/gpt3-finnish-medium"
LANGUAGE = "fi"
ALL_LABELS = [str(i) for i in range(1, 9)]
FINNISH_PROMPT_TEMPLATE_TRAIN = "Luokittele t채m채n suomenkielisen lauseen tunne: {text}\nTunne: {label}"
FINNISH_PROMPT_TEMPLATE_PRED = "Luokittele t채m채n suomenkielisen lauseen tunne: {text}\nTunne:"

MAX_INPUT_LENGTH = 256
MAX_NEW_TOKENS = 10
EVAL_SAMPLES_COUNT_CV = 300 
FINAL_EPOCHS = 5
CV_EPOCHS = 2
BATCH_SIZE = 4
LEARNING_RATE = 2e-4
RANDOM_STATE = 42

R_VALUES = [4, 8]
ALPHA_VALUES = [16, 32]
DROPOUT_VALUES = [0.05, 0.1]
TARGET_MODULES = ["self_attention.query_key_value", "self_attention.dense"]


def load_data(lang: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Loads the training and testing datasets for the specified language."""
    try:
        train_df = pd.read_csv(os.path.join(BASE_PATH, f"train_{lang}.csv"))
        test_df = pd.read_csv(os.path.join(BASE_PATH, f"test_{lang}.csv"))
        return train_df, test_df
    except FileNotFoundError:
        print(f"Error: Data files not found in {BASE_PATH}. Please check path.")
        raise


def preprocess_function(examples: Dict[str, List[Any]], tokenizer: AutoTokenizer) -> Dict[str, Any]:
    """Creates tokenized training samples with the Finnish prompt."""
    texts = [
        FINNISH_PROMPT_TEMPLATE_TRAIN.format(text=t, label=l)
        for t, l in zip(examples["text"], examples["labels"])
    ]
    return tokenizer(
        texts,
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
        padding="max_length"
    )


def setup_model_and_datasets(model_name: str, train_df: pd.DataFrame, test_df: pd.DataFrame) -> Tuple[AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, Dataset, Dataset]:
    """Loads model, tokenizer, and prepares tokenized datasets."""

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    base_model = AutoModelForCausalLM.from_pretrained(model_name)
    base_model.resize_token_embeddings(len(tokenizer))

    num_proc = os.cpu_count() or 1
    
    preprocess_func = lambda x: preprocess_function(x, tokenizer)
    
    tokenized_train = Dataset.from_pandas(train_df).map(
        preprocess_func, batched=True, remove_columns=train_df.columns.tolist(), num_proc=num_proc
    )
    tokenized_test = Dataset.from_pandas(test_df).map(
        preprocess_func, batched=True, remove_columns=test_df.columns.tolist(), num_proc=num_proc
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    return base_model, tokenizer, data_collator, tokenized_train, tokenized_test


def predict_emotions_gpt2(
    model: PeftModel,
    tokenizer: AutoTokenizer,
    df: pd.DataFrame,
    max_samples: Optional[int] = None
) -> Tuple[List[str], List[str]]:
    """Generate predictions for emotion classification using the Finnish prompt."""

    preds: List[str] = []
    golds: List[str] = []

    df_to_process = df if max_samples is None else df.head(max_samples)
    total_samples = len(df_to_process)

    device = model.device
    model.eval()

    for _, row in tqdm(df_to_process.iterrows(), total=total_samples, desc="Generating Predictions"):
        prompt = FINNISH_PROMPT_TEMPLATE_PRED.format(text=row['text'])
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                pad_token_id=tokenizer.eos_token_id,
                do_sample=False,
                temperature=1.0
            )

        input_length = inputs['input_ids'].shape[1]
        pred_text = tokenizer.decode(output[0][input_length:], skip_special_tokens=True).lower().strip()

        preds.append(pred_text)
        golds.append(str(row["labels"]))

    return preds, golds


def compute_metrics_numeric(preds: List[str], golds: List[str]) -> Dict[str, float]:
    """Computes multi-label classification metrics (micro/macro F1, Jaccard, Hamming)."""

    y_true = np.zeros((len(golds), len(ALL_LABELS)))
    y_pred = np.zeros((len(golds), len(ALL_LABELS)))

    for i, (g, p) in enumerate(zip(golds, preds)):
        true_ids = [s.strip() for s in str(g).split(",") if s.strip().isdigit()]
        pred_ids = [s.strip() for s in re.findall(r'\b\d\b', str(p)) if s.strip().isdigit()]

        for t in true_ids:
            if t in ALL_LABELS:
                y_true[i, ALL_LABELS.index(t)] = 1
        for t in pred_ids:
            if t in ALL_LABELS:
                y_pred[i, ALL_LABELS.index(t)] = 1

    metrics = {
        "micro_f1": f1_score(y_true, y_pred, average="micro", zero_division=0),
        "macro_f1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "jaccard": jaccard_score(y_true, y_pred, average="samples", zero_division=0),
        "hamming": hamming_loss(y_true, y_pred),
    }
    return metrics


# --- MAIN EXECUTION BLOCKS ---

def run_lora_grid_search(
    tokenizer: AutoTokenizer,
    data_collator: DataCollatorForLanguageModeling,
    tokenized_train: Dataset,
    tokenized_test: Dataset,
    test_df: pd.DataFrame,
    param_grid: List[Dict[str, float]]
) -> Dict[str, float]:
    """
    Runs all grid search combinations and saves results.
    Returns the parameters of the best performing model based on micro_f1.
    """
    print("\n" + "="*50)
    print(f"1) STARTING LORA HYPERPARAMETER GRID SEARCH ({len(param_grid)} runs)")
    print("="*50)

    grid_results = []
    best_f1 = -1.0
    best_params = {}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for params in param_grid:
        r_val, alpha_val, dropout_val = params["r"], params["lora_alpha"], params["lora_dropout"]
        print(f"\n--- Training with LoRA params: r={r_val}, alpha={alpha_val}, dropout={dropout_val} ---")

        base_model_run = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
        base_model_run.resize_token_embeddings(len(tokenizer))

        lora_config = LoraConfig(
            r=r_val, lora_alpha=alpha_val, target_modules=TARGET_MODULES,
            lora_dropout=dropout_val, bias="none", task_type="CAUSAL_LM"
        )
        model = get_peft_model(base_model_run, lora_config)

        training_args = TrainingArguments(
            output_dir=os.path.join(OUTPUT_ROOT_DIR, f"results_lora_finnish_gpt3_r{r_val}_a{alpha_val}"),
            per_device_train_batch_size=BATCH_SIZE, per_device_eval_batch_size=BATCH_SIZE,
            learning_rate=LEARNING_RATE, num_train_epochs=CV_EPOCHS,
            eval_strategy="epoch", save_strategy="no", logging_dir=LOG_ROOT_DIR,
            report_to="none", fp16=torch.cuda.is_available(), seed=RANDOM_STATE,
        )

        trainer = Trainer(
            model=model, args=training_args, train_dataset=tokenized_train,
            eval_dataset=tokenized_test, tokenizer=tokenizer, data_collator=data_collator,
        )

        trainer.train()

        preds, golds = predict_emotions_gpt2(model, tokenizer, test_df, max_samples=EVAL_SAMPLES_COUNT_CV)
        metrics = compute_metrics_numeric(preds, golds)
        metrics.update(params)
        grid_results.append(metrics)

        print(f" Results for r={r_val}, alpha={alpha_val}: {metrics}")

        save_dir = os.path.join(WEIGHTS_ROOT_DIR, f"lora_finnish_gpt3_r{r_val}_a{alpha_val}")
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)

        if metrics['micro_f1'] > best_f1:
            best_f1 = metrics['micro_f1']
            best_params = params

        del model, base_model_run, trainer
        torch.cuda.empty_cache()
        gc.collect()

    results_df = pd.DataFrame(grid_results)
    results_df.to_csv(RESULTS_FILE, index=False)

    print("\n All experiments complete. Summary:")
    print(results_df.sort_values("micro_f1", ascending=False).to_markdown(index=False))

    return best_params


def run_final_fine_tuning(
    best_params: Dict[str, float],
    tokenizer: AutoTokenizer,
    data_collator: DataCollatorForLanguageModeling,
    tokenized_train: Dataset,
    tokenized_test: Dataset,
    final_epochs: int,
    test_df: pd.DataFrame
):
    """
    Performs final, longer fine-tuning using the best LoRA parameters.
    Evaluates the final model on the whole test dataset.
    """
    print("\n" + "="*50)
    print(f"2) STARTING FINAL FINE-TUNING ({final_epochs} epochs) WITH BEST PARAMS")
    print(f"   Best Params: R={best_params['r']}, Alpha={best_params['lora_alpha']}, Dropout={best_params['lora_dropout']}")
    print("="*50)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
    base_model.resize_token_embeddings(len(tokenizer))

    lora_config = LoraConfig(
        r=best_params["r"], lora_alpha=best_params["lora_alpha"], target_modules=TARGET_MODULES,
        lora_dropout=best_params["lora_dropout"], bias="none", task_type="CAUSAL_LM"
    )
    model = get_peft_model(base_model, lora_config)
    model.print_trainable_parameters()

    training_args = TrainingArguments(
        output_dir=os.path.join(OUTPUT_ROOT_DIR, "results_lora_finnish_gpt3_best_model"),
        per_device_train_batch_size=BATCH_SIZE, per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=2e-5,
        num_train_epochs=final_epochs,
        eval_strategy="epoch", save_strategy="epoch", save_total_limit=2,
        load_best_model_at_end=True, metric_for_best_model="eval_loss", greater_is_better=False,
        logging_dir=os.path.join(LOG_ROOT_DIR, "final"),
        report_to="none", fp16=torch.cuda.is_available(), seed=RANDOM_STATE,
    )

    trainer = Trainer(
        model=model, args=training_args, train_dataset=tokenized_train,
        eval_dataset=tokenized_test, tokenizer=tokenizer, data_collator=data_collator,
    )

    trainer.train()

    model.save_pretrained(FINAL_SAVE_DIR)
    tokenizer.save_pretrained(FINAL_SAVE_DIR)
    print(f"\nFinal fine-tuned model saved to {FINAL_SAVE_DIR}")

    print("\n" + "="*50)
    print(f"3) EVALUATION OF BEST MODEL ON FULL TEST SET ({len(test_df)} samples)")
    print("="*50)

    del model, base_model, trainer
    torch.cuda.empty_cache()
    gc.collect()

    base_model_eval = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
    final_model = PeftModel.from_pretrained(base_model_eval, FINAL_SAVE_DIR)
    final_model = final_model.merge_and_unload()
    final_model.eval()

    preds, golds = predict_emotions_gpt2(final_model, tokenizer, test_df, max_samples=None)
    final_metrics = compute_metrics_numeric(preds, golds)

    print("\n=============================================")
    print(f"FINAL EVALUATION RESULTS ({MODEL_NAME} + LoRA, {FINAL_EPOCHS} Epochs)")
    print("=============================================")
    for k, v in final_metrics.items():
        print(f"{k}: {v:.4f}")
    print("=============================================")

    del final_model, base_model_eval
    torch.cuda.empty_cache()
    gc.collect()

# --- MAIN ORCHESTRATOR ---

def main():

    os.makedirs(WEIGHTS_ROOT_DIR, exist_ok=True)
    os.makedirs(OUTPUT_ROOT_DIR, exist_ok=True)
    os.makedirs(LOG_ROOT_DIR, exist_ok=True)

    try:
        train_df, test_df = load_data(LANGUAGE)
    except FileNotFoundError:
        return

    _, tokenizer, data_collator, tokenized_train, tokenized_test = \
        setup_model_and_datasets(MODEL_NAME, train_df, test_df)

    param_grid = [
        {"r": r, "lora_alpha": alpha, "lora_dropout": drop}
        for r, alpha, drop in product(R_VALUES, ALPHA_VALUES, DROPOUT_VALUES)
    ]

    best_params = run_lora_grid_search(
        tokenizer, data_collator, tokenized_train, tokenized_test, test_df, param_grid
    )

    run_final_fine_tuning(
        best_params, tokenizer, data_collator, tokenized_train, tokenized_test, FINAL_EPOCHS, test_df
    )

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()