# In [None]:
import gc
import os
import re
from itertools import product
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, PeftModel, get_peft_model
from sklearn.metrics import f1_score, hamming_loss, jaccard_score
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# ---------------------------------------------------------------------------
# Filesystem layout
# ---------------------------------------------------------------------------
# Directory that load_data() reads train_<lang>.csv / test_<lang>.csv from.
BASE_PATH = "XED/processed"
# Trainer output directories (one sub-directory per run).
OUTPUT_ROOT_DIR = "./es_stablelm_results"
LOG_ROOT_DIR = "./logs_stablelm_es"
# Saved LoRA adapters: one folder per grid point plus the final model.
WEIGHTS_ROOT_DIR = "./weights/stablelm_es"
FINAL_SAVE_DIR = os.path.join(WEIGHTS_ROOT_DIR, "lora_stablelm_es_best_final")
# CSV summary of the grid-search metrics.
RESULTS_FILE = "./results_lora_stablelm_es_experiments.csv"

# ---------------------------------------------------------------------------
# Model, language and prompts
# ---------------------------------------------------------------------------
MODEL_NAME = "stabilityai/stablelm-2-zephyr-1_6b"
LANGUAGE = "es"
# Label ids as strings: "1" .. "8".
ALL_LABELS = list(map(str, range(1, 9)))
# Inference prompt: stops right after "Emoción:" so the model completes it.
SPANISH_PROMPT_TEMPLATE_PRED = (
    "Clasifica la emoción en esta frase en español: {text}\nEmoción:"
)
# Training prompt: the inference prompt followed by the gold label, so both
# templates share exactly the same prefix.
SPANISH_PROMPT_TEMPLATE_TRAIN = SPANISH_PROMPT_TEMPLATE_PRED + " {label}"

# ---------------------------------------------------------------------------
# Tokenization, generation and training settings
# ---------------------------------------------------------------------------
MAX_INPUT_LENGTH = 256        # max_length used when tokenizing training text
MAX_NEW_TOKENS = 10           # generation budget per prediction
EVAL_SAMPLES_COUNT_CV = 300   # rows of the test frame scored per grid point
FINAL_EPOCHS = 5              # epochs for the final fine-tuning run
CV_EPOCHS = 2                 # epochs for each grid-search run
BATCH_SIZE = 4
LEARNING_RATE = 1e-4
RANDOM_STATE = 42

# ---------------------------------------------------------------------------
# LoRA search space
# ---------------------------------------------------------------------------
R_VALUES = [4, 8]
ALPHA_VALUES = [16, 32]
DROPOUT_VALUES = [0.05, 0.1]
# Module names handed to LoraConfig(target_modules=...).
TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "up_proj",
    "down_proj",
    "gate_proj",
]


def load_data(lang: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the train and test CSV files for one language.

    Args:
        lang: Language code used in the file names
            (``train_<lang>.csv`` and ``test_<lang>.csv`` under ``BASE_PATH``).

    Returns:
        ``(train_df, test_df)`` as loaded by ``pandas.read_csv``.

    Raises:
        FileNotFoundError: If either file is missing. A hint is printed
            before the exception is re-raised.
    """
    frames = []
    try:
        # Train is read first, then test, matching the returned order.
        for split in ("train", "test"):
            csv_path = os.path.join(BASE_PATH, f"{split}_{lang}.csv")
            frames.append(pd.read_csv(csv_path))
    except FileNotFoundError:
        print(f"Error: Data files not found in {BASE_PATH}. Please check path.")
        raise

    train_df, test_df = frames
    return train_df, test_df


def preprocess_function(
    examples: Dict[str, List[Any]], tokenizer: AutoTokenizer
) -> Dict[str, Any]:
    """Turn a batch of rows into tokenized training prompts.

    Args:
        examples: Batch with parallel ``"text"`` and ``"labels"`` lists.
        tokenizer: Callable tokenizer applied to the rendered prompts.

    Returns:
        Whatever the tokenizer returns for the batch, truncated and padded
        to ``MAX_INPUT_LENGTH``.
    """
    prompts = []
    for sentence, label in zip(examples["text"], examples["labels"]):
        # Each prompt contains the sentence followed by its gold label.
        prompts.append(
            SPANISH_PROMPT_TEMPLATE_TRAIN.format(text=sentence, label=label)
        )

    encoded = tokenizer(
        prompts,
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
        padding="max_length",
    )
    return encoded


def setup_model_and_datasets(
    model_name: str, train_df: pd.DataFrame, test_df: pd.DataFrame
) -> Tuple[
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Dataset,
    Dataset
]:
    """Load tokenizer and base model, tokenize both splits, build the collator.

    Args:
        model_name: Model identifier passed to the ``from_pretrained`` loaders.
        train_df: Training rows; read by ``preprocess_function``.
        test_df: Test rows, processed the same way as ``train_df``.

    Returns:
        ``(base_model, tokenizer, data_collator, tokenized_train,
        tokenized_test)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
    # Reuse EOS for padding when the tokenizer ships without a pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Half precision on GPU, full precision on CPU.
    model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=AutoConfig.from_pretrained(model_name),
        torch_dtype=model_dtype,
        device_map="auto",
    )
    base_model.config.pad_token_id = tokenizer.eos_token_id
    base_model.config.use_cache = False
    # Make the embedding matrix match the tokenizer's vocabulary size.
    base_model.resize_token_embeddings(len(tokenizer))

    worker_count = os.cpu_count() or 1

    def _encode_batch(batch):
        # Bind the tokenizer so Dataset.map can call it with one argument.
        return preprocess_function(batch, tokenizer)

    def _tokenize_frame(frame):
        # Drop every original column; only the tokenizer output is kept.
        return Dataset.from_pandas(frame).map(
            _encode_batch,
            batched=True,
            remove_columns=frame.columns.tolist(),
            num_proc=worker_count,
        )

    tokenized_train = _tokenize_frame(train_df)
    tokenized_test = _tokenize_frame(test_df)

    # mlm=False selects the causal-LM collation mode.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    return base_model, tokenizer, collator, tokenized_train, tokenized_test


def predict_emotions_spanish(
    model: PeftModel,
    tokenizer: AutoTokenizer,
    df: pd.DataFrame,
    max_samples: Optional[int] = None
) -> Tuple[List[str], List[str]]:
    """Generate one label prediction per row of ``df``.

    Each row's ``text`` is rendered with ``SPANISH_PROMPT_TEMPLATE_PRED`` and
    completed greedily by ``model``. Only the newly generated tokens are
    decoded, lower-cased and reduced to their first line.

    Args:
        model: Model exposing ``generate``; it is switched to eval mode.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        df: Frame with ``text`` and ``labels`` columns.
        max_samples: If given, only the first ``max_samples`` rows are used.

    Returns:
        ``(preds, golds)``: the generated strings and ``str(row["labels"])``
        for the same rows, in row order.
    """
    preds: List[str] = []
    golds: List[str] = []

    df_to_process = df if max_samples is None else df.head(max_samples)
    total_samples = len(df_to_process)

    # Inputs must live on the same device as the model's first parameter.
    input_device = next(model.parameters()).device
    model.eval()

    for _, row in tqdm(
        df_to_process.iterrows(),
        total=total_samples,
        desc="Generating Predictions"
    ):
        prompt = SPANISH_PROMPT_TEMPLATE_PRED.format(text=row['text'])
        inputs = tokenizer(prompt, return_tensors="pt").to(input_device)

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                pad_token_id=tokenizer.eos_token_id,
                do_sample=False,
                # NOTE(review): unused by greedy decoding; presumably kept to
                # override a sampling temperature in the model's generation
                # config -- confirm before removing.
                temperature=1.0
            )

        # generate() returns prompt + continuation; slice the prompt off.
        input_length = inputs['input_ids'].shape[1]
        generated = tokenizer.decode(
            output[0][input_length:], skip_special_tokens=True
        ).lower().strip()

        # The training template puts the label on the same line as the
        # "Emoción:" cue and nothing after it, so anything after the first
        # line break is free-running continuation. Dropping it keeps stray
        # digits in that continuation from being counted as labels.
        pred_text = generated.split("\n", 1)[0].strip()

        preds.append(pred_text)
        golds.append(str(row["labels"]))

    return preds, golds


def compute_metrics_numeric(
    preds: List[str], golds: List[str]
) -> Dict[str, float]:
    """Score predicted label strings against gold label strings.

    Gold labels are read as comma-separated integers; predicted labels are
    every stand-alone single digit found in the prediction text. Both are
    converted to multi-hot rows over ``ALL_LABELS`` before scoring.

    Args:
        preds: Generated strings, one per sample.
        golds: Gold label strings, one per sample.

    Returns:
        Dict with ``micro_f1``, ``macro_f1``, ``jaccard`` (sample-averaged)
        and ``hamming``.
    """
    shape = (len(golds), len(ALL_LABELS))
    y_true = np.zeros(shape)
    y_pred = np.zeros(shape)

    def _mark(matrix, row_idx, label_ids):
        # Switch on the column of every id that is a known label.
        for label_id in label_ids:
            if label_id in ALL_LABELS:
                matrix[row_idx, ALL_LABELS.index(label_id)] = 1

    for row_idx, (gold, pred) in enumerate(zip(golds, preds)):
        gold_ids = []
        for piece in str(gold).split(","):
            piece = piece.strip()
            if piece.isdigit():
                gold_ids.append(piece)

        pred_ids = []
        for match in re.findall(r'\b\d\b', str(pred)):
            match = match.strip()
            if match.isdigit():
                pred_ids.append(match)

        _mark(y_true, row_idx, gold_ids)
        _mark(y_pred, row_idx, pred_ids)

    micro = f1_score(y_true, y_pred, average="micro", zero_division=0)
    macro = f1_score(y_true, y_pred, average="macro", zero_division=0)
    jaccard = jaccard_score(
        y_true, y_pred, average="samples", zero_division=0
    )
    hamming = hamming_loss(y_true, y_pred)

    return {
        "micro_f1": micro,
        "macro_f1": macro,
        "jaccard": jaccard,
        "hamming": hamming,
    }


def run_lora_grid_search(
    tokenizer: AutoTokenizer,
    data_collator: DataCollatorForLanguageModeling,
    tokenized_train: Dataset,
    tokenized_test: Dataset,
    test_df: pd.DataFrame,
    param_grid: List[Dict[str, float]]
) -> Dict[str, float]:
    """Train one LoRA adapter per grid point and return the best setting.

    For every entry of ``param_grid`` a fresh base model is loaded, wrapped
    with LoRA, trained for ``CV_EPOCHS`` epochs and scored on the first
    ``EVAL_SAMPLES_COUNT_CV`` rows of ``test_df``. Each adapter is saved
    under ``WEIGHTS_ROOT_DIR`` and all metrics are written to
    ``RESULTS_FILE``.

    NOTE(review): the grid is ranked on ``test_df``, and ``main`` passes the
    same frame to the final evaluation, so the final scores are not from
    unseen data. Consider a separate validation split.

    Args:
        tokenizer: Tokenizer shared by all runs.
        data_collator: Collator passed to every ``Trainer``.
        tokenized_train: Tokenized training set.
        tokenized_test: Tokenized set used as the Trainer's eval dataset.
        test_df: Raw frame used for generation-based scoring.
        param_grid: Dicts with keys ``r``, ``lora_alpha``, ``lora_dropout``.

    Returns:
        The grid entry with the highest ``micro_f1``, or an empty dict when
        ``param_grid`` is empty.
    """
    print("\n" + "=" * 50)
    print(f"1) STARTING LORA HYPERPARAMETER GRID SEARCH ({len(param_grid)} runs)")
    print("=" * 50)

    grid_results = []
    best_f1 = -1.0
    best_params: Dict[str, float] = {}

    for params in param_grid:
        r_val = params["r"]
        alpha_val = params["lora_alpha"]
        dropout_val = params["lora_dropout"]
        print(
            f"\n=== Training with LoRA params: r={r_val}, "
            f"alpha={alpha_val}, dropout={dropout_val} ==="
        )

        # A fresh base model per run so adapters never stack on each other.
        base_model_run = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=(
                torch.float16 if torch.cuda.is_available() else torch.float32
            ),
            device_map="auto"
        )
        base_model_run.config.pad_token_id = tokenizer.eos_token_id
        base_model_run.config.use_cache = False
        base_model_run.resize_token_embeddings(len(tokenizer))

        lora_config = LoraConfig(
            r=r_val,
            lora_alpha=alpha_val,
            target_modules=TARGET_MODULES,
            lora_dropout=dropout_val,
            bias="none",
            task_type="CAUSAL_LM"
        )
        model = get_peft_model(base_model_run, lora_config)
        model.gradient_checkpointing_enable()
        model.enable_input_require_grads()

        output_dir = os.path.join(
            OUTPUT_ROOT_DIR,
            f"results_lora_stablelm_r{r_val}_a{alpha_val}_d{dropout_val}"
        )
        training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            learning_rate=LEARNING_RATE,
            num_train_epochs=CV_EPOCHS,
            eval_strategy="epoch",
            save_strategy="no",
            logging_dir=LOG_ROOT_DIR,
            report_to="none",
            fp16=False,
            bf16=torch.cuda.is_available(),
            seed=RANDOM_STATE,
            gradient_checkpointing=True,
            max_grad_norm=1.0,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_test,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

        trainer.train()

        preds, golds = predict_emotions_spanish(
            model, tokenizer, test_df, max_samples=EVAL_SAMPLES_COUNT_CV
        )
        metrics = compute_metrics_numeric(preds, golds)
        metrics.update(params)
        grid_results.append(metrics)

        print(
            f"Results for r={r_val}, alpha={alpha_val}, "
            f"dropout={dropout_val}: {metrics}"
        )

        save_dir = os.path.join(
            WEIGHTS_ROOT_DIR, f"lora_r{r_val}_a{alpha_val}_d{dropout_val}"
        )
        os.makedirs(save_dir, exist_ok=True)
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)

        if metrics['micro_f1'] > best_f1:
            best_f1 = metrics['micro_f1']
            best_params = params

        # Persist after every run so a crash in a later run does not lose
        # the metrics gathered so far.
        pd.DataFrame(grid_results).to_csv(RESULTS_FILE, index=False)

        del model, base_model_run, trainer
        # Collect first: objects kept alive only by reference cycles still
        # own their tensors, and empty_cache() can only release memory that
        # no tensor occupies.
        gc.collect()
        torch.cuda.empty_cache()

    if not grid_results:
        # Nothing to rank; the caller falls back to default parameters.
        print("\n No experiments were run: the parameter grid is empty.")
        return best_params

    results_df = pd.DataFrame(grid_results)
    results_df.to_csv(RESULTS_FILE, index=False)

    print("\n All experiments complete. Summary:")
    ranked = results_df.sort_values("micro_f1", ascending=False)
    try:
        summary = ranked.to_markdown(index=False)
    except ImportError:
        # to_markdown needs the optional `tabulate` package; a missing
        # pretty-printer must not discard the result of the whole search.
        summary = ranked.to_string(index=False)
    print(summary)

    return best_params


def run_final_fine_tuning(
    best_params: Dict[str, float],
    tokenizer: AutoTokenizer,
    data_collator: DataCollatorForLanguageModeling,
    tokenized_train: Dataset,
    tokenized_test: Dataset,
    final_epochs: int,
    test_df: pd.DataFrame
) -> None:
    """Train the final LoRA adapter, save it and evaluate it on ``test_df``.

    The adapter is trained for ``final_epochs`` epochs with the given LoRA
    parameters, saved to ``FINAL_SAVE_DIR``, reloaded onto a fresh base
    model, merged, and scored on every row of ``test_df``. Metrics are
    printed; nothing is returned.

    Args:
        best_params: Dict with ``r``, ``lora_alpha`` and ``lora_dropout``.
            When empty, ``r=8, lora_alpha=16, lora_dropout=0.05`` is used.
        tokenizer: Tokenizer used for training, saving and generation.
        data_collator: Collator passed to the ``Trainer``.
        tokenized_train: Tokenized training set.
        tokenized_test: Tokenized set used as the Trainer's eval dataset;
            its ``eval_loss`` selects the checkpoint restored at the end.
        final_epochs: Number of training epochs.
        test_df: Raw frame used for generation-based scoring.
    """
    print("\n" + "=" * 50)
    print(f"2) STARTING FINAL FINE-TUNING ({final_epochs} epochs) WITH BEST PARAMS")
    if best_params:
        print(
            f"   Best Params: R={best_params.get('r')}, "
            f"Alpha={best_params.get('lora_alpha')}, "
            f"Dropout={best_params.get('lora_dropout')}"
        )
    else:
        print("   Warning: No best parameters found from grid search. Using default.")
        best_params = {"r": 8, "lora_alpha": 16, "lora_dropout": 0.05}
    print("=" * 50)

    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=(
            torch.float16 if torch.cuda.is_available() else torch.float32
        ),
        device_map="auto"
    )
    base_model.config.pad_token_id = tokenizer.eos_token_id
    base_model.config.use_cache = False
    base_model.resize_token_embeddings(len(tokenizer))

    lora_config = LoraConfig(
        r=best_params["r"],
        lora_alpha=best_params["lora_alpha"],
        target_modules=TARGET_MODULES,
        lora_dropout=best_params["lora_dropout"],
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
    model.print_trainable_parameters()

    output_dir = os.path.join(
        OUTPUT_ROOT_DIR, "results_lora_stablelm_best_final"
    )
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        num_train_epochs=final_epochs,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        logging_dir=os.path.join(LOG_ROOT_DIR, "final"),
        report_to="none",
        fp16=False,
        bf16=torch.cuda.is_available(),
        seed=RANDOM_STATE,
        gradient_checkpointing=True,
        max_grad_norm=1.0,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    model.save_pretrained(FINAL_SAVE_DIR)
    tokenizer.save_pretrained(FINAL_SAVE_DIR)
    print(f"\nFinal fine-tuned model saved to {FINAL_SAVE_DIR}")

    print("\n" + "=" * 50)
    print(f"3) EVALUATION OF BEST MODEL ON FULL TEST SET ({len(test_df)} samples)")
    print("=" * 50)

    # Drop the training objects before loading a second copy of the base
    # model. Collect first: empty_cache() can only release memory that no
    # tensor occupies, and cyclic garbage still owns its tensors.
    del model, base_model, trainer
    gc.collect()
    torch.cuda.empty_cache()

    base_model_eval = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=(
            torch.float16 if torch.cuda.is_available() else torch.float32
        ),
        device_map="auto"
    )
    base_model_eval.config.pad_token_id = tokenizer.eos_token_id
    base_model_eval.resize_token_embeddings(len(tokenizer))

    # Reload the saved adapter and fold it into the base weights.
    final_model = PeftModel.from_pretrained(base_model_eval, FINAL_SAVE_DIR)
    final_model = final_model.merge_and_unload()
    final_model.eval()

    preds, golds = predict_emotions_spanish(
        final_model, tokenizer, test_df, max_samples=None
    )
    final_metrics = compute_metrics_numeric(preds, golds)

    print("\n=============================================")
    # Report the epoch count this call actually trained for, not the
    # module-level default.
    print(
        f"FINAL EVALUATION RESULTS ({MODEL_NAME} + LoRA, {final_epochs} Epochs)"
    )
    print("=============================================")
    for k, v in final_metrics.items():
        print(f"{k}: {v:.4f}")
    print("=============================================")

    del final_model, base_model_eval
    gc.collect()
    torch.cuda.empty_cache()


def main() -> None:
    """Run the whole pipeline: load data, grid-search LoRA, final training.

    Creates the output directories, loads the CSV data for ``LANGUAGE``,
    tokenizes it, runs the LoRA grid search over ``R_VALUES`` x
    ``ALPHA_VALUES`` x ``DROPOUT_VALUES`` and then trains and evaluates the
    final model with the best setting. Returns early if the data files are
    missing (``load_data`` has already printed the reason).
    """
    os.makedirs(WEIGHTS_ROOT_DIR, exist_ok=True)
    os.makedirs(OUTPUT_ROOT_DIR, exist_ok=True)
    os.makedirs(LOG_ROOT_DIR, exist_ok=True)

    try:
        train_df, test_df = load_data(LANGUAGE)
    except FileNotFoundError:
        return

    base_model, tokenizer, data_collator, tokenized_train, tokenized_test = (
        setup_model_and_datasets(MODEL_NAME, train_df, test_df)
    )

    # Each training run loads its own base model, so this copy is not
    # needed. Collect before emptying the cache: empty_cache() can only
    # release memory that no tensor occupies any more.
    del base_model
    gc.collect()
    torch.cuda.empty_cache()

    param_grid = [
        {"r": r, "lora_alpha": alpha, "lora_dropout": drop}
        for r, alpha, drop in product(
            R_VALUES, ALPHA_VALUES, DROPOUT_VALUES
        )
    ]

    best_params = run_lora_grid_search(
        tokenizer, data_collator, tokenized_train,
        tokenized_test, test_df, param_grid
    )

    run_final_fine_tuning(
        best_params, tokenizer, data_collator, tokenized_train,
        tokenized_test, FINAL_EPOCHS, test_df
    )


# Run the pipeline only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()