# Curation and creation of data for LLM finetuning

## 0. Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import json

# Make the project's modules importable from this notebook: the project root is
# taken to be the parent of the current working directory.
project_root = Path().absolute().parents[0]
sys.path.append(os.fspath(project_root))

## 1. LLaMA Parameter Optimisation

*Question 1: Which model does the training save? Which performance metric is the choice based on?*

`load_best_model_at_end=True,` combined with `eval_strategy="epoch", save_strategy="epoch", save_total_limit=1,` means that:

- The model is evaluated at the end of each epoch.
- Only the best model according to the default metric is retained at the end (save_total_limit=1 prevents clutter).
- `Trainer` will automatically reload the best-performing checkpoint at the end based on the evaluation loss (by default).

So **the model saved is the one with the lowest validation loss at the end of its epoch**.


*Question 2: Which training/LoRA parameters can be explored to improve performance?*

There are two optimisation targets:
- LoRA configuration
- Training hyperparameters

(A) LoRA parameters (from LoraConfig) include
- r (e.g. 4 to 32)
- lora_alpha (e.g. 8 to 64)
- target modules (e.g. q_proj, k_proj, v_proj, o_proj, and optionally gate_proj, down_proj, up_proj)
- lora_dropout (0, or 0.05, 0.1 if overfitting)

(B) Training hyperparameters (from TrainingArguments)
- learning_rate (e.g. 9e-5 to 2e-4)
- per_device_train_batch_size (e.g. 4 to 16)
- num_train_epochs
- warmup_ratio
- lr_scheduler_type
- weight_decay
- gradient_accumulation_steps

In [5]:
import re
import pandas as pd
from pathlib import Path

# Paths
log_path = "../llama_search_runs/search_progress_log.txt"
output_dir = Path("../results/latex_tables")
output_file = output_dir / "top5_lora_search.tex"

# Patterns are compiled once here instead of being rebuilt for every log line.
# The run name encodes the hyperparameters: run_<n>_r<r>_alpha<a>_drop<d>_lr<lr>_bs<bs>.
RUN_PATTERN = re.compile(r"run_\d+_r(\d+)_alpha(\d+)_drop([0-9.]+)_lr([\deE.-]+)_bs(\d+)")
F1_PATTERN = re.compile(r"macro_f1: ([0-9.]+)")
LOSS_PATTERN = re.compile(r"val_loss: ([0-9.]+)")

# Columns that are displayed with a fixed number of decimals in the table.
TWO_DECIMAL_COLUMNS = ("LoRA dropout", "Macro-F1", "Val loss")


def parse_search_log(lines):
    """Extract one record per log line that describes a finished search run.

    A line is kept only if it contains a run name, a ``macro_f1: <value>`` and
    a ``val_loss: <value>``; every other line is silently skipped.

    Args:
        lines: Iterable of log lines (a list of strings or an open text file).

    Returns:
        A list of dicts keyed by the table's column headers. The learning rate
        is kept as the string found in the run name (e.g. ``"2e-4"``) so that
        it is printed exactly as it was configured.
    """
    entries = []
    for line in lines:
        run_match = RUN_PATTERN.search(line)
        f1_match = F1_PATTERN.search(line)
        loss_match = LOSS_PATTERN.search(line)
        if not (run_match and f1_match and loss_match):
            continue
        r, alpha, drop, lr, bs = run_match.groups()
        entries.append({
            "LoRA rank ($r$)": int(r),
            "LoRA alpha": int(alpha),
            "LoRA dropout": float(drop),
            "Learning rate": lr,
            "Batch size": int(bs),
            "Macro-F1": float(f1_match.group(1)),
            "Val loss": float(loss_match.group(1)),
        })
    return entries


def format_top_runs(df, k: int = 5):
    """Return the ``k`` best runs by Macro-F1 as an all-string, LaTeX-ready frame.

    Args:
        df: DataFrame built from ``parse_search_log`` records.
        k: Number of rows to keep.

    Returns:
        A new DataFrame (index ``0..k-1``, best run first) whose cells are
        strings; the row with the highest Macro-F1 is wrapped in ``\\textbf{}``.

    Raises:
        ValueError: If ``df`` is empty, i.e. no log line could be parsed.
    """
    if df.empty:
        # Without this guard the sort below fails with an opaque KeyError
        # because an empty frame has no "Macro-F1" column.
        raise ValueError(f"No completed runs could be parsed from {log_path}")

    top = df.sort_values("Macro-F1", ascending=False).head(k).reset_index(drop=True)

    # Pick the best row while the scores are still numeric.
    best_idx = top["Macro-F1"].idxmax()

    # Fixed two-decimal formatting keeps trailing zeros (0.90, not 0.9), which
    # plain round() followed by str() would drop.
    for column in TWO_DECIMAL_COLUMNS:
        top[column] = top[column].map("{:.2f}".format)

    # Every cell must be a string before LaTeX markup can be injected.
    top = top.astype(str)
    top.loc[best_idx] = top.loc[best_idx].apply(lambda x: f"\\textbf{{{x}}}")
    return top


# In a notebook kernel (or when run as a script) __name__ is "__main__", so the
# table is generated as before; the guard only keeps the file I/O from running
# when the helpers above are imported from elsewhere.
if __name__ == "__main__":
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read log
    with open(log_path, "r", encoding="utf-8") as f:
        log_lines = f.readlines()

    entries = parse_search_log(log_lines)

    # DataFrame and top 5 (rounded, stringified, best row in bold)
    df = pd.DataFrame(entries)
    top5 = format_top_runs(df, k=5)

    # Generate LaTeX; escape=False keeps the \textbf{} markup and the $r$ header intact
    latex_table = top5.to_latex(index=False, escape=False)

    # Centre the tabular and scale it to the line width. No caption or label is
    # emitted here, so the including document has to provide them.
    wrapped_latex = (
        "\n"
        "\\centering\n"
        "\\resizebox{\\linewidth}{!}{%\n"
        f"{latex_table}\n"
        "}\n"
    )

    # Save LaTeX file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(wrapped_latex)

    print(f"✅ LaTeX table with bolded best row saved to {output_file}")

✅ LaTeX table with bolded best row saved to ../results/latex_tables/top5_lora_search.tex


## 2. RoBERTa Parameter Optimisation