# Proyecto Final: Comparación de Métodos de Fine-tuning

**Autor:** [Tu Nombre]  
**Fecha:** Diciembre 2025  
**Curso:** [Nombre del Curso]  

---

## Resumen Ejecutivo

Este proyecto evalúa diferentes métodos de fine-tuning para modelos de lenguaje, comparando:
- **Fine-tuning completo** (base_qa)
- **LoRA** (Low-Rank Adaptation)
- **QLoRA** (Quantized LoRA)
- **DoRA** (Weight-Decomposed Low-Rank Adaptation)
- **QDoRA** (Quantized DoRA)

Se analizaron múltiples configuraciones variando:
- Número de épocas (2 y 3)
- Learning rates (2e-4, 4.25e-5)

Los resultados se evaluaron usando las métricas F1-Score y Exact Match.

## 1. Configuración del Entorno

In [1]:
# Importar librerías necesarias
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Plot defaults applied to every figure created afterwards
plt.rcParams["font.size"] = 11
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams["axes.grid"] = True
plt.rcParams["grid.alpha"] = 0.3
sns.set_palette("husl")

# pandas display: never hide columns, print floats with three decimals
pd.options.display.max_columns = None
pd.options.display.precision = 3

## 2. Funciones de Utilidad (CORREGIDAS)

In [2]:
def find_experiment_folder(base_path, keyword):
    """
    Locate an experiment directory inside ``base_path`` by keyword.

    A directory named exactly ``keyword`` wins. Otherwise the first
    directory, in sorted name order, whose name contains ``keyword``
    (case-insensitive) is returned; e.g. 'lora' finds
    'lora-epoch-2_lr_2e-4'.

    The substring match is deliberately loose, so a short keyword such as
    'lora' also matches 'qlora_...'; pass the full folder name when the
    choice must be unambiguous.

    Args:
        base_path: Directory that holds one sub-folder per experiment.
        keyword: Exact folder name or a fragment of it.

    Returns:
        The path of the matching directory, or None when ``base_path`` is
        None / not a directory or nothing matches.
    """
    if base_path is None or not os.path.isdir(base_path):
        return None

    # Exact name first, so a full folder name is never shadowed by a
    # sibling that merely contains it.
    exact_path = os.path.join(base_path, keyword)
    if os.path.isdir(exact_path):
        return exact_path

    needle = keyword.lower()
    # os.listdir returns entries in arbitrary order; sorting makes the
    # chosen folder reproducible from run to run.
    for entry in sorted(os.listdir(base_path)):
        candidate = os.path.join(base_path, entry)
        # Only directories qualify: callers list the result's contents.
        if needle in entry.lower() and os.path.isdir(candidate):
            return candidate

    return None


def load_metrics_from_trainer_state(exp_path):
    """
    Read the final evaluation metrics from ``trainer_state.json``.

    The file is taken from the checkpoint sub-folder with the highest
    numeric suffix (``checkpoint-1000`` beats ``checkpoint-500``), or from
    ``exp_path`` itself when there are no checkpoint folders. The last
    ``log_history`` entry that contains ``eval_f1`` supplies the values.

    NOTE(review): assumes checkpoint folders are named
    ``checkpoint-<step>``; folders without a numeric suffix rank below all
    numbered ones.

    Args:
        exp_path: Directory of a single experiment.

    Returns:
        ``{"F1": ..., "EM": ...}`` or None when the directory, the file or
        the expected keys are missing, or the JSON cannot be parsed.
    """
    if exp_path is None or not os.path.isdir(exp_path):
        return None

    def _step(path):
        # Numeric suffix after the last '-', or -1 when there is none.
        suffix = os.path.basename(path).rpartition("-")[2]
        return int(suffix) if suffix.isdecimal() else -1

    checkpoints = [
        os.path.join(exp_path, d)
        for d in os.listdir(exp_path)
        if d.startswith("checkpoint") and os.path.isdir(os.path.join(exp_path, d))
    ]

    if checkpoints:
        # Compare by step number: a plain string sort would rank
        # 'checkpoint-500' after 'checkpoint-1000'.
        latest = max(checkpoints, key=lambda p: (_step(p), p))
        trainer_state = os.path.join(latest, "trainer_state.json")
    else:
        # No checkpoints: look for the file in the experiment root.
        trainer_state = os.path.join(exp_path, "trainer_state.json")

    try:
        with open(trainer_state, encoding="utf-8") as f:
            state = json.load(f)

        eval_logs = [entry for entry in state["log_history"] if "eval_f1" in entry]
        if not eval_logs:
            return None

        last = eval_logs[-1]
        return {
            "F1": last["eval_f1"],
            "EM": last["eval_exact_match"]
        }
    except (OSError, ValueError, KeyError, TypeError):
        # Missing/unreadable file, invalid JSON, or unexpected structure.
        return None


def training_time_from_tensorboard(exp_path):
    """
    Estimate training duration from TensorBoard event files.

    Walks ``exp_path`` for the first directory that contains a file with
    'tfevents' in its name, loads it with ``EventAccumulator`` and returns
    the spread between the earliest and latest ``wall_time`` over all
    scalar tags.

    NOTE(review): only the first event directory found is used, and
    ``EventAccumulator`` is created with its default settings, which may
    subsample long scalar series — confirm this is precise enough.

    Args:
        exp_path: Directory of a single experiment.

    Returns:
        Elapsed seconds as a float, or None when the path is missing, no
        event files exist, fewer than two scalar events were read, or the
        event data could not be loaded.
    """
    if exp_path is None or not os.path.exists(exp_path):
        return None

    event_dir = None
    for root, dirs, files in os.walk(exp_path):
        # Sort in place so the traversal, and therefore the directory
        # picked, does not depend on filesystem ordering.
        dirs.sort()
        if any("tfevents" in f for f in files):
            event_dir = root
            break

    if event_dir is None:
        return None

    try:
        # An absolute path is passed to EventAccumulator on purpose.
        ea = EventAccumulator(os.path.abspath(event_dir))
        ea.Reload()
        times = [
            e.wall_time
            for tag in ea.Tags().get("scalars", [])
            for e in ea.Scalars(tag)
        ]
    except Exception:
        # Best effort: unreadable event data means "no timing available".
        # KeyboardInterrupt / SystemExit are intentionally not swallowed.
        return None

    if len(times) < 2:
        return None

    return max(times) - min(times)


def training_time_from_files(exp_path):
    """
    Estimate a duration from file modification times.

    Fallback for when TensorBoard data is unavailable: returns the spread
    between the oldest and newest modification time of the files inside
    the checkpoint folder with the highest numeric suffix.

    NOTE(review): this is the mtime spread inside ONE checkpoint folder,
    which may be far shorter than the whole training run — confirm it is
    an acceptable proxy. Also assumes folders are named
    ``checkpoint-<step>``; names without a numeric suffix rank lowest.

    Args:
        exp_path: Directory of a single experiment.

    Returns:
        Elapsed seconds as a float, or None when the directory is missing,
        has no checkpoint folders, or fewer than two files could be
        inspected.
    """
    if exp_path is None or not os.path.isdir(exp_path):
        return None

    def _step(path):
        # Numeric suffix after the last '-', or -1 when there is none.
        suffix = os.path.basename(path).rpartition("-")[2]
        return int(suffix) if suffix.isdecimal() else -1

    checkpoints = [
        os.path.join(exp_path, d)
        for d in os.listdir(exp_path)
        if d.startswith("checkpoint") and os.path.isdir(os.path.join(exp_path, d))
    ]

    if not checkpoints:
        return None

    # Compare by step number: a plain string sort would rank
    # 'checkpoint-500' after 'checkpoint-1000'.
    latest = max(checkpoints, key=lambda p: (_step(p), p))

    times = []
    for root, _, files in os.walk(latest):
        for f in files:
            try:
                times.append(os.path.getmtime(os.path.join(root, f)))
            except OSError:
                # File vanished or is unreadable: skip it.
                continue

    if len(times) < 2:
        return None

    return max(times) - min(times)


def training_time_hybrid(exp_path):
    """
    Return the training duration using the first source that yields one.

    TensorBoard event data is tried first; file timestamps are the
    fallback. The result is None when neither source produces a value.
    """
    for estimate in (training_time_from_tensorboard, training_time_from_files):
        elapsed = estimate(exp_path)
        if elapsed is not None:
            return elapsed
    return None

## 3. Configuración de Ruta y Experimentos

In [3]:
# Root folder with one sub-directory per experiment (adjust to your system)
BASE_RESULTS = "/data/nina/qa_squad/qa_bertimbau/bertimbau_base/results"

# Report the configured path and whether it is reachable
print(f"Ruta configurada: {BASE_RESULTS}")
print(f"¬øExiste?: {os.path.exists(BASE_RESULTS)}")

if not os.path.exists(BASE_RESULTS):
    print(" ERROR: La ruta no existe. Ajusta BASE_RESULTS.")
else:
    # Count only sub-directories; loose files are not experiments
    exp_count = sum(
        1
        for d in os.listdir(BASE_RESULTS)
        if os.path.isdir(os.path.join(BASE_RESULTS, d))
    )
    print(f"‚úì Experimentos disponibles: {exp_count}")

Ruta configurada: /data/nina/qa_squad/qa_bertimbau/bertimbau_base/results
¬øExiste?: True
‚úì Experimentos disponibles: 18


## 4. Definición de Experimentos (NOMBRES EXACTOS)

In [None]:
# Mapping of display label -> results folder name for every experiment.
# Each method contributes 2 epoch settings x 2 learning rates. The label
# spells the small learning rate "4.25e-5" while the folder name uses
# "4.25e-05"; "Base QA" folders follow a different naming scheme from the
# adapter methods.
experiments = {
    f"{label} - {epochs} epochs (lr={lr_label})": folder.format(epochs=epochs, lr=lr_dir)
    for label, folder in (
        ("Base QA", "{epochs}_epochs_base_qa_{lr}"),
        ("DoRA", "dora_epoch-{epochs}_lr_{lr}"),
        ("QDoRA", "qdora_epoch-{epochs}_lr_{lr}"),
        ("LoRA", "lora_epoch-{epochs}_lr_{lr}"),
        ("QLoRA", "qlora_epoch-{epochs}_lr_{lr}"),
    )
    for epochs in (2, 3)
    for lr_label, lr_dir in (("2e-4", "2e-4"), ("4.25e-5", "4.25e-05"))
}

print(f"Total de experimentos definidos: {len(experiments)}")

Total de experimentos definidos: 20


## 5. Recopilación de Resultados

In [24]:
# Build one summary row per experiment that has both a folder and metrics
rows = []

print("Recopilando datos...\n")
for name, key in experiments.items():
    exp_path = find_experiment_folder(BASE_RESULTS, key)
    if exp_path is None:
        print(f"‚úó {name}: No encontrado")
        continue

    metrics = load_metrics_from_trainer_state(exp_path)
    if metrics is None:
        print(f"‚ö† {name}: Sin m√©tricas")
        continue

    # Duration comes from the hybrid estimator; a missing or zero value
    # is stored as None rather than as 0 minutes.
    time_sec = training_time_hybrid(exp_path)
    if time_sec:
        minutes = round(time_sec / 60, 1)
    else:
        minutes = None

    row = {
        "Method": name,
        "F1": round(metrics["F1"], 2),
        "Exact Match": round(metrics["EM"], 2),
        "Training Time (min)": minutes,
    }
    rows.append(row)

    print(f"‚úì {name}: F1={metrics['F1']:.2f}, EM={metrics['EM']:.2f}")

df_results = pd.DataFrame(rows)

# Closing banner with the collected/defined counts
print(f"\n{'='*80}")
print(f"‚úì Total recopilado: {len(df_results)} de {len(experiments)} experimentos")
print(f"{'='*80}")

Recopilando datos...

‚úì Base QA - 2 epochs (lr=2e-4): F1=79.74, EM=67.15
‚úì Base QA - 2 epochs (lr=4.25e-5): F1=82.79, EM=70.91
‚úì Base QA - 3 epochs (lr=2e-4): F1=78.33, EM=65.54
‚úì Base QA - 3 epochs (lr=4.25e-5): F1=82.18, EM=70.40
‚úì DoRA - 2 epochs (lr=2e-4): F1=78.01, EM=64.89
‚úì DoRA - 2 epochs (lr=4.25e-5): F1=71.36, EM=57.68
‚úì DoRA - 3 epochs (lr=2e-4): F1=78.27, EM=65.08
‚úì DoRA - 3 epochs (lr=4.25e-5): F1=71.50, EM=57.65
‚úì QDoRA - 2 epochs (lr=2e-4): F1=74.41, EM=61.26
‚úì QDoRA - 2 epochs (lr=4.25e-5): F1=54.10, EM=41.15
‚úì QDoRA - 3 epochs (lr=2e-4): F1=74.85, EM=61.58
‚úì QDoRA - 3 epochs (lr=4.25e-5): F1=58.42, EM=45.37
‚úó LoRA - 2 epochs (lr=2e-4): No encontrado
‚úì LoRA - 2 epochs (lr=4.25e-5): F1=71.81, EM=58.07
‚úì LoRA - 3 epochs (lr=2e-4): F1=78.01, EM=65.03
‚úì LoRA - 3 epochs (lr=4.25e-5): F1=72.01, EM=58.32
‚úó QLoRA - 2 epochs (lr=2e-4): No encontrado
‚úì QLoRA - 2 epochs (lr=4.25e-5): F1=53.52, EM=40.54
‚úó QLoRA - 3 epochs (lr=2e-4): No encontra

## 6. Enriquecer Datos con Información Adicional

In [17]:
# Extraer información del nombre
def extract_method(name):
    """
    Return the method label at the start of an experiment name.

    The label is everything before the first ' - ' separator
    ('Base QA - 2 epochs (lr=2e-4)' -> 'Base QA'). Without a separator the
    first whitespace-delimited word is used. An empty or whitespace-only
    name yields None instead of raising IndexError.
    """
    head, separator, _ = name.partition(" - ")
    if separator:
        return head
    words = name.split()
    return words[0] if words else None

def extract_epochs(name):
    """
    Return the epoch count written as '<N> epochs' in an experiment name.

    The whole number in front of ' epochs' is parsed, so '12 epochs' gives
    12 rather than being mistaken for '2 epochs'. Any count is accepted,
    not only 2 and 3. Returns None when the name has no such fragment.
    """
    import re  # local import: this is the only place the module is needed

    match = re.search(r"(\d+) epochs", name)
    return int(match.group(1)) if match else None

def extract_lr(name):
    """
    Return the learning rate found in an experiment name as a string.

    The first '<mantissa>e-<exponent>' literal is used and leading zeros
    of the exponent are dropped, so '4.25e-05' and '4.25e-5' both yield
    '4.25e-5'. The full mantissa is kept, so '4.2e-4' is not mistaken for
    '2e-4'. Returns None when the name contains no such literal.
    """
    import re  # local import: this is the only place the module is needed

    match = re.search(r"(\d+(?:\.\d+)?)e-0*(\d+)", name)
    if match is None:
        return None
    mantissa, exponent = match.groups()
    return f"{mantissa}e-{exponent}"

# Final column layout of the enriched table
column_order = ['Experimento', 'M√©todo', '√âpocas', 'Learning Rate', 
                'F1-Score', 'Exact Match', 'Tiempo (min)']

# Derive method / epochs / learning rate from the experiment label, give
# the original columns their display names, then apply the layout above
df_results = (
    df_results
    .assign(**{
        'M√©todo': df_results['Method'].apply(extract_method),
        '√âpocas': df_results['Method'].apply(extract_epochs),
        'Learning Rate': df_results['Method'].apply(extract_lr),
    })
    .rename(columns={
        'Method': 'Experimento',
        'F1': 'F1-Score',
        'Training Time (min)': 'Tiempo (min)'
    })
    [column_order]
)

print("‚úì Datos enriquecidos")
df_results.head(10)

‚úì Datos enriquecidos


Unnamed: 0,Experimento,M√©todo,√âpocas,Learning Rate,F1-Score,Exact Match,Tiempo (min)
0,Base QA - 2 epochs (lr=2e-4),Base QA,2,0.0002,79.74,67.15,100.0
1,Base QA - 2 epochs (lr=4.25e-5),Base QA,2,4.25e-05,82.79,70.91,100.1
2,Base QA - 3 epochs (lr=2e-4),Base QA,3,0.0002,78.33,65.54,149.1
3,Base QA - 3 epochs (lr=4.25e-5),Base QA,3,4.25e-05,82.18,70.4,148.9
4,DoRA - 2 epochs (lr=2e-4),DoRA,2,0.0002,78.01,64.89,40.4
5,DoRA - 2 epochs (lr=4.25e-5),DoRA,2,4.25e-05,71.36,57.68,40.2
6,DoRA - 3 epochs (lr=2e-4),DoRA,3,0.0002,78.27,65.08,60.0
7,DoRA - 3 epochs (lr=4.25e-5),DoRA,3,4.25e-05,71.5,57.65,54.0
8,QDoRA - 2 epochs (lr=2e-4),QDoRA,2,0.0002,74.41,61.26,42.1
9,QDoRA - 2 epochs (lr=4.25e-5),QDoRA,2,4.25e-05,54.1,41.15,42.2


## 7. Tabla de Resultados Completa

In [18]:
# Rank every experiment from the highest to the lowest F1-Score
df_sorted = df_results.sort_values(by='F1-Score', ascending=False)
df_sorted

Unnamed: 0,Experimento,M√©todo,√âpocas,Learning Rate,F1-Score,Exact Match,Tiempo (min)
1,Base QA - 2 epochs (lr=4.25e-5),Base QA,2,4.25e-05,82.79,70.91,100.1
3,Base QA - 3 epochs (lr=4.25e-5),Base QA,3,4.25e-05,82.18,70.4,148.9
0,Base QA - 2 epochs (lr=2e-4),Base QA,2,0.0002,79.74,67.15,100.0
2,Base QA - 3 epochs (lr=2e-4),Base QA,3,0.0002,78.33,65.54,149.1
6,DoRA - 3 epochs (lr=2e-4),DoRA,3,0.0002,78.27,65.08,60.0
4,DoRA - 2 epochs (lr=2e-4),DoRA,2,0.0002,78.01,64.89,40.4
10,QDoRA - 3 epochs (lr=2e-4),QDoRA,3,0.0002,74.85,61.58,44.6
8,QDoRA - 2 epochs (lr=2e-4),QDoRA,2,0.0002,74.41,61.26,42.1
7,DoRA - 3 epochs (lr=4.25e-5),DoRA,3,4.25e-05,71.5,57.65,54.0
5,DoRA - 2 epochs (lr=4.25e-5),DoRA,2,4.25e-05,71.36,57.68,40.2


## 8. Análisis por Método

In [12]:
# Per-method summary statistics for each metric, rounded to two decimals
# ('count' is requested for F1-Score only)
method_stats = (
    df_results
    .groupby('M√©todo')
    .agg({
        'F1-Score': ['mean', 'std', 'min', 'max', 'count'],
        'Exact Match': ['mean', 'std', 'min', 'max'],
        'Tiempo (min)': ['mean', 'std', 'min', 'max'],
    })
    .round(2)
)

print("Estad√≠sticas por M√©todo:")
method_stats

Estad√≠sticas por M√©todo:


Unnamed: 0_level_0,F1-Score,F1-Score,F1-Score,F1-Score,F1-Score,Exact Match,Exact Match,Exact Match,Exact Match,Tiempo (min),Tiempo (min),Tiempo (min),Tiempo (min)
Unnamed: 0_level_1,mean,std,min,max,count,mean,std,min,max,mean,std,min,max
M√©todo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Base QA,80.76,2.09,78.33,82.79,4,68.5,2.58,65.54,70.91,124.52,28.26,100.0,149.1
DoRA,74.78,3.88,71.36,78.27,4,61.32,4.23,57.65,65.08,48.65,9.95,40.2,60.0
QDoRA,65.44,10.75,54.1,74.85,4,52.34,10.63,41.15,61.58,45.98,6.13,42.1,55.0
QLoRA,53.36,0.23,53.19,53.52,2,40.17,0.52,39.81,40.54,35.15,7.28,30.0,40.3


In [13]:
# For each method keep the row with the highest F1-Score, restricted to
# the summary columns and ordered from best to worst
best_configs = (
    df_results
    .loc[
        df_results.groupby('M√©todo')['F1-Score'].idxmax(),
        ['M√©todo', '√âpocas', 'Learning Rate', 'F1-Score', 'Exact Match', 'Tiempo (min)'],
    ]
    .sort_values('F1-Score', ascending=False)
)

print("\nMejores configuraciones por m√©todo:")
best_configs


Mejores configuraciones por m√©todo:


Unnamed: 0,M√©todo,√âpocas,Learning Rate,F1-Score,Exact Match,Tiempo (min)
1,Base QA,2,4.25e-05,82.79,70.91,100.1
6,DoRA,3,0.0002,78.27,65.08,60.0
10,QDoRA,3,0.0002,74.85,61.58,44.6
12,QLoRA,2,4.25e-05,53.52,40.54,30.0


## 9. Visualizaciones

In [None]:
# Horizontal bar chart of F1-Score, one bar per experiment. Sorting in
# ascending order puts the best experiment at the top of the chart.
df_plot = df_results.sort_values('F1-Score', ascending=True)
positions = range(len(df_plot))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(positions, df_plot['F1-Score'], color='steelblue')
ax.set_yticks(positions)
ax.set_yticklabels(df_plot['Experimento'], fontsize=9)
ax.set_xlabel('F1-Score', fontsize=12)
ax.set_title('Comparaci√≥n de F1-Score entre Todos los Experimentos', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Side-by-side boxplots: metric distribution grouped by method
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

panels = (
    (axes[0], 'F1-Score', 'Distribuci√≥n de F1-Score por M√©todo'),
    (axes[1], 'Exact Match', 'Distribuci√≥n de Exact Match por M√©todo'),
)
for ax, metric, title in panels:
    df_results.boxplot(column=metric, by='M√©todo', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('M√©todo')
    ax.set_ylabel(metric)
    # Make this panel current so the tick rotation applies to it
    plt.sca(ax)
    plt.xticks(rotation=45, ha='right')

# Blank out the figure-level title before laying out the panels
plt.suptitle('')
plt.tight_layout()
plt.show()

In [None]:
# Scatter of F1-Score against training time, one colour per method.
# Only experiments with a known training time can be placed on the plot.
df_with_time = df_results[df_results['Tiempo (min)'].notna()]

if len(df_with_time) == 0:
    print("No hay datos de tiempo suficientes para graficar")
else:
    fig, ax = plt.subplots(figsize=(10, 6))

    for method in df_with_time['M√©todo'].unique():
        df_method = df_with_time[df_with_time['M√©todo'] == method]
        ax.scatter(
            df_method['Tiempo (min)'],
            df_method['F1-Score'],
            label=method,
            s=100,
            alpha=0.7,
        )

    ax.set_xlabel('Tiempo de Entrenamiento (minutos)', fontsize=12)
    ax.set_ylabel('F1-Score', fontsize=12)
    ax.set_title('Eficiencia: F1-Score vs Tiempo de Entrenamiento', fontsize=14, fontweight='bold')
    ax.legend(loc='best')
    ax.grid(alpha=0.3)
    fig.tight_layout()
    plt.show()

## 10. Top 5 Experimentos

In [None]:
# The five experiments with the highest F1-Score, summary columns only
top_5 = df_results.nlargest(n=5, columns='F1-Score').loc[
    :,
    ['Experimento', 'M√©todo', '√âpocas', 'Learning Rate',
     'F1-Score', 'Exact Match', 'Tiempo (min)'],
]
print("\nüèÜ Top 5 Experimentos (por F1-Score):")
print("=" * 80)
top_5

## 11. Conclusiones

**[Completa esta sección después de analizar los resultados]**

In [None]:
# Write the three result tables to CSV in the working directory. Only the
# per-method statistics keep their index.
for frame, filename, keep_index in (
    (df_results, 'resultados_completos.csv', False),
    (best_configs, 'mejores_configuraciones.csv', False),
    (method_stats, 'estadisticas_por_metodo.csv', True),
):
    frame.to_csv(filename, index=keep_index)

print("‚úì Resultados exportados:")
print("  - resultados_completos.csv")
print("  - mejores_configuraciones.csv")
print("  - estadisticas_por_metodo.csv")