# In [None]:
# src/fine_tune.py

"""
Fine-tuning de modelo BERT preentrenado para clasificación de sentimientos.
Tarea: binaria (positivo = 1, negativo = 0) usando corpus Wikipedia simplificado.
Comparación entre full fine-tuning y LoRA.
"""

import os
import re
from time import time

import torch
from datasets import load_dataset, Dataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig, TaskType

# Paths: checkpoint to start from, one output directory per fine-tuning
# strategy, and the directory handed to the trainer for logs.
MODEL_NAME = "models/bert_pretrained"
SAVE_FULL = "models/bert_finetuned_full"
SAVE_LORA = "models/bert_finetuned_lora"
LOG_DIR = "logs/finetune"
for _output_path in (SAVE_FULL, SAVE_LORA, LOG_DIR):
    # exist_ok=True makes re-running the script safe when the folders exist.
    os.makedirs(_output_path, exist_ok=True)

# Training hyperparameters shared by both fine-tuning runs.
BATCH_SIZE = 8
EPOCHS = 1

# Source corpus for the keyword-labelled binary dataset: the first 1% of the
# "train" split of the "wikipedia" / "20220301.simple" configuration.
print("Cargando corpus...")
dataset = load_dataset("wikipedia", "20220301.simple", split="train[:1%]")

def crear_dataset_clasificacion(ejemplos):
    """Build a keyword-labelled binary classification set from raw texts.

    Each text is labelled by whole-word, case-insensitive keyword matching:

    * label ``1`` if it contains at least one positive keyword;
    * label ``0`` if it contains no positive keyword but at least one
      negative keyword;
    * otherwise the text is dropped.

    Positive keywords take precedence when both kinds are present.

    Matching is done on word boundaries rather than on raw substrings, so
    that e.g. "whatever" does not count as "hate" and "ambassador" does not
    count as "sad".

    Args:
        ejemplos: Any object whose ``["text"]`` lookup yields an iterable of
            strings (for example a mapping ``{"text": [...]}``).

    Returns:
        A list of ``{"text": <original text>, "label": <0 or 1>}`` dicts, in
        the same order as the input texts.
    """
    positive_words = ("good", "excellent", "amazing", "love", "great", "happy", "positive")
    negative_words = ("bad", "hate", "terrible", "awful", "worst", "sad", "negative")

    def _whole_word_pattern(words):
        # One alternation anchored on word boundaries; compiled once per call
        # instead of re-scanning every keyword for every text.
        return re.compile(r"\b(?:" + "|".join(map(re.escape, words)) + r")\b")

    positive_re = _whole_word_pattern(positive_words)
    negative_re = _whole_word_pattern(negative_words)

    pares = []
    for texto in ejemplos["text"]:
        texto_lower = texto.lower()
        if positive_re.search(texto_lower):
            label = 1
        elif negative_re.search(texto_lower):
            label = 0
        else:
            # No keyword of either polarity: the text cannot be labelled.
            continue
        pares.append({"text": texto, "label": label})
    return pares

# Label the corpus with the keyword rules and wrap the resulting list of
# {"text", "label"} dicts in a Dataset.
pares = crear_dataset_clasificacion(dataset)
dataset_clasificacion = Dataset.from_list(pares)

# Tokenization: load the tokenizer from the same path as the pretrained model.
print("Tokenizando...")
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the module tokenizer.

    Truncation is enabled and no padding argument is passed here; the result
    of the tokenizer call is returned unchanged.
    """
    # NOTE(review): no explicit max_length is given, so the truncation limit
    # presumably comes from the tokenizer's own configuration. Confirm that
    # the tokenizer loaded from MODEL_NAME defines one.
    texto = example["text"]
    encoding = tokenizer(texto, truncation=True)
    return encoding

# Tokenize in batches: with batched=True the mapped function receives a dict
# of lists, so the tokenizer encodes many texts per call instead of one.
# The resulting columns are the same as with per-example mapping.
tokenized_dataset = dataset_clasificacion.map(tokenize_function, batched=True)

# Collator built from the same tokenizer; it is passed to the Trainer as
# data_collator to assemble each training batch.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 🔁 Función para entrenamiento

def entrenar(model, output_dir):
    """Train ``model`` on the tokenized dataset and save it to ``output_dir``.

    Builds a ``Trainer`` from the module-level dataset, collator and
    tokenizer, runs one training session with the shared hyperparameters,
    prints the elapsed wall-clock time, then calls
    ``model.save_pretrained(output_dir)``.

    Args:
        model: The model instance to train; it must provide
            ``save_pretrained``.
        output_dir: Directory used both as the trainer's output directory and
            as the destination of the final ``save_pretrained`` call.
    """
    training_config = {
        "output_dir": output_dir,
        "overwrite_output_dir": True,
        "num_train_epochs": EPOCHS,
        "per_device_train_batch_size": BATCH_SIZE,
        "save_steps": 500,
        "logging_steps": 100,
        "logging_dir": LOG_DIR,
        "report_to": "none",
    }

    trainer = Trainer(
        model=model,
        args=TrainingArguments(**training_config),
        train_dataset=tokenized_dataset,
        data_collator=collator,
        tokenizer=tokenizer,
    )

    # Time only the training call itself, not the setup above.
    started_at = time()
    trainer.train()
    elapsed = time() - started_at

    print(f"Entrenamiento completado en {elapsed:.2f} segundos")
    model.save_pretrained(output_dir)


# Full fine-tuning: load a 2-label classifier from the pretrained checkpoint
# and train it as-is, saving the result under SAVE_FULL.
print("\n🔹 Fine-tuning completo...")
model_full = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
entrenar(model_full, SAVE_FULL)

# LoRA fine-tuning: start from a fresh copy of the same checkpoint so the two
# runs are independent of each other.
print("\n🔹 Fine-tuning con LoRA...")
model_lora = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# LoRA settings for a sequence-classification task (see the peft LoraConfig
# documentation for the meaning of r / lora_alpha / lora_dropout / bias).
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none"
)

# Wrap the base model with the LoRA configuration and print its
# trainable-parameter summary before training.
model_lora = get_peft_model(model_lora, peft_config)
model_lora.print_trainable_parameters()

# Same training routine as the full run, saved under SAVE_LORA.
entrenar(model_lora, SAVE_LORA)
