In [None]:
"""
Fine-tuning de modelo BERT preentrenado para clasificación de sentimientos.
Tarea: binaria (positivo = 1, negativo = 0) usando corpus Wikipedia simplificado.
Comparación entre full fine-tuning y LoRA.
"""

import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig, TaskType
from time import time

#  Directorios
MODEL_NAME = "models/bert_pretrained"
SAVE_FULL = "models/bert_finetuned_full"
SAVE_LORA = "models/bert_finetuned_lora"
LOG_DIR = "logs/finetune"
os.makedirs(SAVE_FULL, exist_ok=True)
os.makedirs(SAVE_LORA, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

#  Parámetros
BATCH_SIZE = 8
EPOCHS = 1

#  Cargar dataset binario
print("Cargando corpus...")
dataset = load_dataset("wikipedia", "20220301.simple", split="train[:1%]")

def crear_dataset_clasificacion(ejemplos):
    textos = ejemplos["text"]
    pares = []
    for texto in textos:
        texto_lower = texto.lower()
        if any(p in texto_lower for p in ["good", "excellent", "amazing", "love", "great", "happy", "positive"]):
            label = 1
        elif any(n in texto_lower for n in ["bad", "hate", "terrible", "awful", "worst", "sad", "negative"]):
            label = 0
        else:
            continue
        pares.append({"text": texto, "label": label})
    return pares

pares = crear_dataset_clasificacion(dataset)
dataset_clasificacion = Dataset.from_list(pares)

#  Tokenización
print("Tokenizando...")
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_dataset = dataset_clasificacion.map(tokenize_function)
collator = DataCollatorWithPadding(tokenizer=tokenizer)

#  Función común de entrenamiento
def entrenar(model, output_dir):
    args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        save_steps=500,
        logging_steps=100,
        logging_dir=LOG_DIR,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset,
        data_collator=collator,
        tokenizer=tokenizer
    )

    start = time()
    trainer.train()
    end = time()

    print(f"Entrenamiento completado en {end - start:.2f} segundos")
    model.save_pretrained(output_dir)

#  Fine-tuning completo
print("\n Fine-tuning completo...")
model_full = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
entrenar(model_full, SAVE_FULL)

#  Fine-tuning con LoRA
print("\n Fine-tuning con LoRA...")
base_model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none"
)

model_lora = get_peft_model(base_model, peft_config)
model_lora.print_trainable_parameters()

# Asegurar modo entrenamiento
model_lora.train()

# Entrenar y guardar adaptadores PEFT
entrenar(model_lora, SAVE_LORA)


Cargando corpus...


Downloading builder script: 100%|██████████████████████████████████████████████████| 36.7k/36.7k [00:00<00:00, 23.3MB/s]
Downloading readme: 100%|██████████████████████████████████████████████████████████| 16.0k/16.0k [00:00<00:00, 14.1MB/s]


The repository for wikipedia contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wikipedia.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data: 100%|██████████████████████████████████████████████████████████████| 134M/134M [00:11<00:00, 11.6MB/s]
Generating train split: 100%|████████████████████████████████████████| 205328/205328 [00:01<00:00, 165803.53 examples/s]


Tokenizando...


Map: 100%|████████████████████████████████████████████████████████████████████| 872/872 [00:03<00:00, 245.17 examples/s]



🔹 Fine-tuning completo...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at models/bert_pretrained and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.4924


Entrenamiento completado en 1414.46 segundos

🔸 Fine-tuning con LoRA...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at models/bert_pretrained and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 148,994 || all params: 109,632,772 || trainable%: 0.1359


Step,Training Loss
100,0.5628


Entrenamiento completado en 1077.71 segundos


# LoRA guarda solo los adaptadores, no el modelo completo. 

In [12]:
from transformers import BertTokenizerFast, BertForSequenceClassification
from peft import PeftModel, PeftConfig

# Ruta a adaptadores entrenados
LORA_DIR = "models/bert_finetuned_lora"

# Cargar config de PEFT
config = PeftConfig.from_pretrained(LORA_DIR)

# Cargar base + adaptadores
base_model = BertForSequenceClassification.from_pretrained(config.base_model_name_or_path)
model_lora = PeftModel.from_pretrained(base_model, LORA_DIR)

# Tokenizer
tokenizer = BertTokenizerFast.from_pretrained(config.base_model_name_or_path)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at models/bert_pretrained and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
#  Verificación post-entrenamiento: cargar adaptadores LoRA correctamente
print("\n Verificando modelo LoRA guardado...")
from peft import PeftModel, PeftConfig

# Ruta a adaptadores entrenados
LORA_DIR = SAVE_LORA

# Cargar config de PEFT
config = PeftConfig.from_pretrained(LORA_DIR)

# Cargar base + adaptadores
base_model = BertForSequenceClassification.from_pretrained(config.base_model_name_or_path)
model_lora = PeftModel.from_pretrained(base_model, LORA_DIR)

# Tokenizer también debe venir del modelo base
tokenizer = BertTokenizerFast.from_pretrained(config.base_model_name_or_path)

# EjempLITO  de prueba rápida
sample = "This is the worst book I've read!"
inputs = tokenizer(sample, return_tensors="pt", truncation=True)
outputs = model_lora(**inputs)
pred = torch.argmax(outputs.logits, dim=1).item()

print(f"Texto: {sample}")
print(f"Predicción: {'Positivo' if pred == 0 else 'Negativo'}")



 Verificando modelo LoRA guardado...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at models/bert_pretrained and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Texto: This is the worst book I've read!
Predicción: Negativo


In [11]:
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from peft import PeftModel, PeftConfig

#  Rutas
FULL_DIR = "models/bert_finetuned_full"
LORA_DIR = "models/bert_finetuned_lora"

#  Cargar modelo Full Fine-Tuning
model_full = BertForSequenceClassification.from_pretrained(FULL_DIR)
tokenizer = BertTokenizerFast.from_pretrained(FULL_DIR)
model_full.eval()

#  Cargar modelo LoRA Fine-Tuning
config_lora = PeftConfig.from_pretrained(LORA_DIR)
base_model = BertForSequenceClassification.from_pretrained(config_lora.base_model_name_or_path)
model_lora = PeftModel.from_pretrained(base_model, LORA_DIR)
model_lora.eval()

#  Lista de textos de prueba
textos = [
    "I love this movie, it was excellent!",
    "This is the worst experience I’ve ever had.",
    "The book was good, but not amazing.",
    "I hate the food, it made me sick.",
    "What a bad day!",
    "This made me really happy today.",
    "Absolutely terrible service, never again.",
]

#  Función de predicción
def predecir(model, tokenizer, texto):
    inputs = tokenizer(texto, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
    return "Positivo" if pred == 0 else "Negativo"

#  Mostrar resultados
print("\n Comparación de predicciones:")
print(f"{'Texto':<60} {'Full':<10} {'LoRA':<10}")
print("-" * 85)
for texto in textos:
    pred_full = predecir(model_full, tokenizer, texto)
    pred_lora = predecir(model_lora, tokenizer, texto)
    print(f"{texto[:55]:<60} {pred_full:<10} {pred_lora:<10}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at models/bert_pretrained and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Comparación de predicciones:
Texto                                                        Full       LoRA      
-------------------------------------------------------------------------------------
I love this movie, it was excellent!                         Negativo   Negativo  
This is the worst experience I’ve ever had.                  Negativo   Positivo  
The book was good, but not amazing.                          Negativo   Positivo  
I hate the food, it made me sick.                            Negativo   Positivo  
What a bad day!                                              Negativo   Positivo  
This made me really happy today.                             Negativo   Negativo  
Absolutely terrible service, never again.                    Negativo   Positivo  
