In [2]:
# notebooks/02_finetune_classification.ipynb

# --------------------
# Notebook: Fine-tuning de BERT pequeño para clasificación de texto
# --------------------

"""
Este notebook ajusta finamente un modelo BERT para una tarea de clasificación de texto.
Usamos el corpus de Wikipedia simplificada para generar ejemplos artificiales de clasificación binaria.
"""

from datasets import load_dataset, Dataset
from transformers import BertTokenizerFast, DataCollatorWithPadding
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import torch
import os
import random

#  Parámetros
MODEL_NAME = "../models/bert_pretrained"
SAVE_DIR = "../models/bert_finetuned"
BATCH_SIZE =8
EPOCHS = 1

#  Cargar corpus base
print("Cargando corpus...")
dataset = load_dataset("wikipedia", "20220301.simple", split="train[:1%]")

#  Crear un dataset de clasificación artificial (binaria)
def crear_dataset_clasificacion(ejemplos):
    textos = ejemplos["text"]
    pares = []
    for i in range(0, len(textos) - 1, 2):
        pares.append({
            "text": textos[i],
            "label": random.randint(0, 1)  # Etiqueta binaria aleatoria
        })
    return pares

pares = crear_dataset_clasificacion(dataset)
dataset_clasificacion = Dataset.from_list(pares)

#  Tokenización
print("Tokenizando...")
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_dataset = dataset_clasificacion.map(tokenize_function, batched=False)

#  Collator
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 🧠 Modelo de clasificación
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

#  Entrenamiento
training_args = TrainingArguments(
    output_dir=SAVE_DIR,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    logging_dir="../logs",
    report_to="none"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    acc = (predictions == torch.tensor(labels)).float().mean().item()
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Entrenando modelo...")
trainer.train()

print(f"Modelo guardado en {SAVE_DIR}")
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)


Cargando corpus...
Tokenizando...


Map: 100%|██████████████████████████████████████████████████████████████████| 1026/1026 [00:03<00:00, 328.35 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../models/bert_pretrained and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Entrenando modelo...


Step,Training Loss
100,0.7023


Modelo guardado en ../models/bert_finetuned


('../models/bert_finetuned/tokenizer_config.json',
 '../models/bert_finetuned/special_tokens_map.json',
 '../models/bert_finetuned/vocab.txt',
 '../models/bert_finetuned/added_tokens.json',
 '../models/bert_finetuned/tokenizer.json')

In [3]:
from transformers import BertTokenizerFast, BertForSequenceClassification
import torch

# Cargar modelo y tokenizer
MODEL_DIR = "../models/bert_finetuned"
tokenizer = BertTokenizerFast.from_pretrained(MODEL_DIR)
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)

# Poner modelo en modo evaluación
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [4]:
def predecir(texto):
    # Tokenizar
    inputs = tokenizer(texto, return_tensors="pt", truncation=True, padding=True)
    
    # Inferencia
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=1).item()
    
    return pred


In [7]:
ejemplo1 = "This article explains the history of the internet and its protocols."
ejemplo2 = "Random gibberish without meaning and very little coherence."
ejemplo3 = "The Earth revolves around the Sun and not the other way around."

print("Predicción 1:", predecir(ejemplo1))  # → 0 o 1
print("Predicción 2:", predecir(ejemplo2))  # → 0 o 1
print("Predicción 3:", predecir(ejemplo3))  # → 0 o 1


Predicción 1: 0
Predicción 2: 0
Predicción 3: 0
