In [None]:
# src/pretrain.py

"""
Continued masked-language-model (MLM) pretraining of BERT, starting from the
"bert-base-uncased" checkpoint. Only the MLM objective is used.
Corpus: Simple English Wikipedia (first 1% of the train split).
"""

import os
from datasets import load_dataset
from transformers import (
    BertTokenizerFast,
    BertForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import accuracy_score
import torch

# Checkpoint identifier and output locations
MODEL_NAME = "bert-base-uncased"
SAVE_DIR = "models/bert_pretrained"
LOG_DIR = "logs/pretraining"
for _out_dir in (SAVE_DIR, LOG_DIR):
    # exist_ok=True keeps the cell re-runnable when the folders already exist
    os.makedirs(_out_dir, exist_ok=True)

# Training hyperparameters
BATCH_SIZE = 8
EPOCHS = 1

# Load the corpus: first 1% of the train split of the "20220301.simple" config
print("Cargando corpus...")
dataset = load_dataset("wikipedia", "20220301.simple", split='train[:1%]')

# Load the fast tokenizer that belongs to the MODEL_NAME checkpoint
print("Tokenizando...")
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize_function(example, max_length=128):
    """Tokenize the ``"text"`` field with the module-level ``tokenizer``.

    The function is passed to ``dataset.map(..., batched=True)``, so despite
    its name ``example`` is a batch: ``example["text"]`` holds the texts of
    several rows at once.

    Args:
        example: Mapping with a ``"text"`` entry to tokenize.
        max_length: Maximum sequence length; longer inputs are truncated.
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    return tokenizer(example["text"], truncation=True, max_length=max_length)

# Tokenize the corpus in batches. Every original column is dropped (instead of
# a hard-coded subset) so that only the tokenizer outputs remain in the dataset
# and the step does not depend on the exact schema of the source corpus.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# Collator for masked language modelling with a 15% masking probability
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# BERT with an MLM head, initialised from the MODEL_NAME checkpoint
model = BertForMaskedLM.from_pretrained(MODEL_NAME)

# Training configuration
training_args = TrainingArguments(
    output_dir=SAVE_DIR,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    save_steps=500,        # checkpoint interval, in steps
    save_total_limit=2,    # cap on the number of checkpoints kept
    logging_steps=100,     # logging interval, in steps
    logging_dir=LOG_DIR,
    report_to="none",      # no external experiment-tracking integration
)

def compute_metrics(eval_pred):
    """Compute token-level accuracy over the masked positions only.

    In masked language modelling the label array uses -100 for every position
    that was not selected for masking. Those positions carry no target, so
    including them (as a plain flattened comparison does) makes the reported
    accuracy meaningless. Only positions whose label differs from -100 are
    scored here.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is an array whose last
            axis indexes the vocabulary (or a tuple whose first element is that
            array); ``labels`` is an integer array matching the leading axes.

    Returns:
        ``{"accuracy": float}``; 0.0 when no position carries a label.
    """
    logits, labels = eval_pred
    # Some models return several outputs; the prediction scores come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = logits.argmax(axis=-1)

    # Keep only the positions that actually have a target token.
    scored = labels != -100
    total = int(scored.sum())
    if total == 0:
        return {"accuracy": 0.0}

    correct = int((predictions[scored] == labels[scored]).sum())
    return {"accuracy": correct / total}

# Assemble the Trainer from the model, arguments, data and collator built above
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=collator,
    compute_metrics=compute_metrics,
)
# NOTE(review): no eval_dataset is passed, so compute_metrics is presumably
# never invoked during this run — confirm whether an evaluation split is wanted.
trainer = Trainer(**trainer_kwargs)

# Run training
print("Entrenando modelo...")
trainer.train()


# Persist the model first, then the tokenizer, side by side in SAVE_DIR
print(f"Guardando modelo en {SAVE_DIR}")
for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_DIR)


Cargando corpus...
Tokenizando...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Entrenando modelo...


Step,Training Loss


In [2]:
from transformers import pipeline

# Build a fill-mask pipeline; model and tokenizer come from the same directory
MODEL_DIR = "models/bert_pretrained"
fill_mask = pipeline("fill-mask", model=MODEL_DIR, tokenizer=MODEL_DIR)

# Ask the model to complete a sentence that contains a [MASK] token
resultados = fill_mask("The capital of France is [MASK].")

# Print each candidate completion with its score, rounded to 4 decimals
for candidate in resultados:
    print(f"{candidate['sequence']} (score: {candidate['score']:.4f})")


the capital of france is paris. (score: 0.4868)
the capital of france is lyon. (score: 0.0632)
the capital of france is tours. (score: 0.0422)
the capital of france is nice. (score: 0.0361)
the capital of france is toulouse. (score: 0.0344)
