In [3]:
!pip install datasets transformers accelerate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
# notebooks/01_pretrain_demo.ipynb

# --------------------
# Notebook: Preentrenamiento de BERT pequeño (solo MLM)
# --------------------

"""
Este notebook entrena un modelo BERT pequeño desde cero usando solo el objetivo MLM (Masked Language Modeling).
Corpus: Wikipedia simplificada (subconjunto).
"""

# 🧩 Instalar dependencias (si se usa en Colab, descomentar):
# !pip install datasets transformers accelerate

from datasets import load_dataset
from transformers import BertTokenizerFast, DataCollatorForLanguageModeling
from transformers import BertForMaskedLM, Trainer, TrainingArguments

import torch
import os

# 🔹 Parámetros
MODEL_NAME = "bert-base-uncased"
SAVE_DIR = "../models/bert_pretrained"
BATCH_SIZE = 64
EPOCHS = 1

# 📥 Descargar corpus de Wikipedia simplificada
print("Cargando corpus...")
dataset = load_dataset("wikipedia", "20220301.simple", split='train[:1%]')  # solo 1% para prueba rápida

# 🔤 Tokenización
print("Tokenizando...")
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["title", "text"])

# 🧱 Collator para MLM
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# 🧠 Modelo
model = BertForMaskedLM.from_pretrained(MODEL_NAME)


# ⚙️ Entrenamiento
training_args = TrainingArguments(
    output_dir=SAVE_DIR,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    logging_dir="../logs",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=collator
)

print("Entrenando modelo...")
trainer.train()

print(f"Modelo guardado en {SAVE_DIR}")
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)


  from .autonotebook import tqdm as notebook_tqdm


Cargando corpus...
Tokenizando...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Entrenando modelo...


Step,Training Loss


Modelo guardado en ../models/bert_pretrained


('../models/bert_pretrained/tokenizer_config.json',
 '../models/bert_pretrained/special_tokens_map.json',
 '../models/bert_pretrained/vocab.txt',
 '../models/bert_pretrained/added_tokens.json',
 '../models/bert_pretrained/tokenizer.json')

In [2]:
from transformers import pipeline, BertForMaskedLM, BertTokenizerFast

model_path = "../models/bert_pretrained"

tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForMaskedLM.from_pretrained(model_path)

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Prueba con una frase
resultados = fill_mask("The capital of France is [MASK].")

for res in resultados:
    print(f"{res['sequence']} (score: {res['score']:.4f})")


the capital of france is paris. (score: 0.5561)
the capital of france is lyon. (score: 0.0583)
the capital of france is lille. (score: 0.0443)
the capital of france is marseille. (score: 0.0352)
the capital of france is toulouse. (score: 0.0249)
