# Fine-tuning a masked language model

In [1]:
from transformers import AutoModelForMaskedLM

# Checkpoint name shared by the model and tokenizer loads in this notebook.
model_checkpoint = "distilbert-base-uncased"
# Load the checkpoint through the masked-LM auto class.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)



In [2]:
# Scale the model's reported parameter count to millions for display.
distilbert_num_parameters = model.num_parameters() / 1_000_000

In [3]:
# Report both sizes in millions. The BERT figure is a fixed reference value
# written into the message, not something computed in this notebook.
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [4]:
# Example sentence containing one [MASK] token; reused by the fill-mask demos
# before and after fine-tuning.
text = "This is a great [MASK]."

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
import torch

# Tokenize the example sentence into PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")
# Inference only: disable autograd so no computation graph is kept for the
# forward pass. The resulting logits are numerically the same.
with torch.no_grad():
    token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Show the sentence with [MASK] replaced by each of the five candidates.
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [7]:
from datasets import load_dataset

# Load the "imdb" dataset; the bare expression below displays its splits.
imdb_dataset = load_dataset("imdb")
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [8]:
# Shuffle the train split with a fixed seed and keep its first three rows.
sample = imdb_dataset["train"].shuffle(seed=42).select(range(3))

# Print each sampled review followed by its label.
for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...'
'>>> Label: 1'

'>>> Review: This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stu

In [9]:
# Draw a random sample from the unsupervised split (fixed seed, three rows)
unsupervised_sample = imdb_dataset["unsupervised"].shuffle(seed=42).select(range(3))

print("=== Unsupervised Split Sample ===")
for i, row in enumerate(unsupervised_sample):
    print(f"\n>>> Sample {i+1}:")
    print(f"Text: {row['text'][:100]}...")  # show only the first 100 characters
    print(f"Label: {row['label']}")

# Verify the labels of the unsupervised split
# (reads the whole label column and collapses it to its distinct values)
unsupervised_labels = set(imdb_dataset["unsupervised"]["label"])
print(f"\n>>> Unique labels in unsupervised split: {unsupervised_labels}")
print(f">>> Labels are neither 0 nor 1: {not any(label in [0, 1] for label in unsupervised_labels)}")

# Check that the labels in the train and test splits are indeed 0 or 1
train_labels = set(imdb_dataset["train"]["label"])
test_labels = set(imdb_dataset["test"]["label"])

# The printed booleans are True only when the split contains exactly {0, 1}.
print(f"\n>>> Unique labels in train split: {train_labels}")
print(f">>> Train labels are 0 or 1: {train_labels == {0, 1}}")

print(f"\n>>> Unique labels in test split: {test_labels}")
print(f">>> Test labels are 0 or 1: {test_labels == {0, 1}}")


=== Unsupervised Split Sample ===

>>> Sample 1:
Text: If you've seen the classic Roger Corman version starring Vincent Price it's hard to put it out of yo...
Label: -1

>>> Sample 2:
Text: For me, this was the most moving film of the decade. Samira Makhmalbaf shows pure bravery and vision...
Label: -1

>>> Sample 3:
Text: There really isn't much to say about this "film". It has the odd smile or chuckle moment, but on the...
Label: -1

>>> Unique labels in unsupervised split: {-1}
>>> Labels are neither 0 nor 1: True

>>> Unique labels in train split: {0, 1}
>>> Train labels are 0 or 1: True

>>> Unique labels in test split: {0, 1}
>>> Test labels are 0 or 1: True


In [10]:
def tokenize_function(examples):
    """Tokenize a batch of reviews, attaching word ids when available.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the raw strings.

    Returns:
        The tokenizer's output for the batch. When the module-level
        ``tokenizer`` reports ``is_fast``, the output also carries a
        ``"word_ids"`` entry with one ``word_ids(i)`` result per encoded
        sequence.
    """
    encoded = tokenizer(examples["text"])
    # Without a fast tokenizer there is nothing more to add.
    if not tokenizer.is_fast:
        return encoded
    sequence_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(idx) for idx in range(sequence_count)]
    return encoded


# Use batched=True to activate fast multithreading!
# The raw "text" and "label" columns are dropped, so only the columns returned
# by tokenize_function remain. The bare expression displays the result.
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [11]:
# Print a preview of the first three tokenized training examples
print("=== First 3 tokenized examples ===")
for i in range(3):
    example = tokenized_datasets["train"][i]
    print(f"\n>>> Example {i+1}:")
    print(f"Input IDs length: {len(example['input_ids'])}")
    print(f"First 10 input IDs: {example['input_ids'][:10]}")
    print(f"First 10 tokens: {tokenizer.convert_ids_to_tokens(example['input_ids'][:10])}")
    # word_ids is only present when tokenize_function added it
    if 'word_ids' in example:
        print(f"First 10 word IDs: {example['word_ids'][:10]}")


=== First 3 tokenized examples ===

>>> Example 1:
Input IDs length: 363
First 10 input IDs: [101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026]
First 10 tokens: ['[CLS]', 'i', 'rented', 'i', 'am', 'curious', '-', 'yellow', 'from', 'my']
First 10 word IDs: [None, 0, 1, 2, 3, 4, 5, 6, 7, 8]

>>> Example 2:
Input IDs length: 304
First 10 input IDs: [101, 1000, 1045, 2572, 8025, 1024, 3756, 1000, 2003, 1037]
First 10 tokens: ['[CLS]', '"', 'i', 'am', 'curious', ':', 'yellow', '"', 'is', 'a']
First 10 word IDs: [None, 0, 1, 2, 3, 4, 5, 6, 7, 8]

>>> Example 3:
Input IDs length: 133
First 10 input IDs: [101, 2065, 2069, 2000, 4468, 2437, 2023, 2828, 1997, 2143]
First 10 tokens: ['[CLS]', 'if', 'only', 'to', 'avoid', 'making', 'this', 'type', 'of', 'film']
First 10 word IDs: [None, 0, 1, 2, 3, 4, 5, 6, 7, 8]


In [12]:
# Display the maximum sequence length the tokenizer reports.
tokenizer.model_max_length

512

In [13]:
# Number of items per chunk used by the chunking cells and group_texts below.
chunk_size = 128

In [14]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

# Report how many input ids each of the three reviews has.
for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 363'
'>>> Review 1 length: 304'
'>>> Review 2 length: 133'


In [15]:
# Flatten each feature's per-review lists into a single long list.
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
# Length of the flattened input_ids, i.e. the combined size of the three reviews.
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 800'


In [16]:
# Slice every flattened feature into consecutive pieces of chunk_size items.
# Nothing is dropped here, so the final piece may be shorter than chunk_size.
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 32'


In [17]:
def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from feature name to a list of per-example lists
            (one inner list per example in the batch).

    Returns:
        dict: The same features, each re-split into lists of exactly
        ``chunk_size`` items (``chunk_size`` is read from module scope), plus
        a ``"labels"`` entry that is a shallow copy of the ``"input_ids"``
        chunk list. Trailing items that do not fill a whole chunk are dropped.
    """
    # Concatenate all texts. A nested comprehension flattens in one pass;
    # sum(lists, []) re-copies the accumulated list on every addition, which is
    # quadratic in the size of the batch.
    concatenated_examples = {
        k: [item for seq in examples[k] for item in seq] for k in examples.keys()
    }
    # Compute length of concatenated texts, measured on the first feature
    # (assumes all features have the same flattened length).
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [18]:
# Apply group_texts batch-wise to every split; the bare expression displays the result.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [19]:
# Decode the second training chunk back to text to inspect what a chunk contains.
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it ' s not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

In [20]:
from transformers import DataCollatorForLanguageModeling

# Collator built on the notebook's tokenizer with mlm_probability=0.15
# (presumably the per-token masking probability; confirm against the library docs).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [21]:
# Take the first two training chunks as individual examples.
samples = [lm_datasets["train"][i] for i in range(2)]
# Remove word_ids from each example before handing it to the collator.
for sample in samples:
    _ = sample.pop("word_ids")

# Collate the two examples and print the decoded input_ids of each one.
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented i am [MASK] [MASK] yellow [MASK] my video store because of all the dh that surrounded it when it was first [MASK] in 1967. i also heard that [MASK] first it was seized by u. s. [MASK] if it [MASK] tried to enter this country [MASK] therefore being a fan of films considered [MASK] [MASK] " [MASK] really had to [MASK] [MASK] for myself. < br / > < br / [MASK] the plot is centered around a [MASK] swedish prices student named lena who wants to learn everything she can [MASK] [MASK]. in particular she wants to focus her attentions to making some sort of documentary on what the [MASK] swede thought about certain political issues [MASK]'

'>>> as the vietnam war and race [MASK] [MASK] the united states. in between asking [MASK] and ordinary denizens of stockholm about their opinions on politics, she has [MASK] with her [MASK] teacher, classmates, and married men. < br / > [MASK] br / > what kills me about flicking am [MASK] - yellow is that 40 years ago, this was consider

In [22]:
# Sizes of the reduced splits: the test size is 10% of the train size.
train_size = 10_000
test_size = int(0.1 * train_size)

# Carve both reduced splits out of the grouped train split, with a fixed seed.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [None]:
from huggingface_hub import login

import os

# Never keep an access token in the notebook source: read it from the
# HF_TOKEN environment variable instead. When the variable is unset,
# hf_token is None and login() falls back to its interactive prompt.
hf_token = os.environ.get("HF_TOKEN")
login(hf_token)

In [24]:
from transformers import TrainingArguments

# Per-device batch size used for both training and evaluation below.
batch_size = 64
# Show the training loss with every epoch
# (number of whole batches in the full grouped train split)
logging_steps = len(lm_datasets["train"]) // batch_size
# Last path component of the checkpoint name, used to build the output directory.
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

In [25]:
from transformers import Trainer

# Build the Trainer on the full grouped train/test splits with the MLM collator.
# The tokenizer is passed as `processing_class`: the `tokenizer` keyword is
# deprecated in recent transformers releases and triggers a warning on this call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [26]:
import math

# Evaluate before training and report perplexity as exp(eval_loss).
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

* Trackio project initialized: huggingface
* Trackio metrics logged to: /root/autodl-tmp/huggingface/trackio
* View dashboard by running in your terminal:
[1m[93mtrackio show --project "huggingface"[0m
* or by running in Python: trackio.show(project="huggingface")
>>> Perplexity: 22.83


In [27]:
# Run training with the Trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time
1,2.5352,2.35684,0.0016
2,2.4244,2.312308,0.0016
3,2.3893,2.298297,0.0016


  [2m2025-09-25T11:15:57.258511Z[0m [33m WARN[0m  [33mReqwest(reqwest::Error { kind: Request, url: "https://cas-server.xethub.hf.co/xorb/default/336d32ca37f6c4ca2e53c28395e02670c33d65fc76499f5807fc2b6976ee33ac", source: hyper_util::client::legacy::Error(SendRequest, hyper::Error(BodyWrite, Os { code: 110, kind: TimedOut, message: "Connection timed out" })) }). Retrying...[0m
    [2;3mat[0m /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:226

  [2m2025-09-25T11:15:57.258544Z[0m [33m WARN[0m  [33mReqwest(reqwest::Error { kind: Request, url: "https://cas-server.xethub.hf.co/xorb/default/6fdb39ceb92083bc051a0205edbaddb0e7d5ea5743f1aeda24ecb9223487ba4d", source: hyper_util::client::legacy::Error(SendRequest, hyper::Error(BodyWrite, Os { code: 110, kind: TimedOut, message: "Connection timed out" })) }). Retrying...[0m
    [2;3mat[0m /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:226

* Run finished. Uploading logs to Trackio Space: http://

TrainOutput(global_step=2874, training_loss=2.4496594533873832, metrics={'train_runtime': 616.4313, 'train_samples_per_second': 298.286, 'train_steps_per_second': 4.662, 'total_flos': 6093604363709952.0, 'train_loss': 2.4496594533873832, 'epoch': 3.0})

In [28]:
# Evaluate again after training; perplexity is exp(eval_loss) as before.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 9.90


In [29]:
# Upload the Trainer's output to the Hub.
trainer.push_to_hub()


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ed-finetuned-imdb/training_args.bin: 100%|##########| 5.84kB / 5.84kB            

  ...ntainer-8550119d52-9fdc5a82.87169.0: 100%|##########| 5.21kB / 5.21kB            

  ...ed-finetuned-imdb/model.safetensors:  16%|#5        | 41.9MB /  268MB            

  ...tainer-8550119d52-9fdc5a82.100267.0: 100%|##########| 7.20kB / 7.20kB            

  ...tainer-8550119d52-9fdc5a82.100267.1: 100%|##########|   425B /   425B            

CommitInfo(commit_url='https://huggingface.co/yiwenX/distilbert-base-uncased-finetuned-imdb/commit/a87d3c383991515ee581d0ddf538620d59923394', commit_message='End of training', commit_description='', oid='a87d3c383991515ee581d0ddf538620d59923394', pr_url=None, repo_url=RepoUrl('https://huggingface.co/yiwenX/distilbert-base-uncased-finetuned-imdb', endpoint='https://huggingface.co', repo_type='model', repo_id='yiwenX/distilbert-base-uncased-finetuned-imdb'), pr_revision=None, pr_num=None)

In [30]:
def insert_random_mask(batch):
    """Run the module-level ``data_collator`` on a batch and return prefixed columns.

    Args:
        batch: Column-oriented mapping from column name to a list of
            per-example values.

    Returns:
        dict: One entry per key returned by ``data_collator``, renamed to
        ``"masked_" + key`` and holding that value's ``.numpy()`` result.
    """
    column_names = list(batch)
    # Turn the column-oriented batch into one dict per example.
    rows = [dict(zip(column_names, values)) for values in zip(*batch.values())]
    collated = data_collator(rows)
    # Create a new "masked" column for each column in the dataset
    masked_columns = {}
    for name, tensor in collated.items():
        masked_columns["masked_" + name] = tensor.numpy()
    return masked_columns

In [31]:
# Drop word_ids only while it is still present. The name is rebound in place,
# so an unguarded removal fails when this cell is run a second time.
if "word_ids" in downsampled_dataset["train"].column_names:
    downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
# Apply insert_random_mask once to the reduced test split and keep only the
# resulting masked_* columns, so the evaluation set is built a single time.
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
# Strip the "masked_" prefix so the columns are again named
# input_ids / attention_mask / labels.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [32]:
# Display the pre-masked evaluation set.
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [33]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
# Training batches are shuffled and built by data_collator each time a batch is drawn.
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# Evaluation batches use default_data_collator on eval_dataset, whose columns
# were already produced by insert_random_mask.
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [34]:
# Rebind `model` to a fresh load of the checkpoint; the previous object was the
# one handed to the Trainer above.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [35]:
from torch.optim import AdamW

# AdamW over all parameters of the freshly loaded model.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [36]:
from torch.optim import AdamW

# NOTE(review): this cell repeats the preceding optimizer cell verbatim and
# simply rebinds `optimizer` to a new, identically configured AdamW instance.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [41]:
from transformers import get_scheduler

num_train_epochs = 3
# One optimizer update per training batch, so steps per epoch equals the
# number of batches in train_dataloader.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup, sized for num_training_steps updates.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [37]:
from huggingface_hub import get_full_repo_name

# Rebinds model_name (previously derived from the checkpoint name) to the name
# used for the custom-loop run.
model_name = "distilbert-base-uncased-finetuned-imdb-accelerate"
# Resolve the full repository name for model_name; displayed below.
repo_name = get_full_repo_name(model_name)
repo_name

'yiwenX/distilbert-base-uncased-finetuned-imdb-accelerate'

In [39]:
from huggingface_hub import HfApi
import os

# Local directory that the training loop saves into; named after model_name.
output_dir = model_name
# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Initialize HfApi for repository operations
api = HfApi()

# Create the repository on Hugging Face Hub if it doesn't exist
# NOTE(review): any exception raised here is only printed, so later cells still
# run and a real failure (e.g. authentication) may surface only at upload time.
try:
    api.create_repo(repo_id=repo_name, exist_ok=True)
    print(f"Repository {repo_name} is ready")
except Exception as e:
    print(f"Repository creation info: {e}")

Repository yiwenX/distilbert-base-uncased-finetuned-imdb-accelerate is ready


In [43]:
from accelerate import Accelerator

accelerator = Accelerator()
# Rebind the model, optimizer and both dataloaders to the objects returned by
# prepare(). lr_scheduler is not passed through prepare() in this notebook.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [45]:
from tqdm.auto import tqdm
import torch
import math

# NOTE(review): model, optimizer and lr_scheduler are created in earlier cells
# and keep their state across re-runs of this cell. The recorded output shows
# identical perplexity for epochs 1 and 2, which would be consistent with the
# linear schedule having already decayed to zero from a previous partial run.
# TODO confirm; re-create model/optimizer/scheduler before re-running.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        # Update the weights, advance the schedule, then clear the gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # Repeat the batch loss batch_size times before gathering; the
        # concatenated result is trimmed to len(eval_dataset) below.
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    # Perplexity is exp(mean loss); an overflowing exp is reported as infinity.
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer files and uploads the folder.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # Push to the Hub directly through the HfApi client rather than through
        # a `repo` object, which is not defined in this notebook
        api.upload_folder(
            folder_path=output_dir,
            repo_id=repo_name,
            commit_message=f"Training in progress epoch {epoch}"
        )

  0%|          | 0/471 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 11.294586710944655


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...d-imdb-accelerate/model.safetensors:   4%|3         | 10.3MB /  268MB            

>>> Epoch 1: Perplexity: 11.095003322910616


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...d-imdb-accelerate/model.safetensors:   0%|          | 1.08MB /  268MB            

>>> Epoch 2: Perplexity: 11.095003322910616


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...d-imdb-accelerate/model.safetensors:  16%|#5        | 41.8MB /  268MB            

No files have been modified since last commit. Skipping to prevent empty commit.


In [46]:
from transformers import pipeline

# Load the checkpoint uploaded by the training loop. Reuse `repo_name` (the
# repository the loop uploads to) instead of repeating a hard-coded
# "<user>/<model>" id, so the cell keeps working under a different account.
mask_filler = pipeline("fill-mask", model=repo_name)

config.json:   0%|          | 0.00/500 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


In [47]:
# Run the fill-mask pipeline on the same example sentence used at the start.
preds = mask_filler(text)

# Print the completed sentence of each returned prediction.
for pred in preds:
    print(f">>> {pred['sequence']}")

>>> this is a great film.
>>> this is a great movie.
>>> this is a great idea.
>>> this is a great adventure.
>>> this is a great one.
