# Notebook to fine-tune BERT-like models

Adapted from the Hugging Face NLP course tutorial on fine-tuning a masked language model: https://huggingface.co/learn/nlp-course/chapter7/3?fw=pt



In [None]:
# Mount Google Drive inside the Colab VM; the corpus loaded later in this
# notebook is read from a path under /content/drive/MyDrive.
# force_remount=True requests a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from huggingface_hub import login

# Interactive Hugging Face Hub login (renders a widget in the notebook).
# The notebook pushes models to the Hub further down, so a login is done up front.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Hub identifier of the pretrained checkpoint that is fine-tuned in this notebook.
model_checkpoint = "distilbert/distilbert-base-multilingual-cased"

# Masked-LM model and its matching tokenizer, both loaded from the same checkpoint.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [5]:
from datasets import load_dataset

# Load the extracted LeiKo corpus from Google Drive with the plain-text builder;
# the result exposes a single "text" column in a "train" split.
leiko_dataset = load_dataset(
    "text",
    data_files="/content/drive/MyDrive/nlp_ss24/multilingual-lexical-simplification/data/LeiKo_extracted/leiko_extracted.txt",
)
leiko_dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 6593
    })
})

In [None]:
# sample = leiko_dataset["train"].shuffle(seed=42).select(range(3))

# for row in sample:
#     print(f"\n'>>> {row['text']}'")

In [6]:
def tokenize_function(examples):
    """Tokenize a batch of raw text and attach per-token word ids.

    Args:
        examples: Batch mapping with a "text" entry holding the raw strings.

    Returns:
        The tokenizer output for the batch. When the module-level `tokenizer`
        reports `is_fast`, a "word_ids" entry is added with one list of word
        ids per example; otherwise the output is returned unchanged.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        return encoded

    # One word-id list per tokenized example in the batch.
    n_examples = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(n_examples)]
    return encoded

# Use batched=True to activate fast multithreading!
# The raw "text" column is removed, so only the tokenizer outputs
# (plus "word_ids" when available) remain in the mapped dataset.
tokenized_datasets = leiko_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

tokenized_datasets

Map:   0%|          | 0/6593 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 6593
    })
})

In [None]:
# tokenizer.model_max_length

In [7]:
# Number of tokens per example after the tokenized corpus is concatenated and
# re-split into fixed-size chunks by `group_texts`.
chunk_size = 128

In [None]:
# # Slicing produces a list of lists for each feature
# tokenized_samples = tokenized_datasets["train"][:50]

# for idx, sample in enumerate(tokenized_samples["input_ids"]):
#     print(f"'>>> Sentence {idx} length: {len(sample)}'")

In [None]:
# concatenated_examples = {
#     k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
# }
# total_length = len(concatenated_examples["input_ids"])
# print(f"'>>> Concatenated reviews length: {total_length}'")

In [None]:
# chunks = {
#     k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
#     for k, t in concatenated_examples.items()
# }

# for chunk in chunks["input_ids"]:
#     print(f"'>>> Chunk length: {len(chunk)}'")

In [8]:
def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Every column of `examples` (a mapping from column name to a list of
    per-example lists) is flattened into one long list and then cut into
    consecutive pieces of `chunk_size` items (module-level setting). The
    trailing remainder that does not fill a whole chunk is dropped.

    Args:
        examples: Batch mapping; must contain an "input_ids" column, and all
            columns are expected to have the same flattened length.

    Returns:
        A dict with the same columns, each a list of chunks, plus a "labels"
        column holding an independent copy of the "input_ids" chunks.
    """
    from itertools import chain

    # Flatten each column in linear time (sum(lists, []) is quadratic).
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column; copy each chunk so that later in-place
    # masking of input_ids cannot leak into the labels.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

In [11]:
# Concatenate and re-chunk every batch of tokenized lines into fixed-size
# blocks; the row count changes accordingly (6593 lines -> 761 chunks in the
# recorded run).
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/6593 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 761
    })
})

In [None]:
# tokenizer.decode(lm_datasets["train"][1]["input_ids"])

In [None]:
# tokenizer.decode(lm_datasets["train"][1]["labels"])

In [None]:
from transformers import DataCollatorForLanguageModeling

# Token-level random-masking collator. `insert_random_mask` and the Accelerate
# training DataLoader further down both use `data_collator`, so it has to be
# defined here (not left commented out) for the notebook to run top to bottom
# on a fresh kernel.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# samples = [lm_datasets["train"][i] for i in range(2)]
# for sample in samples:
#     _ = sample.pop("word_ids")

# for chunk in data_collator(samples)["input_ids"]:
#     print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [9]:
import collections
import numpy as np

from transformers import default_data_collator

# Probability with which each whole word is selected for masking by
# `whole_word_masking_data_collator`.
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate features after randomly masking whole words.

    For every feature, consecutive tokens that share a word id are grouped
    into one word. Each word is then selected independently with probability
    `wwm_probability`; all tokens of a selected word are replaced by
    `tokenizer.mask_token_id` in "input_ids" and keep their original id in
    "labels", while every other label position is set to -100.

    The input features are not modified: masking is applied to copies, and
    the "word_ids" entry is only removed from those copies.

    Args:
        features: Iterable of dicts with "input_ids", "labels" and
            "word_ids" entries (plus any further columns, which are passed
            through unchanged).

    Returns:
        Whatever `default_data_collator` returns for the masked features.
    """
    masked_features = []
    for feature in features:
        # Shallow copy so the caller's dict keeps its "word_ids" entry.
        feature = dict(feature)
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                # A token without a word id separates words. Reset the tracker
                # so the next word always starts a new group, even if its id
                # equals the id of the last word seen (word ids restart in
                # every concatenated sentence).
                current_word = None
                continue
            if word_id != current_word:
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        # Copy before masking so the caller's token list stays intact.
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_index in np.where(mask)[0]:
            for idx in mapping[word_index.item()]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)

    return default_data_collator(masked_features)

In [None]:
# samples = [lm_datasets["train"][i] for i in range(2)]
# batch = whole_word_masking_data_collator(samples)

# for chunk in batch["input_ids"]:
#     print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [12]:
# 700 + 61 = 761, i.e. the split sizes add up to the number of chunks reported
# for lm_datasets["train"] in the recorded run.
# NOTE(review): these are hard-coded; they need adjusting if the corpus or
# chunk_size changes.
train_size = 700
test_size = 61

# Fixed seed so the same split is produced on every run.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 700
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61
    })
})

In [13]:
from transformers import TrainingArguments

batch_size = 32
# Show the training loss with every epoch
# NOTE(review): 700 // 32 = 21, while the recorded run took 132 steps over
# 6 epochs (22 per epoch), so logging fires slightly more often than once per
# epoch — confirm this is intended.
logging_steps = len(downsampled_dataset["train"]) // batch_size
# Last path component of the checkpoint id, used to name the output directory.
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-leiko",
    overwrite_output_dir=True,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
    # Keep the "word_ids" column: whole_word_masking_data_collator pops it
    # from every feature.
    remove_unused_columns=False,
    num_train_epochs=6,
)



In [14]:
from transformers import Trainer

# Trainer wired to the 700/61 split and to the whole-word-masking collator.
# The same collator is used for evaluation, so the evaluation masks are drawn
# randomly each time the eval set is collated.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=whole_word_masking_data_collator,
    tokenizer=tokenizer,
)

In [None]:
# import math

# eval_results = trainer.evaluate()
# print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [15]:
# Run the fine-tuning configured in `training_args` (6 epochs, evaluated every epoch).
trainer.train()

Epoch,Training Loss,Validation Loss
1,4.5704,3.686697
2,3.6733,3.526037
3,3.5139,3.332505
4,3.3701,3.318295
5,3.3137,3.323259
6,3.2595,3.392643


TrainOutput(global_step=132, training_loss=3.5999198826876553, metrics={'train_runtime': 58.0265, 'train_samples_per_second': 72.381, 'train_steps_per_second': 2.275, 'total_flos': 139476372940800.0, 'train_loss': 3.5999198826876553, 'epoch': 6.0})

In [None]:
# eval_results = trainer.evaluate()
# print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [16]:
# Upload the fine-tuned model and training artifacts to the Hugging Face Hub.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

events.out.tfevents.1718467251.f527b9a06272.593.0:   0%|          | 0.00/8.03k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lusxvr/distilbert-base-multilingual-cased-finetuned-leiko/commit/18a8a8c988e63f57516709c7e272862f9ce72b77', commit_message='End of training', commit_description='', oid='18a8a8c988e63f57516709c7e272862f9ce72b77', pr_url=None, pr_revision=None, pr_num=None)

# Fine-tuning with HF Accelerate

In [None]:
def insert_random_mask(batch):
    """Apply the module-level `data_collator` to a batch and return masked columns.

    Args:
        batch: Column-oriented batch (column name -> list of per-row values).

    Returns:
        A dict with one "masked_<column>" entry per column produced by
        `data_collator`, each converted with `.numpy()`.
    """
    # Turn the column-oriented batch into a list of per-row dicts.
    column_names = list(batch)
    rows = []
    for row_values in zip(*batch.values()):
        rows.append(dict(zip(column_names, row_values)))

    collated = data_collator(rows)

    # Create a new "masked" column for each column in the dataset
    masked_columns = {}
    for column, values in collated.items():
        masked_columns["masked_" + column] = values.numpy()
    return masked_columns

In [None]:
# Drop "word_ids" before using the token-level `data_collator`
# (presumably because that collator does not expect the column — the
# commented-out demo cell earlier pops it as well; confirm).
# The membership check keeps this cell re-runnable: without it a second
# execution fails because the column has already been removed.
if "word_ids" in downsampled_dataset["train"].column_names:
    downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])

# Apply the random masks to the test split once, so that the Accelerate loop
# evaluates on the same masked examples in every epoch.
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
# Rename the "masked_*" columns back to the names the model expects.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 32
# Training batches are shuffled and masked on the fly by `data_collator`.
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# The evaluation set was already masked by `insert_random_mask`, so it is only
# collated with the default collator and is not masked again.
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [None]:
# Reload the pretrained checkpoint so the Accelerate run starts from the
# original weights instead of the model fine-tuned by the Trainer above.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters; the learning rate is decayed by the
# scheduler created below.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from accelerate import Accelerator

# Hand model, optimizer and both dataloaders to Accelerate; the returned
# (prepared) objects replace the originals for the rest of the notebook.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 25
# Measured on the prepared dataloader (after accelerator.prepare).
num_update_steps_per_epoch = len(train_dataloader)
# Total optimizer steps; 550 in the recorded run (see the progress bar output).
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule without warmup over the whole training run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import get_full_repo_name

# Note: this rebinds `model_name`, which was previously derived from
# `model_checkpoint` for the Trainer run.
model_name = "distilbert-base-multilingual-cased-finetuned-leiko-accelerate"
# Full Hub repository id ("<namespace>/<model_name>") for the logged-in account.
repo_name = get_full_repo_name(model_name)
repo_name

'lusxvr/distilbert-base-multilingual-cased-finetuned-leiko-accelerate'

In [None]:
# from huggingface_hub import create_repo
# create_repo(repo_name)

In [None]:
from huggingface_hub import Repository

# Clone the Hub repository into a local directory named after the model;
# checkpoints saved there are pushed at the end of every epoch.
# NOTE(review): the repository must already exist on the Hub (the create_repo
# cell above is commented out) — confirm before a fresh run.
output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/lusxvr/distilbert-base-multilingual-cased-finetuned-leiko-accelerate into local empty directory.


In [None]:
from tqdm.auto import tqdm
import torch
import math

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # Repeat the batch loss batch_size times so the gathered tensor has one
        # entry per (nominal) sample.
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    # Cut off the surplus entries so at most len(eval_dataset) values are averaged.
    losses = losses[: len(eval_dataset)]
    # Perplexity = exp(mean loss); fall back to infinity if exp overflows.
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    # Synchronize before saving, then save the unwrapped model.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process saves the tokenizer and pushes to the Hub;
    # blocking=False is passed so the push does not hold up the next epoch.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/550 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 7.1698697970607626
>>> Epoch 1: Perplexity: 6.2363382744889
>>> Epoch 2: Perplexity: 5.521764352198796
>>> Epoch 3: Perplexity: 5.1001092981988405
>>> Epoch 4: Perplexity: 5.019019717155079
>>> Epoch 5: Perplexity: 4.6880871091602385
>>> Epoch 6: Perplexity: 4.6202818443511955
>>> Epoch 7: Perplexity: 4.553966456597788
>>> Epoch 8: Perplexity: 4.397104179381441
>>> Epoch 9: Perplexity: 4.195496540564888
>>> Epoch 10: Perplexity: 4.20063364230886
>>> Epoch 11: Perplexity: 4.070671598278374
>>> Epoch 12: Perplexity: 4.003188464916322
>>> Epoch 13: Perplexity: 3.9835107347080463
>>> Epoch 14: Perplexity: 4.02168265922657
>>> Epoch 15: Perplexity: 3.9279093757454566
>>> Epoch 16: Perplexity: 3.903507136658934
>>> Epoch 17: Perplexity: 3.9217395023624975
>>> Epoch 18: Perplexity: 3.9019262508738204
>>> Epoch 19: Perplexity: 3.820319955201782
>>> Epoch 20: Perplexity: 3.7919887960808127
>>> Epoch 21: Perplexity: 3.7569974240365407
>>> Epoch 22: Perplexity: 3.72856777