# Notebook Card: Pre-Training Bert model
* * *

This notebook describes how to pre-train `Bert` model for the dataset `wikitext-2-raw-v1`.


## Importing required *packages*

In [None]:
# Import the os module to interact with the operating system
import os

# Set the CUDA_VISIBLE_DEVICES environment variable to "1"
# This specifies which GPU device(s) should be visible and accessible to the program.
# In this case, it is set to GPU device with index 1.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


In [None]:
!pip install dataset
!pip install evaluate
!pip install transformers
!pip install prettytable

Collecting dataset
  Obtaining dependency information for dataset from https://files.pythonhosted.org/packages/9f/4d/f74a514b5c4efb5c1546160597715cd6096273d7173b36a3187d2afb663a/dataset-1.6.2-py2.py3-none-any.whl.metadata
  Downloading dataset-1.6.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Obtaining dependency information for sqlalchemy<2.0.0,>=1.3.2 from https://files.pythonhosted.org/packages/ce/4b/b5afbaf063b18bd6decffe8d64184ca5ecb25cc6be2ffc1cd8664ac3a5c2/SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10.0 kB)
Collecting alembic>=0.6.2 (from dataset)
  Obtaining dependency information for alembic>=0.6.2 from https://files.pythonhosted.org/packages/34/47/95d8f99c9f4a57079dfbcff5e023c5d81bde092d1c2354156340a56b3a1a/alembic

In [None]:
# Import necessary libraries
import torch
from prettytable import PrettyTable
from datasets import load_dataset, DatasetDict
from evaluate import (evaluator, load)
from transformers import (AutoTokenizer,
                          AutoConfig,
                          AutoModelForCausalLM,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          Trainer,
                          TrainingArguments,
                        )

# Import Hugging Face Hub for notebook authentication
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub using the provided token
# Replace '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' with your actual Hugging Face Hub token
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Fixing Random Seeds

In [None]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True

In [None]:
# Load the pre-trained BERT tokenizer for the 'bert-base-uncased' model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Add a special token for padding with the label '[PAD]'
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Create a data collator for language modeling using the specified tokenizer and disable masking (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Create an EarlyStoppingCallback with a patience of 3 epochs for early stopping during training
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Set the context length to 128 (this may be used as a parameter in subsequent parts of the code)
context_length = 128


# Utils

This section contains the utility functions like




## Perplexity

In [None]:
def my_preplexity(model_name):
    perplexity = load("perplexity", module_type="metric")
    input_texts = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"][:10] # doctest: +SKIP # limiting the sample to 10
    input_texts = [s for s in input_texts if s!='']
    results = perplexity.compute(model_id=model_name,
                                predictions=input_texts,
                                add_start_token=False)
    # Print the mean perplexity value rounded to 2 decimal places
    print(round(results["mean_perplexity"], 2)) # doctest: +SKIP

## Randomize Model

In [None]:
#_______________________________________________________________________________________________
# > https://stackoverflow.com/questions/68058647/initialize-huggingface-bert-with-random-weights
def randomize_model(model):
    # Iterate through all modules in the model
    for module_ in model.named_modules():
        # Check if the module is an instance of Linear or Embedding layer
        if isinstance(module_[1], (torch.nn.Linear, torch.nn.Embedding)):
            # Randomize the weights of Linear and Embedding layers
            module_[1].weight.data.normal_(mean=0.0, std=model.config.initializer_range)
        # Check if the module is an instance of LayerNorm
        elif isinstance(module_[1], torch.nn.LayerNorm):
            # Zero out the bias and set the weight to 1.0 for LayerNorm
            module_[1].bias.data.zero_()
            module_[1].weight.data.fill_(1.0)
        # Check if the module is an instance of Linear and has a bias
        if isinstance(module_[1], torch.nn.Linear) and module_[1].bias is not None:
            # Zero out the bias for Linear layers
            module_[1].bias.data.zero_()

    # Return the modified model
    return model

## Size and Parameter Calculation

In [None]:
def model_size_and_parameters(model):
    # Create a PrettyTable for displaying module-wise parameter information
    table = PrettyTable(["Modules", "Parameters"])

    # Calculate the total number of parameters in the model
    model_size = sum(t.numel() for t in model.parameters())

    # Print the total size of the model in megabytes
    print(f"bert-base-uncased size: {model_size/1000**2:.1f}M parameters")

    # Initialize a variable to keep track of the total trainable parameters
    total_params = 0

    # Iterate through named parameters of the model
    for name, parameter in model.named_parameters():
        # Check if the parameter requires gradient (i.e., is trainable)
        if not parameter.requires_grad:
            continue

        # Get the number of parameters in the current module
        params = parameter.numel()

        # Add a row to the PrettyTable with module name and number of parameters
        table.add_row([name, params])

        # Increment the total trainable parameters
        total_params += params

    # Print the PrettyTable with module-wise parameter information
    print(table)

    # Print the total number of trainable parameters in the model
    print(f"Total Trainable Params: {total_params}")

    # Return the total number of trainable parameters
    return total_params


## Tokenize

In [None]:
def tokenize(element):
    # Tokenize the input text using the specified tokenizer
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
        padding="max_length",
    )

    # Initialize an empty list to store the tokenized input batches
    input_batch = []

    # Iterate over the tokenized outputs and extract input_ids
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        # Append the input_ids to the input_batch list
        input_batch.append(input_ids)

    # Return a dictionary with the tokenized input batch
    return {"input_ids": input_batch}

# Bert Base Uncased

The bert-base-uncased model is being taken from PyTorch library using `AutoConfig` and CausalLM model is defined for it.

In [None]:
# Load the configuration for the 'bert-base-uncased' model
config = AutoConfig.from_pretrained("bert-base-uncased")

# Create a language model using the configuration
model = AutoModelForCausalLM.from_config(config)

# Assuming 'my_preplexity' is a function defined elsewhere in your code
# and it takes a model name (e.g., "bert-base-uncased") as an argument
# and calculates and prints perplexity for that model.
my_preplexity("bert-base-uncased")


If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

738800.38


# Randomized model

The model picked from PyTorch library has been intialised with random weights to pre train the model.

In [None]:
# Assuming 'randomize_model' is a function defined elsewhere in your code
# that randomizes the weights of the given model and returns the modified model
rand_model = randomize_model(model)

# Push the randomized model to the Hugging Face Hub with the repository name "rand_model"
# Use the provided authentication token for authentication
rand_model.push_to_hub("run_opt1", use_auth_token="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")

# Push the tokenizer to the Hugging Face Hub with the repository name "rand_model"
# Use the provided authentication token for authentication
tokenizer.push_to_hub("run_opt1", use_auth_token="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")

# Assuming 'my_preplexity' is a function defined elsewhere in your code
# that takes a model name (e.g., "temporary0-0name/rand_model") as an argument
# and calculates and prints perplexity for the specified model.
my_preplexity("temporary0-0name/run_opt1")



pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


Downloading tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [32468.548828125, 38314.11328125, 34559.36328125, 20465.455078125, 30900.619140625], 'mean_perplexity': 31341.619921875}
31341.62


# Process Data

The dataset has been split for training and validation.

In [None]:
ds_train = load_dataset("wikitext","wikitext-2-raw-v1",split="train")
ds_valid = load_dataset("wikitext","wikitext-2-raw-v1",split="validation")

raw_datasets = DatasetDict(
    {
        "train": ds_train,  # .shuffle().select(range(50000)),
        "valid": ds_valid,  # .shuffle().select(range(500))
    }
)

tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)

print(f'Raw_datasets',raw_datasets)
print(f'Tokenized_datasets',tokenized_datasets)

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Raw_datasets DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})
Tokenized_datasets DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 46621
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 4783
    })
})


In [None]:
# Assuming 'data_collator' is a DataCollatorForLanguageModeling instance created earlier
# and 'tokenized_datasets' is a dictionary of tokenized datasets
# This code selects the first 5 elements from the tokenized training dataset
# and collates them using the 'data_collator' to prepare the input batch for training

# Create a list of the first 5 tokenized training examples
selected_tokenized_examples = [tokenized_datasets["train"][i] for i in range(5)]

# Use the 'data_collator' to collate the selected tokenized examples into a batch
out = data_collator(selected_tokenized_examples)

# Print the shapes of different components in the collated batch
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


# Train the Model

The model is trained using grid search to obtain the optimal parameters. The hyper-parameters considered for tuning are `learning_rate`, `batch_size`, `epochs` and `weight_decay`. The optimal parameters are as follows:

In [None]:
# Optimal hyperparameters

learning_rate, batch, epoch, weigh_decay = (0.0003, 64, 10, 0.1)
learning_rate, batch, epoch, weigh_decay = (0.0003, 32, 10, 0.1)

## Train and Validation loss

The train and validation loss for the model are calculated for each epoch.


In [None]:
# Define training arguments for the Trainer
args = TrainingArguments(
    output_dir="run_opt1",
    per_device_train_batch_size=batch,
    per_device_eval_batch_size=batch,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=weigh_decay,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    learning_rate=learning_rate,
    save_steps=100,
    fp16=True,
    push_to_hub=True,
    load_best_model_at_end=True,
)

# # Create a Trainer instance for training the model
# # Load the configuration for the 'bert-base-uncased' model
config = AutoConfig.from_pretrained("temporary0-0name/run_opt1")

# # Create a language model using the configuration
rand_model = AutoModelForCausalLM.from_config(config)
for ep in range(10):
    trainer = Trainer(
        model=rand_model,  # Use the randomized model
        tokenizer=tokenizer,
        args=args,
        data_collator=data_collator,  # Assuming 'data_collator' is defined elsewhere
        train_dataset=tokenized_datasets["train"],  # Assuming 'tokenized_datasets' is defined elsewhere
        eval_dataset=tokenized_datasets["valid"],  # Assuming 'tokenized_datasets' is defined elsewhere
        callbacks=[early_stopping],  # List of callbacks, including early stopping
    )

    # Train the model using the Trainer
    trainer.train()

    # Push the trained model and tokenizer to the Hugging Face Hub
    trainer.push_to_hub()
    print(f'Epoch {ep} completed : perplexity ',end=' ')
    my_preplexity("temporary0-0name/run_opt1")
# Assuming 'my_preplexity' is a function defined elsewhere in your code
# that takes a model name (e.g., "temporary0-0name/run_opt") as an argument
# and calculates and prints perplexity for the specified model.



If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


Step,Training Loss,Validation Loss
100,7.6929,6.49369


Epoch 0 completed : perplexity  

Downloading config.json:   0%|          | 0.00/672 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [2807.3671875, 793.2745361328125, 881.2412719726562, 16.042972564697266, 22.09195327758789], 'mean_perplexity': 904.0035842895508}
904.0


Step,Training Loss,Validation Loss
100,6.1137,5.325172


Epoch 1 completed : perplexity  

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [283.079345703125, 368.9197998046875, 430.2932434082031, 3.576266288757324, 6.355835437774658], 'mean_perplexity': 218.44489812850952}
218.44


Step,Training Loss,Validation Loss
100,3.8451,1.905889


Epoch 2 completed : perplexity  

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [9.577826499938965, 38.664329528808594, 36.60551834106445, 1.1674799919128418, 2.243891954421997], 'mean_perplexity': 17.65180926322937}
17.65


Step,Training Loss,Validation Loss
100,1.4776,0.751509


Epoch 3 completed : perplexity  

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [2.3580613136291504, 20.175796508789062, 18.474390029907227, 1.0346851348876953, 1.1486788988113403], 'mean_perplexity': 8.638322377204895}
8.64


Step,Training Loss,Validation Loss
100,0.5631,0.30732


Epoch 4 completed : perplexity  

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [1.1908371448516846, 15.733183860778809, 15.048104286193848, 1.0019620656967163, 1.009572148323059], 'mean_perplexity': 6.796731901168823}
6.8


Step,Training Loss,Validation Loss
100,0.2116,0.148655


Epoch 5 completed : perplexity  

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [1.0319533348083496, 17.137741088867188, 12.95411491394043, 1.000580906867981, 1.0026648044586182], 'mean_perplexity': 6.625411009788513}
6.63


Step,Training Loss,Validation Loss
100,0.0848,0.094611


Epoch 6 completed : perplexity  

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [1.0084221363067627, 16.63326644897461, 13.678961753845215, 1.0002695322036743, 1.0009080171585083], 'mean_perplexity': 6.664365577697754}
6.66


Step,Training Loss,Validation Loss
100,0.0384,0.068193


Epoch 7 completed : perplexity  

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [1.0037353038787842, 17.505964279174805, 13.907333374023438, 1.0001869201660156, 1.0004057884216309], 'mean_perplexity': 6.883525133132935}
6.88


Step,Training Loss,Validation Loss
100,0.0202,0.063625


Epoch 8 completed : perplexity  

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [1.0027633905410767, 25.743392944335938, 13.560998916625977, 1.0003864765167236, 1.0005950927734375], 'mean_perplexity': 8.46162736415863}
8.46


Step,Training Loss,Validation Loss
100,0.0126,0.054044


Epoch 9 completed : perplexity  

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [1.0010924339294434, 19.15555191040039, 18.63957977294922, 1.000120997428894, 1.0001838207244873], 'mean_perplexity': 8.159305787086486}
8.16


| Epoch | Perplexity |
|-------|------------|
| 1     | 904.00     |
| 2     | 218.44     |
| 3     | 17.65      |
| 4     | 8.63       |
| 5     | 6.8        |
| 6     | 6.63       |
| 7     | 6.66       |
| 8     | 6.88       |
| 9     | 8.46       |
| 10    | 8.16       |

# Parameter Calculation

The total size of model and the number of parameters for each layer are calculate using `model_size_and_parameters()` function and the output is as follows

In [None]:
model_size_and_parameters(model) #calculate parameters of model

bert-base-uncased size: 109.5M parameters
+---------------------------------------------------------+------------+
|                         Modules                         | Parameters |
+---------------------------------------------------------+------------+
|          bert.embeddings.word_embeddings.weight         |  23440896  |
|        bert.embeddings.position_embeddings.weight       |   393216   |
|       bert.embeddings.token_type_embeddings.weight      |    1536    |
|             bert.embeddings.LayerNorm.weight            |    768     |
|              bert.embeddings.LayerNorm.bias             |    768     |
|     bert.encoder.layer.0.attention.self.query.weight    |   589824   |
|      bert.encoder.layer.0.attention.self.query.bias     |    768     |
|      bert.encoder.layer.0.attention.self.key.weight     |   589824   |
|       bert.encoder.layer.0.attention.self.key.bias      |    768     |
|     bert.encoder.layer.0.attention.self.value.weight    |   589824   |
|      be

109514298