<a href="https://colab.research.google.com/github/NLP613-Metaplexia/assignment3/blob/main/bert_pre_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [None]:
import torch
from prettytable import PrettyTable
from datasets import load_dataset, DatasetDict
from evaluate import (evaluator, load)
from transformers import (AutoTokenizer,
                          AutoConfig,
                          AutoModelForCausalLM,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          Trainer,
                          TrainingArguments,
                        )

from huggingface_hub import notebook_login

notebook_login()
# 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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
context_length = 128


# Utils

In [None]:
def my_preplexity(model_name):
    perplexity = load("perplexity", module_type="metric")
    input_texts = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"][:10] # doctest: +SKIP # limiting the sample to 10
    input_texts = [s for s in input_texts if s!='']
    results = perplexity.compute(model_id=model_name,
                                predictions=input_texts,
                                add_start_token=False)
    # Print the mean perplexity value rounded to 2 decimal places
    print(round(results["mean_perplexity"], 2)) # doctest: +SKIP


#_______________________________________________________________________________________________
# > https://stackoverflow.com/questions/68058647/initialize-huggingface-bert-with-random-weights
def randomize_model(model):
    for module_ in model.named_modules():
        if isinstance(module_[1],(torch.nn.Linear, torch.nn.Embedding)):
            module_[1].weight.data.normal_(mean=0.0, std=model.config.initializer_range)
        elif isinstance(module_[1], torch.nn.LayerNorm):
            module_[1].bias.data.zero_()
            module_[1].weight.data.fill_(1.0)
        if isinstance(module_[1], torch.nn.Linear) and module_[1].bias is not None:
            module_[1].bias.data.zero_()
    return model

def model_size_and_parameters(model):
    table = PrettyTable(["Modules", "Parameters"])
    model_size = sum(t.numel() for t in model.parameters())
    print(f"bert-base-uncased size: {model_size/1000**2:.1f}M parameters")
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params


def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
        padding="max_length",
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        # if length == context_length:
        input_batch.append(input_ids)
    return {"input_ids": input_batch}



# Bert Base Uncased

In [None]:
config = AutoConfig.from_pretrained("bert-base-uncased")
model = AutoModelForCausalLM.from_config(config)

my_preplexity("bert-base-uncased")

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

738800.38


In [None]:
model_size_and_parameters(model)

bert-base-uncased size: 109.5M parameters
+---------------------------------------------------------+------------+
|                         Modules                         | Parameters |
+---------------------------------------------------------+------------+
|          bert.embeddings.word_embeddings.weight         |  23440896  |
|        bert.embeddings.position_embeddings.weight       |   393216   |
|       bert.embeddings.token_type_embeddings.weight      |    1536    |
|             bert.embeddings.LayerNorm.weight            |    768     |
|              bert.embeddings.LayerNorm.bias             |    768     |
|     bert.encoder.layer.0.attention.self.query.weight    |   589824   |
|      bert.encoder.layer.0.attention.self.query.bias     |    768     |
|      bert.encoder.layer.0.attention.self.key.weight     |   589824   |
|       bert.encoder.layer.0.attention.self.key.bias      |    768     |
|     bert.encoder.layer.0.attention.self.value.weight    |   589824   |
|      be

109514298

# randomized model

In [None]:
rand_model = randomize_model(model)
rand_model.push_to_hub("rand_model", use_auth_token="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")
tokenizer.push_to_hub("rand_model", use_auth_token="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")

my_preplexity("temporary0-0name/rand_model")



If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


  0%|          | 0/1 [00:00<?, ?it/s]

31341.62


# Process Data

In [None]:
ds_train = load_dataset("wikitext","wikitext-2-raw-v1",split="train")
ds_valid = load_dataset("wikitext","wikitext-2-raw-v1",split="validation")

raw_datasets = DatasetDict(
    {
        "train": ds_train,  # .shuffle().select(range(50000)),
        "valid": ds_valid,  # .shuffle().select(range(500))
    }
)

tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)

print(f'Raw_datasets',raw_datasets)
print(f'Tokenized_datasets',tokenized_datasets)

Raw_datasets DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})
Tokenized_datasets DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 46621
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 4783
    })
})


In [None]:
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


# Train the Model

In [None]:
# Optimal hyperparameters

learning_rate, batch, epoch, weigh_decay = (0.0003, 64, 10, 0.1)
learning_rate, batch, epoch, weigh_decay = (0.0003, 32, 10, 0.1)

In [None]:


args = TrainingArguments(
    output_dir="run_opt",
    per_device_train_batch_size=batch,
    per_device_eval_batch_size=batch,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    gradient_accumulation_steps=8,
    num_train_epochs=epoch,
    weight_decay=weigh_decay,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    learning_rate=learning_rate,
    save_steps=100,
    fp16=True,
    push_to_hub=True,
    load_best_model_at_end=True,

)

trainer = Trainer(
    model=rand_model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    callbacks=[early_stopping],
)
trainer.train()
trainer.push_to_hub()

my_preplexity("temporary0-0name/run_opt")

Step,Training Loss,Validation Loss
100,7.6252,6.411256
200,4.839,2.038549
300,0.9137,0.310766
400,0.171,0.087743
500,0.0542,0.039603
600,0.025,0.024211
700,0.0148,0.017986
800,0.0098,0.014752
900,0.0077,0.012991
1000,0.006,0.012051


Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

5.02
