<a href="https://colab.research.google.com/github/GarimaChopra/Generative_AI/blob/main/Pretraining_a_roBerta_model_ch3_ex1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **KantaiBERT**: How to train a new language model from scratch using Transformers and Tokenizers


In [1]:
#@title Step 1: Loading the dataset

#1.Load kant.txt using the Colab file manager
#2.Downloading the file from GitHub

!curl -L https://raw.githubusercontent.com/PacktPublishing/Transformers-for-Natural-Language-Processing/master/Chapter03/kant.txt --output "kant.txt"




  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10.7M  100 10.7M    0     0  10.5M      0  0:00:01  0:00:01 --:--:-- 10.5M


In [2]:
#@title Step 2: Installing Hugging Face transformers

# We won't need TensorFlow here
!pip uninstall -y tensorflow

# Install `transformers` from master
!pip install git+https://github.com/huggingface/transformers

!pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.9.1
# tokenizers version at notebook update --- 0.7.0



Found existing installation: tensorflow 2.13.0
Uninstalling tensorflow-2.13.0:
  Successfully uninstalled tensorflow-2.13.0
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-a5p8cn41
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-a5p8cn41
  Resolved https://github.com/huggingface/transformers to commit 2629c8f36ac57e546ea45e611536351289fe4944
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers==4.35.0.dev0)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers==4.35.0.dev0)
  Do

In [2]:
#@title Step 3: Training a Tokenizer

# print the CPU and wall times for the entire code
%%time

from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2,
special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])


CPU times: user 8.09 s, sys: 176 ms, total: 8.26 s
Wall time: 5.98 s


In [3]:
#@title Step 4: Saving the files to disk

import os

# Use one relative path both for creating the directory and for saving into
# it, so the cell does not depend on the working directory being /content.
token_dir = 'KantaiBERT'
os.makedirs(token_dir, exist_ok=True)

# Writes vocab.json and merges.txt into token_dir and returns their paths.
tokenizer.save_model(token_dir)


['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

In [4]:
#@title Step 5: Loading the Trained Tokenizer Files

from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the two files written in Step 4.
vocab_file = "./KantaiBERT/vocab.json"
merges_file = "./KantaiBERT/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)



In [5]:
#@title The tokenizer can now encode a sequence
# Encode a sample sentence and display the token strings it was split into.
tokenizer.encode("The Critique of Pure Reason.").tokens


['The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.']

In [6]:
#@title The tokenizer can print the number of tokens in a sequence
# Displaying the Encoding object itself shows its token count (num_tokens)
# and the names of the attributes it carries.
tokenizer.encode("The Critique of Pure Reason.")


Encoding(num_tokens=6, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [7]:
# Look up the ids of the two boundary tokens in the trained vocabulary.
sep_token = ("</s>", tokenizer.token_to_id("</s>"))  # SEP token
cls_token = ("<s>", tokenizer.token_to_id("<s>"))    # CLS token

# Post-process every encoding so it is wrapped as <s> ... </s>.
tokenizer._tokenizer.post_processor = BertProcessing(sep_token, cls_token)

# Cap encodings at 512 tokens.
tokenizer.enable_truncation(max_length=512)


In [8]:
# Re-encode the same sentence: with the post-processor in place the tokens
# are now wrapped in <s> ... </s>.
tokenizer.encode("The Critique of Pure Reason.").tokens


['<s>', 'The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.', '</s>']

In [9]:
#@title Step 6: Checking Resource Constraints: GPU and NVIDIA

# Show the GPU attached to this runtime, its driver/CUDA versions and its
# current memory usage.
!nvidia-smi


Mon Oct  9 08:23:17 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
#@title Checking that PyTorch Sees CUDA
import torch
# True when PyTorch can use a CUDA device in this runtime.
torch.cuda.is_available()


True

In [11]:
#@title Step 7: Defining the configuration of the Model

from transformers import RobertaConfig

# Model hyper-parameters, collected in one place before building the config.
# vocab_size repeats the value the tokenizer was trained with in Step 3.
config_kwargs = {
    "vocab_size": 52_000,
    "max_position_embeddings": 514,
    "num_attention_heads": 12,
    "num_hidden_layers": 6,
    "type_vocab_size": 1,
}
config = RobertaConfig(**config_kwargs)


In [12]:
#@title Step 8: Re-creating the Tokenizer in Transformers

from transformers import RobertaTokenizer

# Reload the trained vocabulary/merges as a Transformers tokenizer.
# `model_max_length` is the tokenizer attribute that records the maximum
# sequence length; `max_length` is a per-call encoding argument and does not
# set that limit when passed to from_pretrained.
tokenizer = RobertaTokenizer.from_pretrained("./KantaiBERT", model_max_length=512)


In [13]:
#@title Step 9: Initializing a Model From Scratch

from transformers import RobertaForMaskedLM, set_seed

# The model is built from the config alone, so its weights are randomly
# initialised. Seed the random number generators first so that a fresh
# "Restart and run all" starts from the same weights every time.
set_seed(42)

model = RobertaForMaskedLM(config=config)

# Print the module tree of the freshly initialised model.
print(model)


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [14]:
# Total number of parameters in the model.
print(model.num_parameters())



83504416


In [15]:
#@title Exploring the Parameters

# Collect the model's parameter tensors in a list so they can be counted
# here and iterated over in the next cell.
LP = list(model.parameters())
lp = len(LP)
print(lp)


106


In [16]:
# Print every parameter tensor collected in LP, in order.
for tensor in LP:
    print(tensor)


Parameter containing:
tensor([[-0.0071, -0.0127, -0.0111,  ...,  0.0389,  0.0050,  0.0299],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0172, -0.0153,  0.0309,  ..., -0.0027, -0.0210,  0.0208],
        ...,
        [-0.0225,  0.0066,  0.0001,  ...,  0.0163,  0.0207,  0.0122],
        [ 0.0056,  0.0007,  0.0073,  ...,  0.0055, -0.0237,  0.0001],
        [-0.0072,  0.0307,  0.0215,  ...,  0.0020,  0.0286,  0.0100]],
       requires_grad=True)
Parameter containing:
tensor([[ 0.0316, -0.0198, -0.0078,  ..., -0.0085, -0.0179,  0.0189],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0032,  0.0084, -0.0217,  ...,  0.0086,  0.0118,  0.0070],
        ...,
        [ 0.0350, -0.0141, -0.0119,  ..., -0.0039, -0.0107,  0.0208],
        [ 0.0242, -0.0088, -0.0174,  ..., -0.0117, -0.0011, -0.0153],
        [ 0.0050,  0.0239, -0.0123,  ..., -0.0085,  0.0324,  0.0237]],
       requires_grad=True)
Parameter containing:
tensor([[ 2.

In [17]:
#@title Step 10: Building the Dataset

%%time
from transformers import LineByLineTextDataset

# Tokenize kant.txt into the training dataset using the `tokenizer` bound in
# Step 8. block_size=128 is the length limit handed to the dataset
# (presumably in tokens per example -- confirm against the library docs).
# The recorded run took roughly half a minute.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./kant.txt",
    block_size=128,
)




CPU times: user 25.2 s, sys: 487 ms, total: 25.7 s
Wall time: 29.1 s


In [18]:
#@title Step 11: Defining a Data Collator

from transformers import DataCollatorForLanguageModeling

# Collator for masked language modelling: mlm=True with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)


In [27]:
import accelerate
import transformers

# Display the installed versions as a (transformers, accelerate) pair.
transformers.__version__, accelerate.__version__

('4.35.0.dev0', '0.23.0')

In [30]:
! pip install accelerate -U



In [19]:


#@title Step 12: Initializing the Trainer

from transformers import Trainer, TrainingArguments

# Settings for the pre-training run: a single epoch with 64 examples per
# device per step. Checkpoints go to ./KantaiBERT (overwriting earlier
# contents), every 10,000 steps, keeping at most two.
training_args = TrainingArguments(
    output_dir="./KantaiBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

# Tie together the model, the training settings, the dataset from Step 10
# and the masking collator from Step 11.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)



In [20]:
#@title Step 13: Pre-training the Model

%%time
# Run the training configured in Step 12. The value returned by train() is
# displayed as the cell result (global step, training loss, run metrics).
trainer.train()


Step,Training Loss
500,6.5921
1000,5.7332
1500,5.2511
2000,5.0063
2500,4.8427


CPU times: user 10min 5s, sys: 2.53 s, total: 10min 7s
Wall time: 10min 17s


TrainOutput(global_step=2672, training_loss=5.440966783169501, metrics={'train_runtime': 617.2819, 'train_samples_per_second': 276.963, 'train_steps_per_second': 4.329, 'total_flos': 873939262999296.0, 'train_loss': 5.440966783169501, 'epoch': 1.0})

In [21]:
#@title Step 14: Saving the Final Model(+tokenizer + config) to disk

# Save the trained model (weights + config) to the output directory.
trainer.save_model("./KantaiBERT")

# The Trainer in Step 12 was created without a `tokenizer`, so save_model does
# not write the tokenizer. Save it explicitly so the directory really holds
# model + tokenizer + config, as this step's title says. The trailing `;`
# suppresses the returned tuple of file paths.
tokenizer.save_pretrained("./KantaiBERT");


In [22]:
#@title Step 15: Language Modeling with the FillMaskPipeline

from transformers import pipeline

# Model and tokenizer are both loaded from the training output directory.
kantaibert_dir = "./KantaiBERT"
fill_mask = pipeline("fill-mask", model=kantaibert_dir, tokenizer=kantaibert_dir)

# Ask the trained model for its scored candidates for the masked word.
fill_mask("Human thinking involves human <mask>.")



[{'score': 0.04720856994390488,
  'token': 394,
  'token_str': ' reason',
  'sequence': 'Human thinking involves human reason.'},
 {'score': 0.020105119794607162,
  'token': 535,
  'token_str': ' experience',
  'sequence': 'Human thinking involves human experience.'},
 {'score': 0.011545337736606598,
  'token': 396,
  'token_str': ' object',
  'sequence': 'Human thinking involves human object.'},
 {'score': 0.011209112592041492,
  'token': 584,
  'token_str': ' intuition',
  'sequence': 'Human thinking involves human intuition.'},
 {'score': 0.010527078062295914,
  'token': 604,
  'token_str': ' understanding',
  'sequence': 'Human thinking involves human understanding.'}]