# Importing Libraries

In [None]:
!pip install transformers
!pip install accelerate -U



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import transformers

# Training Tokenizer

In [None]:

from tokenizers import ByteLevelBPETokenizer

# Plain-text corpus the tokenizer is trained on.
path = '/content/kant.txt'
tokenizer = ByteLevelBPETokenizer()

# The RoBERTa special tokens must be listed explicitly, in this order, so that
# they are part of the trained vocabulary with ids 0-4. RobertaConfig defaults to
# bos_token_id=0, pad_token_id=1 and eos_token_id=2; if the tokens are missing
# from the vocabulary, RobertaTokenizer appends them after the learned entries
# and the ids no longer line up with the model config.
tokenizer.train(
    files=path,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

# Saving the Tokenizer

In [None]:
import os

# Create and save to the same absolute path so the cell does not depend on the
# current working directory.
model_dir = '/content/Roberta_Transformer'

# exist_ok=True keeps the cell re-runnable: without it a second execution raises
# FileExistsError because the directory is already there.
os.makedirs(model_dir, exist_ok=True)

# Writes vocab.json and merges.txt into model_dir and returns their paths.
tokenizer.save_model(model_dir)


['/content/Roberta_Transformer/vocab.json',
 '/content/Roberta_Transformer/merges.txt']

# Loading Saved Tokenizer

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the vocabulary and merge-rule files saved earlier.
vocab_file = '/content/Roberta_Transformer/vocab.json'
merges_file = '/content/Roberta_Transformer/merges.txt'

tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)
tokenizer

Tokenizer(vocabulary_size=19281, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [None]:
# Show the sub-word pieces the tokenizer produces for a short sample phrase.
hello_encoding = tokenizer.encode('Hello World')
hello_encoding.tokens

['H', 'ell', 'o', 'ĠWorld']

In [None]:
# Display the Encoding object itself (it lists the attributes it carries).
sample_encoding = tokenizer.encode('Hello World')
sample_encoding

Encoding(num_tokens=4, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [None]:
# Wrap every encoded sequence as "<s> ... </s>". BertProcessing takes the
# separator (end) token first and the classifier (start) token second, each as a
# (token, id) pair.
sep_id = tokenizer.token_to_id("</s>")
cls_id = tokenizer.token_to_id("<s>")
if sep_id is None or cls_id is None:
    # token_to_id returns None for tokens that are not in the vocabulary.
    raise ValueError(
        "'<s>' and '</s>' are missing from the tokenizer vocabulary; train the "
        "tokenizer with special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>']."
    )

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", sep_id),
    ("<s>", cls_id),
)
# Truncate encoded sequences to at most 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [None]:
# Shell escape: report the GPU attached to this runtime (driver/CUDA version,
# device name, memory usage) before training.
!nvidia-smi

Tue Aug 15 21:06:17 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P8    12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Configuration of the Model

In [None]:
from transformers import RobertaConfig

# Architecture hyper-parameters, gathered in one place so they are easy to tune.
roberta_hparams = {
    "vocab_size": 52_000,
    "max_position_embeddings": 514,
    "num_attention_heads": 12,
    "num_hidden_layers": 6,
    "type_vocab_size": 1,
}
config = RobertaConfig(**roberta_hparams)

# Print the full configuration, including the values left at library defaults.
print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.31.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}



In [None]:
from transformers import RobertaTokenizer

# `model_max_length` is the tokenizer argument that sets the maximum sequence
# length; a `max_length` keyword passed to from_pretrained() does not set it.
tokenizer = RobertaTokenizer.from_pretrained(
    "/content/Roberta_Transformer",
    model_max_length=512,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from transformers import RobertaForMaskedLM, set_seed

# A model built from a bare config starts from randomly initialised weights, so
# seed the random number generators first to make the starting point (and hence
# the training run) reproducible.
set_seed(42)

model = RobertaForMaskedLM(config=config)
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

# Building Datasets

In [None]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="/content/kant.txt",
    block_size=128,
)



CPU times: user 27.9 s, sys: 347 ms, total: 28.3 s
Wall time: 38.3 s


In [None]:
# Inspect the first five tokenised examples.
first_examples = dataset.examples[:5]
first_examples

[{'input_ids': tensor([19281,   798,  1185,  1211,  9025,   266,   486,  2222,   266,  1379,
           1265,    12,   378,  4591,  4036, 19282])},
 {'input_ids': tensor([19281,  1556,  1992,   296,   343,   263,   785,   266,  3675,  4568,
            431,   506,  3866,   300,   356, 19282])},
 {'input_ids': tensor([19281,  7865,   506,  5984,  4320,    14,   221,  2778,   505,  1883,
            301,    12,  1155,   301,  2382,   367, 19282])},
 {'input_ids': tensor([19281,   264,    13,   488,   301,   462,   263,  2167,   266,   263,
           1185,  1211,  3902,  3620, 19282])},
 {'input_ids': tensor([19281,   871,   339,  1992,   367,  5365,   431,  5687,    14,  3587,
             14,  7048, 19282])}]

In [None]:
# Round-trip check: turn one example's token ids back into text.
sample_ids = dataset.examples[555]['input_ids']
tokenizer.decode(sample_ids)

'<s> the teachings of experience. It deals with mere conceptions—not, like </s>'

# Data Collator

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator configured for masked-language-modelling with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
from transformers import Trainer, TrainingArguments

# Training schedule and checkpointing options. Checkpoints go to the same
# directory that already holds the tokenizer files.
training_options = {
    "output_dir": "/content/Roberta_Transformer",
    "overwrite_output_dir": True,
    "num_train_epochs": 5,
    "per_device_train_batch_size": 64,
    "save_steps": 10_000,
    "save_total_limit": 2,
}
training_args = TrainingArguments(**training_options)

# Tie together the model, the training arguments, the masking collator and the dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
%%time
trainer.train()



Step,Training Loss
500,6.5781
1000,5.682
1500,5.1434
2000,4.759
2500,4.5178
3000,4.3341
3500,4.2229
4000,4.1122
4500,4.003
5000,3.905


CPU times: user 34min 44s, sys: 6.53 s, total: 34min 50s
Wall time: 35min 16s


TrainOutput(global_step=9950, training_loss=4.192118882989164, metrics={'train_runtime': 2115.7783, 'train_samples_per_second': 300.854, 'train_steps_per_second': 4.703, 'total_flos': 3252907439698176.0, 'train_loss': 4.192118882989164, 'epoch': 5.0})

# Saving Model

In [None]:
# Persist the trained model next to the tokenizer files.
trainer.save_model(output_dir="/content/Roberta_Transformer")


In [None]:
from transformers import pipeline

# The model and the tokenizer are both loaded from the same saved directory.
saved_dir = "/content/Roberta_Transformer"
fill_mask = pipeline("fill-mask", model=saved_dir, tokenizer=saved_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Ask the trained model to fill in the <mask> position of a sample sentence.
masked_sentence = 'Friedrich Nietzsche is known for his concept of the <mask> of God.'
fill_mask(masked_sentence)

[{'score': 0.041106972843408585,
  'token': 450,
  'token_str': ' law',
  'sequence': 'Friedrich Nietzsche is known for his concept of the law of God.'},
 {'score': 0.021305089816451073,
  'token': 733,
  'token_str': ' idea',
  'sequence': 'Friedrich Nietzsche is known for his concept of the idea of God.'},
 {'score': 0.017676010727882385,
  'token': 12,
  'token_str': ',',
  'sequence': 'Friedrich Nietzsche is known for his concept of the, of God.'},
 {'score': 0.016099732369184494,
  'token': 636,
  'token_str': ' world',
  'sequence': 'Friedrich Nietzsche is known for his concept of the world of God.'},
 {'score': 0.010970601812005043,
  'token': 504,
  'token_str': ' will',
  'sequence': 'Friedrich Nietzsche is known for his concept of the will of God.'}]