In [None]:
!pip install torch
!pip install tokenizers
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 29.7 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 32.3 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 64.2 MB/s 
Installing collected packages: huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 transformers-4.2

In [None]:
# Target WordPiece vocabulary size; also embedded in the saved vocab file name.
vocab_length = 30_522

In [None]:
import tokenizers

# Configure a BERT-style WordPiece tokenizer to be trained from scratch.
# `add_special_tokens=True` is intentionally not passed: per the original note it
# is not accepted by the latest BertWordPieceTokenizer.
wordpiece_options = {
    'unk_token': '[UNK]',
    'sep_token': '[SEP]',
    'cls_token': '[CLS]',
    'clean_text': True,
    'handle_chinese_chars': True,
    'strip_accents': True,
    'lowercase': True,
    'wordpieces_prefix': '##',
}
roman_BWPT = tokenizers.BertWordPieceTokenizer(**wordpiece_options)

# Learn the vocabulary from the corpus file in the working directory.
special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[MASK]', '[SEP]']
roman_BWPT.train(
    files=["roman.txt"],
    vocab_size=vocab_length,
    min_frequency=3,
    limit_alphabet=1000,
    show_progress=True,
    special_tokens=special_tokens,
)

# Persist the vocabulary into the current directory; the call is left as the
# cell's last expression so the list of written paths is displayed.
vocab_prefix = f"roman-urdu{vocab_length}"
roman_BWPT.save_model(".", vocab_prefix)

['./roman-urdu30522-vocab.txt']

In [None]:
# Load the tokenizer
from transformers import BertTokenizer, LineByLineTextDataset

# Use the vocabulary written by `roman_BWPT.save_model(".", "roman-urdu<vocab_length>")`
# in the training cell, so the model is trained with the tokenizer learned on this
# corpus rather than with whatever unrelated `vocab.txt` is in the working directory.
vocab_file_dir = f'roman-urdu{vocab_length}-vocab.txt'

# Build the tokenizer straight from the vocab file (loading a single file through
# `from_pretrained` is deprecated). Lower-casing mirrors the `lowercase=True`
# setting used when the vocabulary was trained.
tokenizer = BertTokenizer(vocab_file=vocab_file_dir, do_lower_case=True)

# Sanity check: show how a sample sentence is split into word pieces.
sentence = 'ppp chill pmln lorey lag gayi har tarah'

encoded_input = tokenizer.tokenize(sentence)
print(encoded_input)

['pp', '##p', 'chill', 'pm', '##ln', 'lore', '##y', 'la', '##g', 'gay', '##i', 'ha', '##r', 'tara', '##h']




In [None]:
# Tokenize the corpus into training examples for masked-language modelling.
# The corpus is read from the same relative path the tokenizer was trained on,
# instead of a hard-coded Colab-only absolute path.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='roman.txt',
    block_size=128,  # maximum sequence length
)

print('No. of lines: ', len(dataset))  # number of examples in the dataset



No. of lines:  87883


In [None]:
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling

# Size the embedding matrix and MLM output layer from the tokenizer actually in
# use. A hard-coded value larger than the vocabulary allocates rows for token ids
# the tokenizer can never emit and inflates the parameter count.
config = BertConfig(
    vocab_size=len(tokenizer),
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=512,
)

# Freshly initialised (untrained) BERT with a masked-language-modelling head.
model = BertForMaskedLM(config)
print('No of parameters: ', model.num_parameters())

# Batches examples and applies random masking for the MLM objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

No of parameters:  81965648


In [None]:
from transformers import Trainer, TrainingArguments

# Run settings gathered in one place so they are easy to scan and tweak.
train_settings = {
    'output_dir': 'model_dir',
    'overwrite_output_dir': True,
    'num_train_epochs': 1,
    'per_device_train_batch_size': 32,
    'save_steps': 10_000,
    'save_total_limit': 2,
}
training_args = TrainingArguments(**train_settings)

# Wire the model, dataset and MLM collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
%%time
# NOTE: the %%time cell magic must remain the first line of this cell; it reports
# CPU and wall-clock time for everything below.
# Run training with the Trainer built in the previous cell, then save the final
# model (config + weights) into 'model_dir' so it can be reloaded later.
trainer.train()
trainer.save_model('model_dir')

***** Running training *****
  Num examples = 87883
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2747


Step,Training Loss
500,7.2596
1000,6.4143
1500,6.1829
2000,6.0617
2500,5.9791




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to model_dir
Configuration saved in model_dir/config.json
Model weights saved in model_dir/pytorch_model.bin


CPU times: user 17min 45s, sys: 4.16 s, total: 17min 49s
Wall time: 17min 58s


In [None]:
from transformers import pipeline

# Reload the model from the saved 'model_dir' directory (rebinding `model`),
# so inference uses exactly what was written to disk.
model = BertForMaskedLM.from_pretrained('model_dir')

# Fill-mask inference helper built from the reloaded model and the tokenizer.
fill_mask = pipeline(task="fill-mask", model=model, tokenizer=tokenizer)

loading configuration file model_dir/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}

loading weights file model_dir/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at model_dir.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForMaskedLM for pr

In [None]:
# Query the model for candidates to fill the masked first word of the sentence.
# The call is the cell's last expression so the predictions are displayed.
masked_sentence = ' [MASK] ka shukar pti won'
fill_mask(masked_sentence)

[{'score': 0.031184498220682144,
  'token': 16455,
  'token_str': 'a l l a h',
  'sequence': 'allah ka shukar pti won'},
 {'score': 0.019820570945739746,
  'token': 9706,
  'token_str': 'a p',
  'sequence': 'ap ka shukar pti won'},
 {'score': 0.019775265827775,
  'token': 1038,
  'token_str': 'b',
  'sequence': 'b ka shukar pti won'},
 {'score': 0.018191535025835037,
  'token': 6300,
  'token_str': 'y e',
  'sequence': 'ye ka shukar pti won'},
 {'score': 0.01712663099169731,
  'token': 2909,
  'token_str': 's i r',
  'sequence': 'sir ka shukar pti won'}]

In [None]:
#

In [None]:
#

***** Running training *****
  Num examples = 87883
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2747


Step,Training Loss


Step,Training Loss


KeyboardInterrupt: ignored