## **Creation of a masked language model for Hindi**

Date: 01-Dec-2021 \
Author: Vivek Jayaswal

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
# Mount Google Drive into the Colab runtime; the working directory used by
# the rest of the notebook lives underneath this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Change the present working directory to the parent directory for WikiText
import os
os.chdir('/content/drive/MyDrive/Data/')
!ls

### **Section 1: Generate BPE tokens for a corpus**
**Section 1.1: Train the tokenizer**

In [None]:
import tokenizers
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Byte-Pair Encoding model. The unknown-token string has to be one of the
# special tokens given to the trainer ("[UNK]"); with the previous "<unk>" the
# token never entered the vocabulary, so the model had no id to emit for
# symbols it had not seen during training.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

In [None]:
# BERT-style pre-tokenization is applied before the BPE merges are learned.
# Alternative kept for reference (byte-level BPE):
#   tokenizers.pre_tokenizers.ByteLevel()
bert_pre_tokenizer = tokenizers.pre_tokenizers.BertPreTokenizer()
tokenizer.pre_tokenizer = bert_pre_tokenizer

In [None]:
# Cap every encoded sequence at 512 tokens.
tokenizer.enable_truncation(512)

# strip_accents=False is essential for Devanagari. With the defaults,
# BertNormalizer removes combining marks, which deletes Hindi vowel signs,
# anusvara and virama (e.g. "चैनल" comes out as "चनल", as can be seen in the
# fill-mask outputs recorded at the end of this notebook).
# NOTE(review): a tokenizer that was already trained and saved with the old
# setting has to be retrained for this to take effect; any BertTokenizerFast
# wrapper around the saved file should be checked for the same option.
tokenizer.normalizer = tokenizers.normalizers.BertNormalizer(strip_accents=False)

In [None]:
from tokenizers.trainers import BpeTrainer

# BERT-style special tokens to include in the trained vocabulary.
# (RoBERTa-style alternative: "<s>", "<pad>", "</s>", "<unk>", "<mask>".)
bert_special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
trainer = BpeTrainer(special_tokens=bert_special_tokens)

In [None]:
# Plain-text corpus file(s) the tokenizer is trained on; the path is relative
# to the working directory set at the top of the notebook.
files = ["./Hindi_Input/Mstr_Hindi_Rev.txt"]

In [None]:
# Learn the BPE vocabulary and merges from the corpus.
# Reported run time on the Hindi data: about 55 s.
tokenizer.train(files=files, trainer=trainer)

In [None]:
# Writes the trained tokenizer to disk; Section 1.2 loads this same path.
# NOTE(review): left commented out, presumably so that a re-run does not
# overwrite an existing file. On a fresh setup it has to be run once before
# Section 1.2 can load anything.
# tokenizer.save("./Hindi_Output/tokenizer.json")

**Section 1.2: Load and test the tokenizer**


In [None]:
# Read the saved tokenizer back from disk into its own variable, independent
# of the in-memory `tokenizer` object.
tokenizer_json_path = "./Hindi_Output/tokenizer.json"
tokenizer_load = Tokenizer.from_file(tokenizer_json_path)

In [None]:
# Add BERT's special tokens around each encoded sequence ([CLS] ... [SEP]).
# BertProcessing takes each token as a (string, id) pair, so look the ids up
# in the loaded vocabulary first.
sep_id = tokenizer_load.token_to_id("[SEP]")
cls_id = tokenizer_load.token_to_id("[CLS]")
tokenizer_load.post_processor = tokenizers.processors.BertProcessing(
    sep=("[SEP]", sep_id),
    cls=("[CLS]", cls_id),
)

In [None]:
# Smoke test: encode one Hindi sentence and show the resulting tokens.
sample_sentence = "एस एक बांग्ला टीवी चैनल है"
output = tokenizer_load.encode(sample_sentence)
print(output.tokens)

In [None]:
# Show the token type ids of the sample sentence encoded in the previous cell.
print(output.type_ids)

### **Section 2: Train a masked LM using the tokenizer trained & saved in Section 1**

In [None]:
from transformers import BertConfig

# Architecture of the masked LM that is trained from scratch below.
config = BertConfig(
    # 30K matches the vocabulary size the tokenizer was trained with (its default).
    vocab_size=30_000,
    # Inputs are truncated/padded to 512 tokens later on, which fits within 514.
    # NOTE(review): 514 looks inherited from a RoBERTa recipe - confirm.
    max_position_embeddings=514,
    num_hidden_layers=6,
    num_attention_heads=12,
    type_vocab_size=2,
)

In [None]:
# Save config.json
# config.to_json_file('./Hindi_Output/config.json')

In [None]:
# from transformers import RobertaTokenizerFast
from transformers import BertTokenizerFast

In [None]:
import json

# BertTokenizerFast rebuilds the backend normalizer whenever its own
# `do_lower_case` / `strip_accents` arguments differ from what tokenizer.json
# stores. Forward the stored values so the wrapper normalizes text exactly the
# way the tokenizer was trained (this matters for Devanagari combining marks).
# When the file stores the library defaults, this is equivalent to calling
# from_pretrained("./Hindi_Output") without extra arguments.
with open("./Hindi_Output/tokenizer.json", encoding="utf-8") as tokenizer_file:
    normalizer_state = json.load(tokenizer_file).get("normalizer") or {}

tokenizer_new = BertTokenizerFast.from_pretrained(
    "./Hindi_Output",
    do_lower_case=normalizer_state.get("lowercase", True),
    strip_accents=normalizer_state.get("strip_accents"),
)

In [None]:
from transformers import BertForMaskedLM

# Build the masked-LM model directly from the config object defined above
# (constructed from a config, not loaded from a saved checkpoint).
model = BertForMaskedLM(config)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batches examples for masked-language-model training with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_new,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
# !mkdir ./shards
!split -a 40 -l 12600 -d "./Hindi_Input/Mstr_Hindi_Rev.txt" ./shards_hindi/shard_

In [None]:
import glob

# glob returns paths in arbitrary, filesystem-dependent order. Sort them so
# that `files[0]`, which a later cell trains on, is the same shard on every run.
files = sorted(glob.glob('./shards_hindi/*'))

In [None]:
from datasets import load_dataset

# Train on a single shard only.
# NOTE(review): the earlier comment here spoke of 256000 examples, but the
# split command writes 12600 lines per shard and the recorded training log
# reports 12601 examples.
first_shard = files[0]
dataset = load_dataset('text', data_files=first_shard, split='train')

In [None]:
# Quick look at the dataset loaded in the previous cell.
print(dataset)

In [None]:
def encode(examples):
  """Tokenize one batch of raw text lines for masked-LM training.

  `examples` is the batch handed over by `dataset.map(..., batched=True)`;
  its 'text' entry is tokenized with `tokenizer_new`, truncated and padded to
  a fixed length of 512 tokens. The tokenizer's output is returned unchanged.
  """
  batch_text = examples['text']
  return tokenizer_new(batch_text, max_length=512, padding='max_length', truncation=True)

# Run `encode` over the whole dataset in batches, then expose only the two
# columns selected below as torch tensors.
dataset = dataset.map(encode, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [None]:
from transformers import Trainer, TrainingArguments

# Run configuration: one epoch in small batches, output written to ./Hindi_Output.
training_args = TrainingArguments(
    output_dir="./Hindi_Output",
    overwrite_output_dir=True,
    per_device_train_batch_size=8,  # reduced from 64
    num_train_epochs=1,
    prediction_loss_only=True,
    save_steps=10_000,
    save_total_limit=2,
)

# Note: this rebinds `trainer`, which earlier in the notebook referred to the
# tokenizer's BpeTrainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
%%time
# Starts the actual training run; the recorded log in the next cell shows
# roughly 28 minutes of wall time for one epoch.
# NOTE(review): left commented out, presumably to avoid retraining by accident.
# trainer.train()

In [None]:
# The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: text.
# ***** Running training *****
#   Num examples = 12601
#   Num Epochs = 1
#   Instantaneous batch size per device = 8
#   Total train batch size (w. parallel, distributed & accumulation) = 8
#   Gradient Accumulation steps = 1
#   Total optimization steps = 1576

# [1576/1576 28:37, Epoch 1/1]
# Step 	Training Loss
# 500 	7.680700
# 1000 	7.104600
# 1500 	6.969000

# Training completed. Do not forget to share your model on huggingface.co/models =)

# CPU times: user 28min 31s, sys: 5.58 s, total: 28min 36s
# Wall time: 28min 39s

# TrainOutput(global_step=1576, training_loss=7.2410494421944405, metrics={'train_runtime': 1719.0714, 'train_samples_per_second': 7.33, 'train_steps_per_second': 0.917, 'total_flos': 1670382921203712.0, 'train_loss': 7.2410494421944405, 'epoch': 1.0})

In [None]:
# Saves the trained model to ./Hindi_Model, the directory Section 3 loads from.
# NOTE(review): left commented out; on a fresh setup it has to be run after
# training, otherwise Section 3 has nothing to load.
# trainer.save_model("./Hindi_Model")

### **Section 3: Load and test the trained masked LM**

In [None]:
from transformers import pipeline

In [None]:
# Build a fill-mask pipeline; model and tokenizer are both read from the same
# saved directory.
model_dir = "./Hindi_Model"
fill_mask = pipeline("fill-mask", model=model_dir, tokenizer=model_dir)

In [None]:
# Poor fit owing to the limited size of the training set (one shard, one epoch).
# The recorded sequences below also come back without several combining marks
# (e.g. "चैनल" is shown as "चनल").

fill_mask("एस एक बांग्ला [MASK] चैनल है")

# [{'score': 0.049333274364471436,
#   'sequence': 'एस एक बागला । चनल ह',
#   'token': 375,
#   'token_str': '।'},
#  {'score': 0.046125490218400955,
#   'sequence': 'एस एक बागला ह चनल ह',
#   'token': 363,
#   'token_str': 'ह'},
#  {'score': 0.03156042471528053,
#   'sequence': 'एस एक बागला म चनल ह',
#   'token': 354,
#   'token_str': 'म'},
#  {'score': 0.03022793121635914,
#   'sequence': 'एस एक बागला क चनल ह',
#   'token': 330,
#   'token_str': 'क'},
#  {'score': 0.026870885863900185,
#   'sequence': 'एस एक बागला, चनल ह',
#   'token': 16,
#   'token_str': ','}]

In [None]:
# Full reference sentence (not executed):
# fill_mask("भारत में प्रचलित कुछ अन्य प्राचीन संवत इस प्रकार है")
# Query actually run: a shortened variant with [MASK] in place of "कुछ".
fill_mask("भारत में प्रचलित [MASK] अन्य संवत इस प्रकार है")

# [{'score': 0.04326169565320015,
#   'sequence': 'भारत म परचलित ह अनय सवत इस परकार ह',
#   'token': 363,
#   'token_str': 'ह'},
#  {'score': 0.04239127039909363,
#   'sequence': 'भारत म परचलित म अनय सवत इस परकार ह',
#   'token': 354,
#   'token_str': 'म'},
#  {'score': 0.04220021516084671,
#   'sequence': 'भारत म परचलित । अनय सवत इस परकार ह',
#   'token': 375,
#   'token_str': '।'},
#  {'score': 0.038384582847356796,
#   'sequence': 'भारत म परचलित क अनय सवत इस परकार ह',
#   'token': 330,
#   'token_str': 'क'},
#  {'score': 0.02418365515768528,
#   'sequence': 'भारत म परचलित - अनय सवत इस परकार ह',
#   'token': 17,
#   'token_str': '-'}]