# Introduction

In [None]:
!pip install torch
!pip install tokenizers
!pip install transformers



In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Raw text corpus on Google Drive; it is passed to tokenizer training below.
path = '/content/drive/MyDrive/CSE676DeepLearning/raw_bangla_for_BERT.txt'

In [None]:
from tokenizers import BertWordPieceTokenizer

# Start from an untrained WordPiece tokenizer with the library defaults.
tokenizer = BertWordPieceTokenizer()

# Learn the vocabulary from the corpus file at `path`.
tokenizer.train(
    files=path,
    vocab_size=50_000,
    min_frequency=2,
    limit_alphabet=1000,
    wordpieces_prefix='##',
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
)






In [None]:
# Train the WordPiece tokenizer whose vocabulary is saved in the next cell.
import tokenizers

# Corpus the vocabulary is learned from.
path = "/content/drive/MyDrive/CSE676DeepLearning/raw_bangla_for_BERT.txt"

bertToken = tokenizers.BertWordPieceTokenizer()
bertToken.train(files=path, vocab_size=50000, min_frequency=3, limit_alphabet=1000)






In [None]:
# Write vocab.txt into the Drive folder that the tokenizer-loading cell reads
# it from ('/content/drive/MyDrive/CSE676DeepLearning/vocab.txt'), so a fresh
# run loads the vocabulary that was just trained.
bertToken.save_model('/content/drive/MyDrive/CSE676DeepLearning')

['/home/shruti/vocab.txt']

In [None]:
# Load the tokenizer
# Load the tokenizer
from transformers import BertTokenizer, LineByLineTextDataset

# Vocabulary file produced by the tokenizer training step.
file_dir = '/content/drive/MyDrive/CSE676DeepLearning/vocab.txt'

tokenizer = BertTokenizer.from_pretrained(file_dir)

# Sanity check: tokenize one sentence and inspect the resulting word pieces.
# NOTE(review): the recorded output of this cell shows Bengali combining signs
# (e.g. hasanta, chandrabindu) missing from the tokens. This looks like the
# default lowercase/accent-stripping normalization; confirm whether that is
# intended. Changing it requires retraining the vocabulary with the same
# normalization settings.
sentence = 'শেষ দিকে সেনাবাহিনীর সদস্যরা এসব ঘর তাঁর প্রশাসনের কাছে হস্তান্তর করেন'

enc_input = tokenizer.tokenize(sentence)
print(enc_input)

['শেষ', 'দিকে', 'সেনাবাহিনীর', 'সদসযরা', 'এসব', 'ঘর', 'তার', 'পরশাসনের', 'কাছে', 'হসতানতর', 'করেন']




In [None]:
%time

dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = '/content/drive/MyDrive/CSE676DeepLearning/raw_bangla_for_BERT.txt',
    block_size = 128  # maximum sequence length
)

print('No. of lines: ', len(dataset)) # No of lines in your datset

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10.7 µs




No. of lines:  2172033


In [None]:
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling

# Collator that prepares masked-language-model batches (mlm_probability=0.15).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Six-layer BERT with a 50k-entry embedding table; the model is created from
# this config with freshly initialised weights (nothing is loaded from disk).
config = BertConfig(
    vocab_size=50000,
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=512,
)
model = BertForMaskedLM(config)

print('No of parameters: ', model.num_parameters())

No of parameters:  81965648


In [None]:
from transformers import Trainer, TrainingArguments

# One epoch, batch size 32 per device; a checkpoint is written every 10,000
# steps and only the two most recent checkpoints are kept.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/CSE676DeepLearning/',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

# Tie together the BERT model, the MLM collator and the line-by-line dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
%%time
# Run the masked-language-model training defined by `trainer`; the cell magic
# reports how long the full run takes.
trainer.train()

***** Running training *****
  Num examples = 2172033
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 67877
  Number of trainable parameters = 81965648


Step,Training Loss
500,9.2641
1000,8.7892
1500,8.4933
2000,8.3559
2500,8.1902
3000,8.0838
3500,7.9815
4000,7.9315
4500,7.8638
5000,7.7497


Saving model checkpoint to /home/shruti/checkpoint-10000
Configuration saved in /home/shruti/checkpoint-10000/config.json
Model weights saved in /home/shruti/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to /home/shruti/checkpoint-20000
Configuration saved in /home/shruti/checkpoint-20000/config.json
Model weights saved in /home/shruti/checkpoint-20000/pytorch_model.bin
Deleting older checkpoint [/home/shruti/checkpoint-60000] due to args.save_total_limit
Saving model checkpoint to /home/shruti/checkpoint-30000
Configuration saved in /home/shruti/checkpoint-30000/config.json
Model weights saved in /home/shruti/checkpoint-30000/pytorch_model.bin
Deleting older checkpoint [/home/shruti/checkpoint-10000] due to args.save_total_limit
Saving model checkpoint to /home/shruti/checkpoint-40000
Configuration saved in /home/shruti/checkpoint-40000/config.json
Model weights saved in /home/shruti/checkpoint-40000/pytorch_model.bin
Deleting older checkpoint [/home/shruti/checkpoint-200

CPU times: user 1h 3min 29s, sys: 5.3 s, total: 1h 3min 34s
Wall time: 1h 3min 38s


TrainOutput(global_step=67877, training_loss=5.9554953764599015, metrics={'train_runtime': 3818.6179, 'train_samples_per_second': 568.801, 'train_steps_per_second': 17.775, 'total_flos': 2.495318058611789e+16, 'train_loss': 5.9554953764599015, 'epoch': 1.0})

In [None]:
# Save the trained BERT model to this directory; the pipeline cell below
# reloads it from the same path.
trainer.save_model('/home/shruti/')

Saving model checkpoint to /home/shruti/
Configuration saved in /home/shruti/config.json
Model weights saved in /home/shruti/pytorch_model.bin


In [None]:
from transformers import pipeline

# Reload the BERT weights saved by `trainer.save_model` and wrap them, together
# with the WordPiece tokenizer, in a fill-mask pipeline.
model = BertForMaskedLM.from_pretrained('/home/shruti/')

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

loading configuration file /home/shruti/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}

loading weights file /home/shruti/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at /home/shruti/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForMaske

In [None]:
# Ask the BERT model for the most likely fillers of [MASK]. The sentence is
# roughly: "Police recovered the body and sent it to the Cox's Bazar [MASK]
# morgue for autopsy."
fill_mask('লাশ উদ্ধার করে ময়নাতদন্তের জন্য কক্সবাজার [MASK] মর্গে পাঠিয়েছে পুলিশ')

[{'score': 0.277895450592041,
  'token': 3119,
  'token_str': 'হাসপাতাল',
  'sequence': 'লাশ উদধার করে মযনাতদনতের জনয ককসবাজার হাসপাতাল মরগে পাঠিযেছে পলিশ'},
 {'score': 0.20703396201133728,
  'token': 1408,
  'token_str': 'সদর',
  'sequence': 'লাশ উদধার করে মযনাতদনতের জনয ককসবাজার সদর মরগে পাঠিযেছে পলিশ'},
 {'score': 0.19087183475494385,
  'token': 1894,
  'token_str': 'মেডিকেল',
  'sequence': 'লাশ উদধার করে মযনাতদনতের জনয ককসবাজার মেডিকেল মরগে পাঠিযেছে পলিশ'},
 {'score': 0.054660357534885406,
  'token': 3740,
  'token_str': 'হাসপাতালের',
  'sequence': 'লাশ উদধার করে মযনাতদনতের জনয ককসবাজার হাসপাতালের মরগে পাঠিযেছে পলিশ'},
 {'score': 0.03167051821947098,
  'token': 2340,
  'token_str': 'জেনারেল',
  'sequence': 'লাশ উদধার করে মযনাতদনতের জনয ককসবাজার জেনারেল মরগে পাঠিযেছে পলিশ'}]

In [None]:
# Second probe; roughly: "In 1971 Bangladesh fought a nine-month liberation
# war and achieved [MASK]."
fill_mask('১৯৭১ সালে বাংলাদেশ ৯ মাস মুক্তিযুদ্ধ করে [MASK] অর্জন করে')

[{'score': 0.08184559643268585,
  'token': 1853,
  'token_str': 'পরসকার',
  'sequence': '১৯৭১ সালে বাংলাদেশ ৯ মাস মকতিযদধ করে পরসকার অরজন করে'},
 {'score': 0.036910369992256165,
  'token': 2587,
  'token_str': 'সবাধীনতা',
  'sequence': '১৯৭১ সালে বাংলাদেশ ৯ মাস মকতিযদধ করে সবাধীনতা অরজন করে'},
 {'score': 0.022689618170261383,
  'token': 787,
  'token_str': 'মকতি',
  'sequence': '১৯৭১ সালে বাংলাদেশ ৯ মাস মকতিযদধ করে মকতি অরজন করে'},
 {'score': 0.02130352146923542,
  'token': 316,
  'token_str': 'বাংলাদেশ',
  'sequence': '১৯৭১ সালে বাংলাদেশ ৯ মাস মকতিযদধ করে বাংলাদেশ অরজন করে'},
 {'score': 0.019313856959342957,
  'token': 303,
  'token_str': 'দেশ',
  'sequence': '১৯৭১ সালে বাংলাদেশ ৯ মাস মকতিযদধ করে দেশ অরজন করে'}]

# ALBERT

In [None]:
!pip install sentencepiece



In [None]:
#from sentencepiece import sentencepiece_pb2
import sentencepiece as spm
from transformers import AlbertConfig
from transformers import LineByLineTextDataset

In [None]:
#Download sentencepiece model
!wget "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model"
!wget "https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_pb2.py"
!wget https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb
!wget "https://huggingface.co/albert-base-v2/raw/main/config.json"

--2022-11-30 22:31:27--  https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.137.88, 52.217.192.200, 52.217.205.32, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.137.88|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 760289 (742K) [binary/octet-stream]
Saving to: ‘albert-base-v2-spiece.model.3’


2022-11-30 22:31:27 (5.88 MB/s) - ‘albert-base-v2-spiece.model.3’ saved [760289/760289]

--2022-11-30 22:31:27--  https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_pb2.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8128 (7.9K) [text/plain]
Saving to: ‘sentencepiece_pb2.py.3’


2022-11-30 2

In [None]:
# Locations of the ALBERT assets, relative to the working directory.
spiece_model_path = 'albert-base-v2-spiece.model'  # SentencePiece model file
albert_config_path = 'config.json'  # model configuration file
albert_model_path = 'albert-base-v2'  # model name

In [None]:
# Create a SentencePiece processor and load the model file into it.
sp = spm.SentencePieceProcessor()
sp.load(spiece_model_path)

True

In [None]:
!pip install transformers



In [None]:
#Load and check tokenizer
#import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForMaskedLM

#tokenizer = AlbertTokenizer()

# Build the ALBERT tokenizer directly from the SentencePiece model file.
# NOTE(review): this is the pretrained albert-base-v2 SentencePiece model and
# the check below uses English text only -- confirm its vocabulary covers the
# Bengali corpus it is later applied to.
albert_tokenizer = AlbertTokenizer.from_pretrained(spiece_model_path, do_lower_case=True)
albert_tokenizer.tokenize("Test tokenizer")

['▁test', '▁to', 'ken', 'izer']

In [None]:
# Cross-check: the raw SentencePiece processor on the lower-cased text should
# give the same pieces as the ALBERT tokenizer in the previous cell.
sp.encode_as_pieces("Test tokenizer".lower())

['▁test', '▁to', 'ken', 'izer']

In [None]:
class OffsetTokenizer():
    """SentencePiece wrapper that returns piece ids together with their offsets.

    Offsets are the ``begin``/``end`` fields of each piece in the serialized
    ``SentencePieceText`` proto that SentencePiece produces for the input text.
    """

    def __init__(self, path_model=spiece_model_path):
        """Load the SentencePiece model stored at ``path_model``."""
        # The proto module is not imported anywhere else in the notebook, so
        # resolve it here: prefer the copy bundled with the sentencepiece
        # package and fall back to the standalone sentencepiece_pb2.py file
        # fetched by the download cell.
        try:
            from sentencepiece import sentencepiece_pb2
        except ImportError:
            import sentencepiece_pb2

        self.spt = sentencepiece_pb2.SentencePieceText()
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(path_model)

    def encode(self, text, return_tokens=False, lower=True):
        """Encode ``text`` into piece ids and ``(begin, end)`` offsets.

        Args:
            text: string to encode.
            return_tokens: when true, also return the string pieces.
            lower: when true, lower-case ``text`` before encoding.

        Returns:
            ``(ids, offset)``, or ``(tokens, ids, offset)`` when
            ``return_tokens`` is true.
        """
        if lower:
            text = text.lower()

        # Parse the serialized proto so that each piece exposes id/begin/end.
        self.spt.ParseFromString(self.sp.encode_as_serialized_proto(text))
        ids = [piece.id for piece in self.spt.pieces]
        offset = [(piece.begin, piece.end) for piece in self.spt.pieces]

        if return_tokens:
            # Use this instance's processor, not the notebook-global `sp`,
            # so the pieces always come from the model given to __init__.
            return self.sp.encode_as_pieces(text), ids, offset
        return ids, offset

In [None]:
#o_tokenizer = OffsetTokenizer()
#o_tokenizer.encode("Test tokenizer", return_tokens=False)

In [None]:
# Load the ALBERT configuration by model id rather than from the local
# 'config.json': the warning recorded for this cell ("model of type bert")
# shows that the local file held a BERT config, not the ALBERT one.
albert_config = AlbertConfig.from_pretrained(albert_model_path)
# Request the hidden states of all layers in the model outputs.
albert_config.output_hidden_states = True

You are using a model of type bert to instantiate a model of type albert. This is not supported for all configurations of models and can yield errors.


In [None]:
%time

aldataset = LineByLineTextDataset(
    tokenizer = albert_tokenizer,
    file_path = path,
    block_size = 64
)

print('No. of lines: ', len(aldataset)) # No of lines in your datset

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs
No. of lines:  2172033


In [None]:
from transformers import AlbertConfig, AlbertForMaskedLM, DataCollatorForLanguageModeling

# Collator that prepares masked-language-model batches (mlm_probability=0.15)
# using the ALBERT tokenizer.
aldata_collator = DataCollatorForLanguageModeling(
    tokenizer=albert_tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# ALBERT built from the library-default configuration (no arguments given),
# with freshly initialised weights.
alconfig = AlbertConfig()
almodel = AlbertForMaskedLM(alconfig)

print('No of parameters: ', almodel.num_parameters())

No of parameters:  206368944


In [None]:
from transformers import Trainer, TrainingArguments

# Same schedule as the BERT run: one epoch, batch size 32 per device, a
# checkpoint every 10,000 steps, at most two checkpoints kept.
training_args = TrainingArguments(
    output_dir='/home/shruti/',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

# Trainer for the ALBERT model, its MLM collator and its dataset.
altrainer = Trainer(
    model=almodel,
    args=training_args,
    train_dataset=aldataset,
    data_collator=aldata_collator,
)

In [None]:
%%time
# Run the ALBERT masked-language-model training; the cell magic reports the
# total run time.
altrainer.train()

***** Running training *****
  Num examples = 2172033
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 67877
  Number of trainable parameters = 206368944


Step,Training Loss
500,5.9987
1000,0.8764


In [None]:
# Save the trained ALBERT model to Drive. This needs `altrainer` from the
# training cells to still be in the kernel; the recorded NameError shows the
# cell was run without it.
altrainer.save_model('/content/drive/MyDrive/CSE676DeepLearning/')

NameError: name 'altrainer' is not defined

In [None]:
from transformers import pipeline

# Reload the saved ALBERT weights and give that reloaded model to the pipeline.
# Binding it to `almodel` (instead of `model`) leaves the BERT `model` variable
# intact and lets this cell run after a kernel restart, when the in-memory
# `almodel` from training no longer exists.
almodel = AlbertForMaskedLM.from_pretrained('/content/drive/MyDrive/CSE676DeepLearning/')

fill_mask2 = pipeline(
    "fill-mask",
    model=almodel,
    tokenizer=albert_tokenizer
)

In [None]:
# Same probe sentence as the first BERT example, now through the ALBERT pipeline.
fill_mask2('লাশ উদ্ধার করে ময়নাতদন্তের জন্য কক্সবাজার [MASK] মর্গে পাঠিয়েছে পুলিশ')