# Tokenizers
  Tokenizers encode string inputs into model inputs.
  
  This covers tokenizing (splitting strings into sub-word token strings), converting token strings to ids and back, and encoding/decoding (i.e., tokenizing and converting to integers).

In [None]:
!pip install transformers tokenizers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 14.1 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 53.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 59.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 70.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing

# Bangla lyrics GPT-2: data preparation, tokenizer training, and generation

In [None]:

import tensorflow as tf
from gensim.corpora import WikiCorpus
import os
import argparse

# lang = 'bn'

def tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list:
    """Split ``text`` on whitespace and keep tokens whose length is within bounds.

    The signature matches the ``tokenizer_func`` callback shape used by
    gensim's ``WikiCorpus`` (presumably why it is defined here — confirm
    against the caller).

    Args:
        text: Raw text to tokenize.
        token_min_len: Minimum token length (inclusive), in characters.
        token_max_len: Maximum token length (inclusive), in characters.
        lower: When true, lowercase the text before splitting and filtering.

    Returns:
        The whitespace-separated tokens that satisfy the length bounds,
        in their original order.
    """
    # Honour the `lower` flag instead of silently dropping it. Scripts without
    # letter case (such as Bangla) are unaffected by this.
    if lower:
        text = text.lower()
    return [token for token in text.split() if token_min_len <= len(token) <= token_max_len]

In [None]:
import os

import pandas as pd

# Load the lyrics dataset and pick the column that holds the song text.
banglaLyricsCSV = pd.read_csv('/content/BanglaSongLyrics.csv')

banglaLyrics = banglaLyricsCSV['lyrics']

# Make sure the target directory exists before writing one file per song.
os.makedirs('/content/lyrics/', exist_ok=True)

for i, lyrics in enumerate(banglaLyrics):
  # Files are numbered from 1. UTF-8 is set explicitly so the Bangla text does
  # not depend on the platform default encoding; `with` closes the handle even
  # if a write fails.
  with open('/content/lyrics/' + str(i + 1) + '.txt', 'w', encoding='utf-8') as file:
    for line in lyrics.split('\n'):
      file.write(line)
      file.write("\n")

In [None]:
import os
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

class BPE_token(object):
    """Thin wrapper around a byte-level BPE ``Tokenizer``.

    The wrapped tokenizer applies NFKC normalization, byte-level
    pre-tokenization and a byte-level decoder. It can be trained on text files
    and its model files written to a directory.
    """

    def __init__(self):
        """Build an untrained BPE tokenizer with NFKC + byte-level processing."""
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([
            NFKC()
        ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths, vocab_size=50000):
        """Train the tokenizer on the given text files.

        Args:
            paths: List of file paths to train on.
            vocab_size: Target vocabulary size passed to the trainer
                (defaults to 50000).
        """
        trainer = BpeTrainer(
            vocab_size=vocab_size,
            show_progress=True,
            # The keyword must be spelled `initial_alphabet`; it seeds the
            # vocabulary with the byte-level alphabet.
            initial_alphabet=ByteLevel.alphabet(),
            # Keep this order stable: presumably ids are assigned in list
            # order (the generation log reports `</s>` as id 2) — verify
            # before reordering.
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",
                "<mask>"
            ],
        )
        self.tokenizer.train(paths, trainer)

    def save_tokenizer(self, location, prefix=None):
        """Write the trained model files into ``location``.

        Args:
            location: Target directory; created (including parents) if missing.
            prefix: Optional filename prefix forwarded to the model's ``save``.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(location, exist_ok=True)
        self.tokenizer.model.save(location, prefix)



In [None]:
from pathlib import Path
import os

# Collect every lyrics file. glob() yields entries in arbitrary order, so sort
# to make `paths` (and the training corpus later built from it) reproducible
# across runs.
paths = sorted(str(x) for x in Path("/content/lyrics/").glob("**/*.txt"))

# Train the byte-level BPE tokenizer on the lyrics and write its model files
# into `save_path`.
tokenizer = BPE_token()
tokenizer.bpe_train(paths)
save_path = 'tokenized_data'
tokenizer.save_tokenizer(save_path)

In [None]:
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer

# Load the trained BPE files into a GPT-2 tokenizer and register the special
# tokens under their roles.
tokenizer = GPT2Tokenizer(vocab_file = save_path + '/vocab.json', merges_file = save_path + '/merges.txt')
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

config = GPT2Config(
  # len(tokenizer) counts the base vocabulary plus any added tokens, whereas
  # tokenizer.vocab_size covers the base vocabulary only. The embedding matrix
  # must cover every id the tokenizer can emit.
  vocab_size=len(tokenizer),
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)

# Freshly initialised (untrained) GPT-2 language model with the config above.
model = TFGPT2LMHeadModel(config)

In [None]:
# Read every lyrics file, terminate each song with the tokenizer's EOS token,
# and encode the concatenation as one long id sequence.
song_texts = []
for filename in paths:
  with open(filename, "r", encoding='utf-8') as f:
    song_texts.append(f.read())
single_string = ''.join(song + tokenizer.eos_token for song in song_texts)
string_tokenized = tokenizer.encode(single_string)

In [None]:
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000

# Cut the token stream into consecutive, non-overlapping blocks of
# `block_size` ids; a trailing partial block is dropped.
examples = [
  string_tokenized[start:start + block_size]
  for start in range(0, len(string_tokenized) - block_size + 1, block_size)
]

# Next-token prediction pairs: the labels are the inputs shifted left by one.
inputs = [block[:-1] for block in examples]
labels = [block[1:] for block in examples]

dataset = (
  tf.data.Dataset.from_tensor_slices((inputs, labels))
  .shuffle(BUFFER_SIZE)
  .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:

# Optimizer, loss and metric used for training.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# One loss entry per model output: cross-entropy for the first output, None
# for `n_layer` further entries (presumably the per-layer outputs of this
# transformers version — TODO confirm).
output_losses = [loss] + [None] * model.config.n_layer

model.compile(optimizer=optimizer, loss=output_losses, metrics=[metric])

In [None]:
# Train on the shuffled, batched dataset for `num_epoch` epochs; the value
# returned by fit() is kept in `history`.
num_epoch = 20
history = model.fit(dataset, epochs=num_epoch)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Prompt the model with a short Bangla seed and print the first beam-search
# continuation.
text = "শুভক্ষণে "

input_ids = tokenizer.encode(text, return_tensors='tf')

generation_kwargs = dict(
  max_length=100,  #previously 50
  num_beams=5,
  temperature=0.8,
  no_repeat_ngram_size=2,
  num_return_sequences=5,
)
beam_output = model.generate(input_ids, **generation_kwargs)

# Five sequences are returned; only the first one is decoded and shown.
print(tokenizer.decode(beam_output[0]))

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


শুভক্ষণে  শুধু দূরে থাকো
তুমি যে আমার হৃদয় জুড়ে
স্বপ্নের ভেঙ্গে দিনগুলো সত্যি হলো আজ
দুজনে এই মনে হয়
কি জানি কি ভাবি তাই বুঝি না
জ


In [None]:
#########all_songs########
# Generate one song per starting phrase in the results sheet and write each
# to its own text file. Requires Google Drive to be mounted at /content/drive
# before this cell runs.
import os

col_list = ["Starting", "Self -BLEU(GPT-2)","SELF_BLEU(LSTM)"]
df = pd.read_csv("/content/drive/MyDrive/ML Project/result - Sheet1.csv", usecols=col_list)

# Make sure the destination folder exists before the loop starts writing.
os.makedirs("/content/drive/MyDrive/ML Project/GPT-2_outputs", exist_ok=True)

for j, text in enumerate(df["Starting"]):
    input_ids = tokenizer.encode(text, return_tensors='tf')

    beam_output = model.generate(
      input_ids,
      max_length = 100, #previously 50
      num_beams = 5,
      temperature = 0.8,
      no_repeat_ngram_size=2,
      num_return_sequences=5
    )

    # Only the first returned sequence is kept.
    output_txt = tokenizer.decode(beam_output[0])
    path = "/content/drive/MyDrive/ML Project/GPT-2_outputs/song" + str(j) + ".txt"
    # Explicit UTF-8 so the Bangla output does not depend on the platform
    # default; `with` closes the file even if the write fails.
    with open(path, "w", encoding="utf-8") as f:
        f.write(output_txt)

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 2 (first `eos_token_id`) to generate s

In [None]:
from transformers import WEIGHTS_NAME, CONFIG_NAME
import os
# Persist the trained model, its config and the tokenizer into `output_dir`.
output_dir = './model_bn_custom/'

# makedirs(exist_ok=True) also handles nested paths and an already-existing
# directory.
os.makedirs(output_dir, exist_ok=True)

# Unwrap a `.module` attribute if one is present; otherwise use the model
# itself.
model_to_save = model.module if hasattr(model, 'module') else model
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)  # not used further in this cell
output_config_file = os.path.join(output_dir, CONFIG_NAME)

# Save weights and config from the same (unwrapped) object.
model_to_save.save_pretrained(output_dir)
model_to_save.config.to_json_file(output_config_file)

# Kept as the last expression so the cell displays the saved tokenizer paths.
tokenizer.save_pretrained(output_dir)


('./model_bn_custom/tokenizer_config.json',
 './model_bn_custom/special_tokens_map.json',
 './model_bn_custom/vocab.json',
 './model_bn_custom/merges.txt',
 './model_bn_custom/added_tokens.json')

In [None]:
# Reload the tokenizer and model from the files saved in `output_dir`; this
# rebinds the in-memory `tokenizer` and `model` to the reloaded objects.
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = TFGPT2LMHeadModel.from_pretrained(output_dir)


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./model_bn_custom/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# Mount Google Drive at /content/drive so that paths under
# /content/drive/MyDrive used elsewhere in this notebook are available.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the saved model directory into the "ML Project" folder on the mounted Drive.
!cp -r "/content/model_bn_custom" "/content/drive/MyDrive/ML Project"