# Training GPT-2 model on Bangla books

Necessary imports for the project

In [2]:
import matplotlib.pyplot as plt
import os
from pathlib import Path
import pickle
import random
import tensorflow as tf
from transformers import AutoTokenizer, TFGPT2LMHeadModel
from transformers import WEIGHTS_NAME, CONFIG_NAME
from tqdm import tqdm

## Preparing dataset
Initialize the project root, data, and model directories. The `processed_data` directory contains only selected books by renowned authors.

In [3]:
# get data and model directories
# A notebook has no `__file__`, so the project root is taken to be the parent of
# the current working directory (symlinks resolved). This assumes the notebook
# is run from a direct sub-folder of the project root.
project_root = Path.cwd().resolve().parent
data_dir = project_root / 'processed_data/'
model_dir = project_root / 'models/bn_gpt2'
os.makedirs(model_dir, exist_ok=True) # Create if does not exist

Recursively list every `.txt` file under the data directory.

In [4]:
# Recursively collect every .txt file under data_dir as a string path.
# Sorted so the list order does not depend on the order in which the
# filesystem happens to enumerate directory entries.
filenames = sorted(str(file) for file in Path(data_dir).glob('**/*.txt'))

Then we randomly shuffle the filenames list and select 100 files.

In [5]:
# Shuffle with a dedicated, fixed-seed generator so the same shuffle (and hence
# the same selection and train/test split) is produced on every fresh run,
# without touching the global `random` state.
SEED = 42
NUM_FILES = 100
random.Random(SEED).shuffle(filenames)
# Keep at most NUM_FILES files; slicing a shorter list simply keeps all of it.
filenames = filenames[:NUM_FILES]

Split the files for training and testing. We are keeping 20% for testing and 80% for training.

In [6]:
# Fraction of the selected files that is held out for testing.
train_test_split = 0.2
# NOTE: despite its name, `train_size` is the number of files held out for
# testing; the name is kept so existing references keep working.
train_size = int(len(filenames) * train_test_split)
# Split on an explicit index instead of negative slicing: when `train_size` is 0,
# `filenames[:-0]` is empty and `filenames[-0:]` is the whole list, which would
# silently put every file into the test set.
split_index = len(filenames) - train_size
training_filenames = filenames[:split_index]
test_filenames = filenames[split_index:]

In [7]:
def create_dataset(tokenizer, files, max_sequence_length, batch_size=12, buffer_size=1000):
    """Build a shuffled, batched next-token-prediction dataset from text files.

    Every file is read line by line. Each line is tokenized on its own and cut
    into consecutive, non-overlapping chunks of exactly ``max_sequence_length``
    tokens. Tokens left over after the last full chunk are dropped, which means
    a line shorter than ``max_sequence_length`` tokens contributes nothing.
    For each chunk the input is the chunk without its last token and the label
    is the chunk without its first token.

    Args:
        tokenizer: Object with an ``encode(str)`` method returning a sequence
            of token ids.
        files: Iterable of paths to text files.
        max_sequence_length: Number of tokens per chunk; inputs and labels are
            each ``max_sequence_length - 1`` tokens long.
        batch_size: Number of examples per batch. Incomplete final batches are
            dropped.
        buffer_size: Size of the shuffle buffer.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, labels)`` batches.
    """
    inputs = []
    labels = []

    for file in tqdm(files):
        # Explicit encoding so reading does not depend on the platform default
        # (assumes the corpus is stored as UTF-8 -- TODO confirm).
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                token_ids = tokenizer.encode(line)
                # Last index at which a full-length chunk can still start.
                last_start = len(token_ids) - max_sequence_length
                for start in range(0, last_start + 1, max_sequence_length):
                    chunk = token_ids[start:start + max_sequence_length]
                    inputs.append(chunk[:-1])
                    labels.append(chunk[1:])

    dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
    return dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

Create a tokenizer from pretrained model and add special tokens for padding, beginning, and end of sentence.

In [8]:
# Special tokens registered on top of the pretrained vocabulary.
SPECIAL_TOKENS = {
    'pad_token': '<pad>',
    'bos_token': '<s>',
    'eos_token': '</s>',
}
tokenizer = AutoTokenizer.from_pretrained("flax-community/gpt2-bengali")
# The call's return value is left as the last expression so the cell displays it.
tokenizer.add_special_tokens(SPECIAL_TOKENS)

3

Define max sequence length

In [9]:
# Number of tokens in each chunk that create_dataset cuts from a tokenized line;
# the input and label sequences built from a chunk are each one token shorter.
MAX_SEQUENCE_LENGTH = 200

Now we create `train_dataset` from the training files and `test_dataset` from the test files. Each is a shuffled, batched `tf.data.Dataset` of `(inputs, labels)` pairs, where the labels are the inputs shifted by one token.

In [11]:
# Build both splits with the same tokenizer and chunk length; the training
# split is processed first, then the test split.
train_dataset, test_dataset = (
    create_dataset(tokenizer, split_files, MAX_SEQUENCE_LENGTH)
    for split_files in (training_filenames, test_filenames)
)

100%|███████████████████████████████████████████| 80/80 [00:33<00:00,  2.40it/s]
100%|███████████████████████████████████████████| 20/20 [00:05<00:00,  3.83it/s]


## Creating Model

Load pretrained GPT2 model

In [14]:
model = TFGPT2LMHeadModel.from_pretrained('flax-community/gpt2-bengali', from_pt=True)
# Special tokens were added to the tokenizer after this checkpoint was trained,
# so the tokenizer may know more ids than the model has embedding rows for.
# Grow the embedding matrix so those ids are valid; never shrink it.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))
# create model parameters
# NOTE(review): 0.001 is kept from the original run; it looks high for
# fine-tuning a pretrained model -- confirm this is intended.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
# defining our loss function (the model outputs raw logits, hence from_logits=True)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# The loss is applied to the first output only; the remaining `n_layer` outputs
# get no loss. The metric list is not restricted per output, which is why the
# training log also reports `past_key_values_*_accuracy` entries.
model.compile(loss=[loss, *[None] * model.config.n_layer], optimizer=adam, metrics=[metric])

2022-04-01 22:52:39.365952: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.3.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'lm_head.weight', 'transformer.h.0.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.11.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are init

In [15]:
# Callback that saves to `model_dir` only when the monitored training loss
# reaches a new minimum, logging a message each time it does.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=str(model_dir),
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Train for 3 epochs. The held-out split is passed as validation data so each
# epoch also reports loss/accuracy on files the model was not trained on;
# without it `test_dataset` is built but never used.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3,
    callbacks=[checkpoint],
)

Epoch 1/3
 119/2563 [>.............................] - ETA: 25:59 - loss: 1.6319 - logits_loss: 1.6319 - logits_accuracy: 0.5983 - past_key_values_1_accuracy: 1.3240e-04 - past_key_values_2_accuracy: 1.1099e-04 - past_key_values_3_accuracy: 2.0381e-05 - past_key_values_4_accuracy: 2.7419e-04 - past_key_values_5_accuracy: 1.4384e-04 - past_key_values_6_accuracy: 1.7859e-04 - past_key_values_7_accuracy: 1.1231e-04 - past_key_values_8_accuracy: 1.0528e-04 - past_key_values_9_accuracy: 1.0293e-04 - past_key_values_10_accuracy: 3.2976e-04 - past_key_values_11_accuracy: 7.7271e-05 - past_key_values_12_accuracy: 1.1422e-04

In [None]:
# Persist only the per-epoch metrics dict. The `History` callback object itself
# keeps a reference to the model, so pickling it would try to serialize the
# whole model along with the metrics.
with open(str(model_dir / 'history'), 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

In [None]:
# Use the wrapped model when `model` exposes one through a `module` attribute,
# otherwise use `model` itself.
model_to_save = model.module if hasattr(model, 'module') else model
# Not used below; kept so the name stays defined for any later cell.
output_model_file = os.path.join(model_dir, WEIGHTS_NAME)
output_config_file = os.path.join(model_dir, CONFIG_NAME)
# save model and model configs
# Both calls go through `model_to_save` so the weights and the config always
# come from the same object.
model_to_save.save_pretrained(model_dir)
model_to_save.config.to_json_file(output_config_file)
# save tokenizer
tokenizer.save_pretrained(model_dir)