### Train a new GPT-2 model
- Based on the LABR book-review dataset.
- Later on, this pre-trained model is used to generate sentences.

In [16]:
import os
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer, GPT2LMHeadModel
from pathlib import Path
import os

In [17]:
class BPE_token(object):
    """Thin wrapper around a `tokenizers.Tokenizer` with a BPE model.

    The wrapped tokenizer applies NFKC normalization, byte-level
    pre-tokenization and a byte-level decoder.
    """

    def __init__(self):
        """Build the untrained tokenizer pipeline."""
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.normalizer = Sequence([
            NFKC()
        ])
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, paths):
        """Train the BPE model on the text files listed in `paths`.

        Args:
            paths: list of file paths (strings) to train on.
        """
        trainer = BpeTrainer(
            vocab_size=50_000,
            show_progress=True,
            # The keyword was previously misspelt ("inital_alphabet"), so the
            # byte-level alphabet never reached the trainer.
            initial_alphabet=ByteLevel.alphabet(),
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",
                "<mask>"
            ],
        )
        # Keyword arguments: the positional order of (files, trainer) is not
        # the same in every tokenizers release, the names are.
        self.tokenizer.train(files=paths, trainer=trainer)

    def save_tokenizer(self, location, prefix=None):
        """Write the trained model files into the directory `location`.

        Args:
            location: target directory; created (with parents) if missing.
            prefix: optional prefix forwarded to the model's `save` call.
        """
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(location, exist_ok=True)
        self.tokenizer.model.save(location, prefix)

In [18]:

# Collect every `negative_labr.txt` found (recursively) under ./data/.
# Sorted because glob order is not guaranteed and the file order should be
# the same on every run.
paths = sorted(str(x) for x in Path("./data/").glob("**/negative_labr.txt"))
if not paths:
    # Fail loudly instead of training a tokenizer on no input at all
    raise FileNotFoundError("no 'negative_labr.txt' found under ./data/")
tokenizer = BPE_token()
# train the tokenizer model
tokenizer.bpe_train(paths)
# save the trained tokenizer model files in our specified folder
save_path = 'tokenized_data'
tokenizer.save_tokenizer(save_path)

In [19]:
# Display the list of corpus files that matched the glob
paths

['data/negative_labr.txt']

In [20]:
# Load a GPT-2 tokenizer from the files written to `save_path`
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
# Register the special tokens under their tokenizer roles
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})
# Create the configuration from which the model is built.
# len(tokenizer) is used instead of tokenizer.vocab_size: vocab_size does not
# count tokens added via add_special_tokens, and an embedding smaller than the
# largest token id would fail on lookup.
config = GPT2Config(
  vocab_size=len(tokenizer),
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)
# Create the (randomly initialised) model from the configuration
model = TFGPT2LMHeadModel(config)

In [21]:
# Read each corpus file as UTF-8 and terminate it with the tokenizer's
# end-of-sequence token, then glue everything into one string.
corpus_parts = []
for corpus_path in paths:
    file_text = Path(corpus_path).read_text(encoding='utf-8')
    corpus_parts.append(file_text + tokenizer.eos_token)
single_string = ''.join(corpus_parts)

# Token ids for the whole corpus
string_tokenized = tokenizer.encode(single_string)

In [22]:
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000

# Cut the token stream into consecutive, non-overlapping windows of
# `block_size` ids; a trailing partial window is dropped.
examples = [
    string_tokenized[start:start + block_size]
    for start in range(0, len(string_tokenized) - block_size + 1, block_size)
]

# Next-token setup: labels are the inputs shifted left by one position
inputs = [window[:-1] for window in examples]
labels = [window[1:] for window in examples]

dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, labels))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)


In [23]:
# Optimizer: Adam with gradient-norm clipping
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5,
    epsilon=1e-08,
    clipnorm=1.0,
)
# Loss is computed on raw logits
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Metric reported during training
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# The loss list has one real loss followed by `n_layer` None entries
# (presumably one per additional model output — confirm against the
# transformers version in use).
model.compile(
    optimizer=optimizer,
    loss=[loss] + [None] * model.config.n_layer,
    metrics=[metric],
)

### Train Language Model

In [24]:
# Check that we have a GPU
# (`!` runs the command in a shell and shows its output in the cell)
!nvidia-smi

Wed Apr  7 18:17:34 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 207...  Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   56C    P0    27W /  N/A |   7704MiB /  7973MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [25]:
# Ask PyTorch whether a CUDA device is usable; the boolean result is the
# cell's displayed value.
import torch
torch.cuda.is_available()

True

In [26]:
# Count the GPU devices TensorFlow can see
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [27]:
# Number of full passes over `dataset`
num_epoch = 50
# Keep the returned History object so it can be inspected afterwards
history = model.fit(x=dataset, epochs=num_epoch)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50


Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [28]:
text = " كتاب وحش"
# Turn the prompt into a TensorFlow tensor of token ids
input_ids = tokenizer.encode(text, return_tensors='tf')
# Top-k sampling settings; five sequences are requested for the one prompt
sampling_options = dict(
    do_sample=True,
    max_length=50,
    min_length=10,
    top_k=40,
    num_return_sequences=5,
)
# NOTE: despite the variable name, this is sampling (do_sample=True)
beam_output = model.generate(input_ids, **sampling_options)

Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence


In [29]:
# Decode each generated id sequence back to text, one separator line after each
separator = '============================'
for generated_ids in beam_output:
    print(tokenizer.decode(generated_ids))
    print(separator)

 كتاب وحش. بجد :)"
"بعض المقالات تحمل رسائل صغيرة.ـيدة غريبة رومانسية تخنق للأبل هل ستجنبها الج نوبل على التلفاز لكني لم تستخذ من جسد الزمن. حاولت أنى أقرأ ماذا تمر بهذا الشكل لأني لم تكن بالقدر او فلسفة شديدة الا
 كتاب وحش. لكن بطريقة عام لم تكن بالمستوى المطلوب! كان مجموعة قصص قصيرة مفيدة. كُقل من الصحة كأن الكتابة قبل قبل القروليتي على العموم طبعاً جدًا مما يجعلها الأخطاء الكبيرة التي لا أستطيع أن أُقرأ في الحياة."
 كتاب وحش. جزء كبير ملوش اي شي جديد."
"قرأته في جلسة واحدة و لن تخرج منه لكن أحداثها ذكي نظرا بالغثيان في انتظار الصحف بتاعها لم استفد منه"
"في معظم الحالات العربيه ثم أتممت قراءة الكتاب."
"لم أجد فيها
 كتاب وحش.بس بجد :)"
"الكتاب يعيد قصص قصيرة ولكن معظمه قائم على الانترنت. قرأته فعلاً. لم يرق لي متعة القراءة لاحقا ل"
"اول مرة اكمله"
"بعض المقالات يتحدث عن مجموعة مقالات احمد حلمى بيحاول يبقى في طفولتي و
 كتاب وحش. لكن واقع تاريخ نصائح جديدة متشابهة للغاية"
"الكتاب صغير جداً, تحس مجهود واضح, بناء على عدد صفحات عالية, مجموعة مقالات ساخرة ربما كان السبب اللي يستحق القراءه م

### Saving the model

In [30]:
from transformers import WEIGHTS_NAME, CONFIG_NAME
import os
output_dir = './gpt-2-negative-reviews/'
# Create the directory (and any missing parents); no error if it already exists
os.makedirs(output_dir, exist_ok=True)
# If the model is wrapped in an object exposing `.module`, save the inner model
model_to_save = model.module if hasattr(model, 'module') else model
output_config_file = os.path.join(output_dir, CONFIG_NAME)
# Save the model and its configuration. The unwrapped model is used for both
# calls; previously the wrapper check was computed but `model` itself was saved.
model_to_save.save_pretrained(output_dir)
model_to_save.config.to_json_file(output_config_file)
# Save the tokenizer; the tuple of written files is the cell's displayed value
tokenizer.save_pretrained(output_dir)

('./gpt-2-negative-reviews/tokenizer_config.json',
 './gpt-2-negative-reviews/special_tokens_map.json',
 './gpt-2-negative-reviews/vocab.json',
 './gpt-2-negative-reviews/merges.txt',
 './gpt-2-negative-reviews/added_tokens.json')

### Load the model

In [31]:
# Reload both artefacts from `output_dir`, replacing the in-memory objects
model = TFGPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./gpt-2-negative-reviews/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


### Adapt the model to also work with PyTorch

In [32]:
# Load the TensorFlow checkpoint in `output_dir` into the PyTorch model class
# (from_tf=True), then save the PyTorch model into the same directory.
pt_model = GPT2LMHeadModel.from_pretrained(output_dir, from_tf=True)
pt_model.save_pretrained(output_dir)

All TF 2.0 model weights were used when initializing GPT2LMHeadModel.

Some weights of GPT2LMHeadModel were not initialized from the TF 2.0 model and are newly initialized: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.3.attn.bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.4.attn.bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.5.attn.bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.6.attn.bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.7.attn.bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.8.attn.bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.9.attn.bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'lm_head.weight']
You should probably TRAI

In [33]:
from transformers import AutoTokenizer, AutoModel

In [34]:
# Check that the saved directory loads through the Auto* classes.
# Neither result is bound to a name; the model loaded last is the cell's
# displayed value.
AutoTokenizer.from_pretrained("gpt-2-negative-reviews")
AutoModel.from_pretrained("gpt-2-negative-reviews")

GPT2Model(
  (wte): Embedding(50000, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0): Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (c_fc): Conv1D

In [28]:
# Display the History object returned by model.fit
history

<tensorflow.python.keras.callbacks.History at 0x7f853eeac1f0>