In [None]:
# Notebook display setting: echo the value of every top-level expression in a
# cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

In [None]:
import tensorflow as tf
import torch

# We use the PyTorch version of the GPT-2 pre-trained models from Hugging Face
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

## Load the GPT-2 model

In [None]:
# Base checkpoint that the fine-tuning starts from.
pretrained_model_name = 'gpt2'

# Configuration, language-modelling weights and tokenizer all come from the
# same checkpoint name so that they stay consistent with each other.
config = GPT2Config.from_pretrained(pretrained_model_name)
model = GPT2LMHeadModel.from_pretrained(
    pretrained_model_name,
    config=config,
)
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Prepare the dataset

The `TextDataset` class is part of the `transformers` library from Hugging Face and is used to create a dataset for text generation or language modeling tasks. It facilitates the preparation of datasets for training GPT-2 models or similar models. Here are some key parameters and an example of how to use the TextDataset class:

Parameters of `TextDataset`:

`tokenizer`: An instance of a tokenizer from Hugging Face's transformers library, used to tokenize the text data.

`file_path`: The path to the text file containing the dataset.

`block_size`: The maximum block size (sequence length) to use for tokenization. Text will be chunked into blocks of this size.

`overwrite_cache`: If set to True, the cached dataset will be overwritten.

In [None]:
# Training corpus: a plain-text Shakespeare file, tokenized with the GPT-2
# tokenizer and wrapped in a TextDataset.
file_path = 'shakespeare.txt'
dataset = TextDataset(
    file_path=file_path,
    tokenizer=tokenizer,
    block_size=128,  # sequence length per training example; adjust for your dataset
)

The `DataCollatorForLanguageModeling` class is part of the `transformers` library from Hugging Face. It is designed to be used as a data collator during training for language modeling tasks. With masking disabled, the task is to predict the next token in a sequence, which is the training objective used by models like `GPT-2`.

Here are the key aspects and parameters of the `DataCollatorForLanguageModeling` class:

**Parameters**:

`tokenizer`: An instance of a tokenizer from the transformers library. This tokenizer is used to process the inputs and targets.

`mlm`: A boolean indicating whether the model is used for masked language modeling (MLM). If set to True, the data collator will mask tokens for MLM training. If set to False, it prepares batches for causal language modeling (no masking).

`mlm_probability`: The probability of masking a token if `mlm=True`.

In [None]:
# Batch collator for the Trainer. mlm=False selects causal language modelling,
# i.e. no tokens are masked.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## Fine tuning the model

In [None]:
# Hyper-parameters and checkpointing policy for the fine-tuning run.
# NOTE(review): with 300 epochs the logged training loss falls to ~0.03, which
# suggests the model may be memorising the corpus - consider fewer epochs.
training_args = TrainingArguments(
    # Where checkpoints are written; any existing contents may be overwritten.
    output_dir='/content/drive/MyDrive/Datasets_and_models/Fine-Tune_GPT-2_Model_Shakespeare/shakespeare_fine_tuned',
    overwrite_output_dir=True,
    # Training schedule.
    num_train_epochs=300,  # adjust the number of epochs
    per_device_train_batch_size=4,  # adjust to fit your GPU memory
    # Checkpoint every 10,000 steps and keep at most two on disk.
    save_steps=10_000,
    save_total_limit=2,
)

In [None]:
# Initialize the Trainer with the model, training arguments, collator and dataset
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model together with its tokenizer, so that the output
# directory is a self-contained checkpoint that can be reloaded on its own.
model.save_pretrained('/fine_tuned_models/')
tokenizer.save_pretrained('/fine_tuned_models/')

Step,Training Loss
500,3.443
1000,1.4242
1500,0.3512
2000,0.1241
2500,0.0693
3000,0.0506
3500,0.0399
4000,0.0331
4500,0.0286
5000,0.0266


TrainOutput(global_step=15000, training_loss=0.1976968141555786, metrics={'train_runtime': 2619.1791, 'train_samples_per_second': 22.564, 'train_steps_per_second': 5.727, 'total_flos': 3860589772800000.0, 'train_loss': 0.1976968141555786, 'epoch': 300.0})

## Prediction

In [None]:
# Reload the fine-tuned GPT-2 weights from disk. Only the model is loaded here;
# no tokenizer is read from this path.
fine_tuned_model_path = '/fine_tuned_models'  # adjust the path accordingly
model = GPT2LMHeadModel.from_pretrained(
    fine_tuned_model_path,
)

In [None]:
# Set the model to evaluation mode. `eval()` returns the model itself, so
# re-binding the name keeps the full module repr out of the cell output
# (the notebook is configured to echo every bare expression).
model = model.eval()

# Example prompt
prompt = "To be or not to be, that is the question."

# Tokenize the prompt. Calling the tokenizer directly (instead of `encode`)
# also returns the attention mask, and moving the encoding to `model.device`
# makes the cell work unchanged whether the model lives on CPU or GPU.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = inputs["input_ids"]

# Generate text with beam search.
# - `attention_mask` and `pad_token_id` are passed explicitly; GPT-2 defines no
#   pad token, so the end-of-sequence token is used for padding.
# - `top_k` / `top_p` were removed: they only take effect when sampling is
#   enabled. To sample instead, add `do_sample=True, top_k=50, top_p=0.95`.
output = model.generate(
    input_ids,
    attention_mask=inputs["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    max_length=100,
    num_beams=5,
    no_repeat_ngram_size=2,
)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:")
print(generated_text)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:
To be or not to be, that is the question.



How heavy do I journey on the way,
When what I seek (my weary travel's end)
Doth teach that case and that repose to say
'Thus far the miles are measured from thy friend.'
The beast that bears me, tired with my woe, begins to steep
At a height of more than my gentle breast;
And stops just where I am, as when first I saw
