## Fine-tune GPT-2 on the Shakespeare Dataset

#### Install necessary packages

In [2]:
!pip install transformers
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
     -------------------------------------- 521.2/521.2 kB 1.2 MB/s eta 0:00:00


[notice] A new release of pip is available: 23.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting pyarrow>=8.0.0 (from datasets)
  Downloading pyarrow-14.0.1-cp38-cp38-win_amd64.whl (24.6 MB)
     ---------------------------------------- 24.6/24.6 MB 1.2 MB/s eta 0:00:00
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
     -------------------------------------- 115.3/115.3 kB 1.7 MB/s eta 0:00:00
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp38-cp38-win_amd64.whl (29 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py38-none-any.whl (132 kB)
     -------------------------------------- 132.6/132.6 kB 1.9 MB/s eta 0:00:00
Collecting fsspec[http]<=2023.10.0,>=2023.1.0 (from datasets)
  Downloading fsspec-2023.10.0-py3-none-any.whl (166 kB)
     -------------------------------------- 166.4/166.4 kB 2.0 MB/s eta 0:00:00
Collecting huggingface-hub>=0.18.0 (from datasets)
  Downloadi

#### Download the Shakespeare dataset / upload an already-split dataset on Colab

In [1]:
#import gpt_2_simple as gpt2
import os
import requests

# Fetch the tiny-Shakespeare corpus once; the download is skipped when a local
# copy named `file_name` already exists.
file_name = "shakespeare.txt"
if not os.path.isfile(file_name):
	url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
	# Bound the request so a stalled connection cannot hang the notebook.
	data = requests.get(url, timeout=30)
	# Fail loudly on an HTTP error instead of saving an error page as the corpus.
	data.raise_for_status()

	# Write with an explicit encoding: the platform default (e.g. cp1252 on
	# Windows) is not guaranteed to be able to represent the decoded text.
	with open(file_name, 'w', encoding='utf-8') as f:
		f.write(data.text)


#### Fine-Tune and Train GPT-2

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Pretrained checkpoint identifier shared by the config, tokenizer and model
model_name = "gpt2"

# Pull the configuration, tokenizer and LM-head model for that checkpoint
config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name)
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path=model_name)

# Plain-text files holding the training and evaluation splits
train_file, test_file = (
    "./shakespeare_train_data.txt",
    "./shakespeare_test_data.txt",
)

def load_dataset(train_file, test_file, tokenizer, block_size=128):
    """Build the training and evaluation datasets from two plain-text files.

    Args:
        train_file: Path to the text file used for training.
        test_file: Path to the text file used for evaluation.
        tokenizer: Tokenizer handed to each ``TextDataset``.
        block_size: Value forwarded as ``block_size`` to both ``TextDataset``
            instances. Defaults to 128, the value that was previously
            hard-coded, so existing three-argument calls behave the same.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of ``TextDataset`` objects.
    """
    train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_file, block_size=block_size)
    test_dataset = TextDataset(tokenizer=tokenizer, file_path=test_file, block_size=block_size)
    return train_dataset, test_dataset

# Turn the two text files into the train / eval datasets
train_dataset, test_dataset = load_dataset(
    train_file=train_file,
    test_file=test_file,
    tokenizer=tokenizer,
)

# Batch collator with masked-language-modelling switched off (mlm=False)
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training configuration: where to write, how long to train, batch sizes,
# checkpoint cadence/retention, and evaluation once per epoch
training_args = TrainingArguments(
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    output_dir="output",
    overwrite_output_dir=True,
)

# Wire the model, arguments, collator and both datasets into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

# Kick off fine-tuning; left as the cell's last expression so its result is displayed
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm
***** Running training *****
  Num examples = 2399
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 900
  Number of trainable parameters = 124439808
 33%|███▎      | 300/900 [26:47<54:10,  5.42s/it]  ***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
                                                 
 33%|███▎      | 300/900 [27:39<54:10,  5.42s/it]

{'eval_loss': 3.4811930656433105, 'eval_runtime': 51.673, 'eval_samples_per_second': 4.664, 'eval_steps_per_second': 0.6, 'epoch': 1.0}


 56%|█████▌    | 500/900 [46:00<36:45,  5.51s/it]  

{'loss': 3.6267, 'learning_rate': 2.2222222222222223e-05, 'epoch': 1.67}


 67%|██████▋   | 600/900 [55:19<27:29,  5.50s/it]***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
                                                 
 67%|██████▋   | 600/900 [56:12<27:29,  5.50s/it]

{'eval_loss': 3.450711965560913, 'eval_runtime': 52.5294, 'eval_samples_per_second': 4.588, 'eval_steps_per_second': 0.59, 'epoch': 2.0}


100%|██████████| 900/900 [1:22:08<00:00,  4.73s/it]***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
                                                   
100%|██████████| 900/900 [1:22:55<00:00,  4.73s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 900/900 [1:22:55<00:00,  5.53s/it]

{'eval_loss': 3.4513208866119385, 'eval_runtime': 46.6431, 'eval_samples_per_second': 5.167, 'eval_steps_per_second': 0.665, 'epoch': 3.0}
{'train_runtime': 4975.7633, 'train_samples_per_second': 1.446, 'train_steps_per_second': 0.181, 'train_loss': 3.5112934027777776, 'epoch': 3.0}





TrainOutput(global_step=900, training_loss=3.5112934027777776, metrics={'train_runtime': 4975.7633, 'train_samples_per_second': 1.446, 'train_steps_per_second': 0.181, 'train_loss': 3.5112934027777776, 'epoch': 3.0})

#### Save the model as well as the tokenizer

In [3]:
# Persist the fine-tuned model through the Trainer
trainer.save_model(output_dir="fine_tuned_gpt2_shakespeare")

# Store the tokenizer files in the same directory; as the cell's last
# expression, the returned value is displayed
tokenizer.save_pretrained(save_directory="fine_tuned_gpt2_shakespeare")

Saving model checkpoint to fine_tuned_gpt2_shakespeare
Configuration saved in fine_tuned_gpt2_shakespeare\config.json
Configuration saved in fine_tuned_gpt2_shakespeare\generation_config.json
Model weights saved in fine_tuned_gpt2_shakespeare\pytorch_model.bin
tokenizer config file saved in fine_tuned_gpt2_shakespeare\tokenizer_config.json
Special tokens file saved in fine_tuned_gpt2_shakespeare\special_tokens_map.json


('fine_tuned_gpt2_shakespeare\\tokenizer_config.json',
 'fine_tuned_gpt2_shakespeare\\special_tokens_map.json',
 'fine_tuned_gpt2_shakespeare\\vocab.json',
 'fine_tuned_gpt2_shakespeare\\merges.txt',
 'fine_tuned_gpt2_shakespeare\\added_tokens.json')