# Text Generation using GPT (Using Huggingface)

## Project Setup

In [1]:
pip show transformers

Name: transformers
Version: 4.27.4
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: c:\users\hrith.desktop-75k32p0\appdata\local\packages\pythonsoftwarefoundation.python.3.10_qbz5n2kfra8p0\localcache\local-packages\python310\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, tokenizers, tqdm
Required-by: aitextgen
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
import shutil
from torch.utils.data import Dataset, random_split
from transformers import Trainer, TrainingArguments, GPTNeoForCausalLM, GPT2Tokenizer

## Data Preparation

In [4]:
# Read the text file and returns list of lines in text
def read_file(file_path):
    with open(file_path) as f:
        lines = [line for line in f]
        # lines.remove("")
    return lines


In [5]:
file_path = "THE_SONNETS.txt"

texts = read_file(file_path)
sonnets = []
sonnet = []
for text in texts:
  if len(text)>1:
    sonnet.append(text)
  else:
    sonnets.append(''.join(sonnet))
    sonnet = []

# Remove unnecessary texts


In [6]:
# Prepare sonnets
datas = sonnets[2:-1]
print(len(datas))
for data in datas:
  if len(data)<1:
    datas.remove(data)
print(len(datas))

289
212


In [7]:
# Custome dataset class to load dataset
class ShakespeareDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            # Encode the descriptions using the GPT-Neo tokenizer
            encodings_dict = tokenizer('<|startoftext|>' 
                                        + txt +    
                                        '<|endoftext|>',
                                        truncation=True,
                                        max_length=max_length, 
                                            padding="max_length")
            input_ids = torch.tensor(encodings_dict['input_ids'])    
            self.input_ids.append(input_ids)
            mask = torch.tensor(encodings_dict['attention_mask'])
            self.attn_masks.append(mask)
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

## Initialize tokenizer, model

In [16]:
torch.cuda.is_available()

False

In [17]:
# Set the random seed to a fixed value to get reproducible results 
torch.manual_seed(42)

# Download the pre-trained GPT-Neo model's tokenizer
# Add the custom tokens denoting the beginning and the end 
# of the sequence and a special token for padding
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M",    
                            bos_token='<|startoftext|>',
                            eos_token='<|endoftext|>',
                            pad_token='<|pad|>')

# Download the pre-trained GPT-Neo model and transfer it to the GPU
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")

# Resize the token embeddings because we've just added 3 new tokens 
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 768)

## Train/Test Split data

In [10]:
max_length = max([len(tokenizer.encode(sonnet)) for sonnet in datas])

# Load dataset
dataset = ShakespeareDataset(datas, tokenizer, max_length)

# Split data into train/val


In [11]:
len(dataset)

212

In [12]:
train_size = int(0.9 * len(dataset))

train_data, val_data = random_split(dataset, [train_size, len(dataset) - train_size])

max_length

351

## Train Model

In [13]:
# Here I will pass the output directory where 
# the model predictions and checkpoints will be stored, 
# batch sizes for the training and validation steps, 
# and warmup_steps to gradually increase the learning rate
'''
learning_rates = [5e-5, 3e-5, 1e-5]

for learning_rate in learning_rates:

    training_args = TrainingArguments(output_dir=f'./results_{learning_rate}',
                                      num_train_epochs=5,
                                      logging_steps=1000,
                                      save_steps=1000,
                                      evaluation_strategy='steps',
                                      eval_steps=1000,                               
                                      per_device_train_batch_size=2,
                                      per_device_eval_batch_size=2,
                                      warmup_steps=100,
                                      learning_rate=learning_rate,
                                      weight_decay=0.01,  
                                      logging_dir=f'./logs_{learning_rate}')

    trainer = Trainer(model=model, args=training_args,  
                      train_dataset=train_data,
                      eval_dataset=val_data, 
                      # This custom collate function is necessary 
                      # to built batches of data
                      data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),       
                  'attention_mask': torch.stack([f[1] for f in data]),
                  'labels': torch.stack([f[0] for f in data])})

    # Start training process!
    print(f"Training result for learning rate: {learning_rate}")
    trainer.train()
    print("\n\n")

'''

'\nlearning_rates = [5e-5, 3e-5, 1e-5]\n\nfor learning_rate in learning_rates:\n\n    training_args = TrainingArguments(output_dir=f\'./results_{learning_rate}\',\n                                      num_train_epochs=5,\n                                      logging_steps=1000,\n                                      save_steps=1000,\n                                      evaluation_strategy=\'steps\',\n                                      eval_steps=1000,                               \n                                      per_device_train_batch_size=2,\n                                      per_device_eval_batch_size=2,\n                                      warmup_steps=100,\n                                      learning_rate=learning_rate,\n                                      weight_decay=0.01,  \n                                      logging_dir=f\'./logs_{learning_rate}\')\n\n    trainer = Trainer(model=model, args=training_args,  \n                      train_dataset=train

BAsed on the results above, it looks like model trained with learning rate = 5e-5 is more promising than others.

In [18]:
'''ChildProcessError
training_args = TrainingArguments(output_dir=f'./results',
                                      num_train_epochs=5,
                                      logging_steps=1000,
                                      save_steps=5000,
                                      evaluation_strategy='steps',
                                      eval_steps=1000,                               
                                      per_device_train_batch_size=2,
                                      per_device_eval_batch_size=2,
                                      warmup_steps=100,
                                      learning_rate=5e-5,
                                      weight_decay=0.01,  
                                      logging_dir=f'./logs')

trainer = Trainer(model=model, args=training_args,  
                  train_dataset=train_data,
                  eval_dataset=val_data, 
                  # This custom collate function is necessary 
                  # to built batches of data
                  data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),       
              'attention_mask': torch.stack([f[1] for f in data]),
              'labels': torch.stack([f[0] for f in data])})

# Start training process!
trainer.train()
'''

  3%|▎         | 12/475 [01:20<57:37,  7.47s/it]

KeyboardInterrupt: 

In [None]:
# Save model in the specified file path
trainer.save_model("drive/MyDrive/AICamp/nlp/")

In [None]:
tokenizer.save_pretrained("drive/MyDrive/AICamp/nlp/")

('drive/MyDrive/AICamp/nlp/tokenizer_config.json',
 'drive/MyDrive/AICamp/nlp/special_tokens_map.json',
 'drive/MyDrive/AICamp/nlp/vocab.json',
 'drive/MyDrive/AICamp/nlp/merges.txt',
 'drive/MyDrive/AICamp/nlp/added_tokens.json')

In [19]:
from transformers import GPTNeoForCausalLM

# Set the path to the directory where your trained model is saved
model_dir = 'AICamp/nlp'

# Load the model from the saved directory
model = GPTNeoForCausalLM.from_pretrained(model_dir)

## Checking Model Output

In [23]:
prompt="Be as thy presence is gracious and kind"
input_ids = tokenizer.encode(prompt, add_special_tokens=True, return_tensors="pt")

# set attention mask and pad token id


In [26]:
input_ids

tensor([[ 3856,   355, 11906,  4931,   318, 43210,   290,  1611]])

In [25]:
attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1]])

In [28]:
generated

tensor([[ 3856,   355, 11906,  4931,   318, 43210,   290,  1611]])

In [30]:
generated = tokenizer("Be as thy presence is gracious and kind", return_tensors="pt").input_ids
attention_mask = torch.ones(generated.shape, dtype=torch.long, device=input_ids.device)
pad_token_id = tokenizer.pad_token_id
sample_outputs = model.generate(generated, attention_mask=attention_mask, 
                            pad_token_id=pad_token_id,do_sample=True, top_k=50,
                                # bos_token='<|startoftext|>',
                                # eos_token='<|endoftext|>', pad_token='<|pad|>',
                                max_length=100, top_p=0.95, temperature=1.9, num_return_sequences=10)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


0: Be as thy presence is gracious and kind
As thou departest
1: Be as thy presence is gracious and kindelained asTh'enabled me to speak for thou lov', write now 'if I was bold to advocate this extreme deesoftyHurt of thyurt' and my tongue being shown straight, thymilk my finger-crete bud be a kliric, lend lark a glad brow or two, my shadow lease the full view of the world, shake forth the living lease, twixt thy tender fingers 'a happylovewre
2: Be as thy presence is gracious and kind hearted,
Thou patiencesuffestiji, before whom array  
surformed
3: Be as thy presence is gracious and kindMy praise of them,
Delicate pencils withfix  
To mark dates and heraldrybeers,
They pay no pen nor argument no heartNor think norowd,
That feature, date, birth, blood, let them remain anonymous,
But let every fair man's thought give credit;  
That soil from thy praise as the sun hath gone straight,
Now comes of all thy skill their son,
Some bold
4: Be as thy presence is gracious and kind,
Desiring mor

## Upload model to huggingface

In [32]:
pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.0.6-py3-none-any.whl (138 kB)
     ---------------------------------------- 0.0/138.3 kB ? eta -:--:--
     -------------------------------------- 138.3/138.3 kB 4.0 MB/s eta 0:00:00
Collecting widgetsnbextension~=4.0.7
  Downloading widgetsnbextension-4.0.7-py3-none-any.whl (2.1 MB)
     ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
     --------------- ------------------------ 0.8/2.1 MB 25.4 MB/s eta 0:00:01
     ---------------------------------------  2.1/2.1 MB 26.9 MB/s eta 0:00:01
     ---------------------------------------- 2.1/2.1 MB 22.6 MB/s eta 0:00:00
Collecting jupyterlab-widgets~=3.0.7
  Downloading jupyterlab_widgets-3.0.7-py3-none-any.whl (198 kB)
     ---------------------------------------- 0.0/198.2 kB ? eta -:--:--
     ------------------------------------- 198.2/198.2 kB 12.5 MB/s eta 0:00:00
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installe

In [33]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi

api = HfApi()

In [None]:
# Create your repo first to upload the model
api.create_repo(repo_id="gpt2-sonnet-generators")

RepoUrl('https://huggingface.co/niki-stha/gpt2-sonnet-generators', endpoint='https://huggingface.co', repo_type='model', repo_id='niki-stha/gpt2-sonnet-generators')

In [None]:
# Upload your model to huggingface. You can clone the repo anytime to use the model.
import os

model_pth = "drive/MyDrive/AICamp/nlp/models"

files = os.listdir(model_pth)
for fi in files:
    print(os.path.join(model_pth, fi))

    api.upload_file(
        path_or_fileobj=os.path.join(model_pth, fi),
        path_in_repo=fi,
        repo_id="niki-stha/gpt2-sonnet-generators",
        repo_type="model",
    )

drive/MyDrive/AICamp/nlp/models/config.json
drive/MyDrive/AICamp/nlp/models/generation_config.json
drive/MyDrive/AICamp/nlp/models/pytorch_model.bin


Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

drive/MyDrive/AICamp/nlp/models/training_args.bin


Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

drive/MyDrive/AICamp/nlp/models/tokenizer_config.json
drive/MyDrive/AICamp/nlp/models/special_tokens_map.json
drive/MyDrive/AICamp/nlp/models/added_tokens.json
drive/MyDrive/AICamp/nlp/models/vocab.json
drive/MyDrive/AICamp/nlp/models/merges.txt
