## Install Libraries

In [None]:
%pip install transformers
%pip install datasets

## Mount Google Drive

This is done to retrieve data from my personal google drive folders.

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports

In [15]:
import transformers
import torch
import numpy as np
import datasets
import pandas as pd

from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split


device="cuda" if torch.cuda.is_available() else "cpu"

## Download the tokenizer and the model

I decided to start from a distilled GPT-2 model to reduce both inference and training time. Additionaly, the pad token coincide with the EOS token. This is suboptimal but, GPT-2 does not have any other special token.

In [7]:
# Instantiate a tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})
print(f"Special tokens: {tokenizer.all_special_tokens}")

# Instantiate a model
model = GPT2LMHeadModel.from_pretrained("distilgpt2").to(device)

Special tokens: ['<|endoftext|>', '<|endoftext|>']


## Create the dataset
Starting from the single_qna csv file from the amazon dataset, I sample and clean it. 

In [15]:
dataset = pd.read_csv("drive/MyDrive/gpt2/data/single_qna.csv").sample(frac=0.15)

dataset["text"] = "<|endoftext|> Q: " + dataset["Question"] + " A: "+ dataset["Answer"] + " <|endoftext|>" 
X_train, X_test, _, _ = train_test_split(dataset["text"].to_numpy(), [0]*len(dataset["text"]), test_size=0.05, random_state=42)

X_train = X_train[[ len(str(val))<1024 for val in X_train]]
X_test = X_test[[ len(str(val))<1024 for val in X_test]]

pd.DataFrame(X_train, columns=["text"]).dropna().to_csv("train_text_dataset.csv", index=False, header=True)
pd.DataFrame(X_test, columns=["text"]).dropna().to_csv("test_text_dataset.csv", index=False, header=True)

## Custom Functions

Custom function to tokenize the dataset and custom data-collator. DataCollatorQA returns a set of labels that is -100 for the question part.

In [16]:
def tokenize_function(examples):
    examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
    return tokenizer(
        examples["text"],
        truncation=False,
        max_length=1024,
        add_special_tokens=False
    )

class DataCollatorQA(DataCollatorForLanguageModeling):
    def __init__(self, tokenizer, mask_until_token, mlm=False):
        super().__init__(tokenizer=tokenizer, mlm=mlm)
        self.mask_until_token = torch.tensor(tokenizer.encode(mask_until_token))
    
    def __call__(self, examples):
        batch = super().__call__(examples)
        if self.mask_until_token is not None:
            labels = batch['input_ids'].clone()
            for i in range(labels.shape[0]):
              for j in range(labels.shape[1]):
                  if torch.equal(labels[i, j:j+len(self.mask_until_token)], self.mask_until_token):
                    labels[i][:j+len(self.mask_until_token)] = -100
              batch['labels'] = labels
        return batch

## Load the data

In [17]:
dataset = datasets.load_dataset('csv', data_files={ "train": "train_text_dataset.csv", "validation": "test_text_dataset.csv" }, delimiter=",")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1ae83bcdc9d27653/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1ae83bcdc9d27653/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns="text"
)

Map (num_proc=4):   0%|          | 0/197009 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10376 [00:00<?, ? examples/s]

## Freezing some layers

To finetune the model I decided to freeze the embedding layer and 3 out of 6 decoder layers.

In [19]:
model.transformer.wte.requires_grad_(False)
model.transformer.wpe.requires_grad_(False)
for i in range(3):
   model.transformer.h[i].requires_grad_(False)

print(f"Total number of parameters: {np.sum([int(np.prod(p.shape)) for p in model.parameters()])}")
print(f"Total number of trainable parameters: {np.sum([int(np.prod(p.shape)) for p in model.parameters() if p.requires_grad])}")

Total number of parameters: 81912576
Total number of trainable parameters: 21265152


## Train the model

In [20]:
data_collator = DataCollatorQA(tokenizer, " A:", mlm=False)

training_args = TrainingArguments(
          output_dir=f"drive/MyDrive/gpt2/models/{pd.Timestamp.now().strftime('%Y-%m-%d/%H-%M')}",
          overwrite_output_dir=True,
          evaluation_strategy='steps',
          per_device_train_batch_size=20,
          eval_steps = 2500,
          weight_decay=1e-2,
          learning_rate=1e-4,
          fp16=True,
          gradient_accumulation_steps=1,
          warmup_ratio = 0.3,
          save_strategy="epoch",
          num_train_epochs=2,
          label_names=None,
      )

trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset= tokenized_dataset["train"],
          eval_dataset=tokenized_dataset["validation"],
  )
      
trainer.train()
trainer.save_model()



Step,Training Loss,Validation Loss
2500,1.0915,1.383018
5000,1.0797,1.359098
7500,1.0473,1.341636
10000,1.038,1.3302
12500,1.0463,1.322267
15000,1.0292,1.316416
17500,1.0137,1.312104


## Test the model

In [16]:
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})
print(f"Special tokens: {tokenizer.all_special_tokens}")

# Instantiate a model
model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/gpt2/models/2023-04-02/13-57/checkpoint-19702/").to(device)

Special tokens: ['<|endoftext|>', '<|endoftext|>']


In [10]:
def eval_function(question, do_sample=True, max_length=40, top_p=0.9):
  input_ids = tokenizer.encode(f"<|endoftext|> Q: {question} A:", 
                             return_tensors='pt',
                             add_special_tokens=False).to(device)

  # generate text until the output length (which includes the context length) reaches 50
  greedy_output = model.generate(input_ids, 
                                do_sample = do_sample, 
                                max_length = max_length,
                                pad_token_id=50256,
                                top_p = top_p,
                                )
  print("Output:\n" + 100 * '-')
  print(tokenizer.decode(greedy_output[0], skip_special_tokens = True)[:len(question)+4])
  print(tokenizer.decode(greedy_output[0], skip_special_tokens = True)[len(question)+4:])

In [24]:
eval_function("Can I have it back?")

Output:
----------------------------------------------------------------------------------------------------
 Q: Can I have it back?
 A: Yes. 


In [37]:
eval_function("When the smartphone was released?")

Output:
----------------------------------------------------------------------------------------------------
 Q: When the smartphone was released?
 A: Mine was released in December 2015. 


In [34]:
eval_function("What is the size of the Galaxy S3?")

Output:
----------------------------------------------------------------------------------------------------
 Q: What is the size of the Galaxy S3?
 A: 5.5 inches 


In [36]:
eval_function("Does it fit NIKON D5000?")

Output:
----------------------------------------------------------------------------------------------------
 Q: Does it fit NIKON D5000?
 A: Yes, it does fit NIKON D5000, so that's the reason why it's better. 
