In [None]:
pip install transformers

In [None]:
pip install evaluate

In [None]:
import torch
import json
import os
import random
import sys
from torch import nn
import numpy as np
from tqdm import tqdm
from itertools import chain
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset,DataLoader
from torch.utils.data import DataLoader, RandomSampler
from utils import *
from transformers import AdamW
from transformers import get_constant_schedule_with_warmup

from transformers import PreTrainedModel, PretrainedConfig
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import GPT2LMHeadModel, GPT2Tokenizer,GPT2Config

# Single seed shared by every random number generator used in this notebook.
SEED = 42

# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

# Seed Python's `random`, NumPy and PyTorch so sampling is repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f21342c90f0>

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

##### Share of prompts and labels shorter than max_length (1024 tokens)


In [None]:
# Data split and pretrained checkpoint used for the length analysis below.
filename = "train.json"
model_name = "gpt2"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Raw examples; each element is indexed below as pair[0] (prompt) and pair[1] (label).
# `get_supervised_data` is presumably provided by `from utils import *` - confirm.
pairs = get_supervised_data(filename)

In [None]:
# Fraction (between 0 and 1) of prompts that tokenize to fewer than 1024 tokens.
# The tokenizer warns about prompts longer than the model maximum; that is harmless
# here because only the lengths are measured and the ids are never fed to the model.
prompts_len = [len(tokenizer(pair[0])['input_ids']) for pair in pairs]
percentage = sum(1 for len_ in prompts_len if len_ < 1024) / len(prompts_len)

# Display the result, mirroring the label-length cell that follows.
percentage


Token indices sequence length is longer than the specified maximum sequence length for this model (5969 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# Fraction (between 0 and 1) of labels that tokenize to fewer than 1024 tokens.
labels_len = []
for pair in pairs:
    labels_len.append(len(tokenizer(pair[1])['input_ids']))

# Each comparison is a bool, so summing them counts the labels under the limit.
sum(token_count < 1024 for token_count in labels_len) / len(labels_len)

0.9777435370655709

### Dataset

In [None]:
def replace_elements(input_ids, start_index):
    """Overwrite ``input_ids[start_index:]`` with -100, in place.

    -100 is the default ``ignore_index`` of PyTorch's cross-entropy loss, so the
    overwritten positions are skipped when the tensor is used as labels.

    Args:
        input_ids: 1-D tensor of token ids. It is modified in place.
        start_index: First position to overwrite. Ordinary slice semantics
            apply: a value at or beyond the tensor length changes nothing,
            and a negative value counts from the end.

    Returns:
        The same tensor object that was passed in, after modification.
    """
    # Scalar slice assignment keeps the tensor's own dtype/device and avoids
    # building a temporary fill tensor (which also failed for out-of-range starts).
    input_ids[start_index:] = -100

    return input_ids

class Dataset_GPT2(Dataset):
    """Tokenized prompt/answer pairs prepared for GPT-2 fine-tuning.

    Every example is the string ``bos + prompt + eos + eos + answer + eos``,
    tokenized, truncated/padded to ``max_input_length`` and moved to ``DEVICE``
    at construction time, so all stored tensors live on that device.

    Each item is a 3-tuple:
        * input ids, shape ``(1, max_input_length)``; trailing padding
          positions are overwritten with -100 (see ``__init__``),
        * attention mask, shape ``(max_input_length,)``,
        * token-type ids, a float NumPy array of shape ``(max_input_length,)``
          holding 0 for the prompt segment and 1 afterwards.
    """

    def __init__(self, tokenizer, filename, sample_fraction=0.1, max_input_length=1024):
        """Load, subsample and tokenize the pairs stored in ``filename``.

        Args:
            tokenizer: Tokenizer exposing ``bos_token``, ``eos_token`` and
                ``eos_token_id``. Its ``pad_token`` is set to its EOS token
                (the tokenizer object is mutated).
            filename: Path handed to ``get_supervised_data``.
            sample_fraction: Fraction of the loaded pairs that is kept, drawn
                with ``random.sample``. Defaults to 0.1 (10%).
            max_input_length: Length every example is truncated/padded to.
                Defaults to 1024.
        """
        self.tokenizer   = tokenizer
        self.input_ids   = []
        self.attentions  = []
        self.token_types = []
        # Padding to max_length needs a pad token; reuse EOS for it.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        pairs = get_supervised_data(filename)
        # Keep only a random subset of the data to shorten experiments.
        pairs = random.sample(pairs, int(len(pairs) * sample_fraction))

        for pair in pairs:
            prompt, labels = pair[0], pair[1]  # question and answer

            # Layout: <bos> prompt <eos><eos> answer <eos>
            text = (
                self.tokenizer.bos_token
                + prompt
                + self.tokenizer.eos_token
                + self.tokenizer.eos_token
                + labels
                + self.tokenizer.eos_token
            )

            # Tokenize the whole sequence (question + answer) in one pass.
            input_encodings = self.tokenizer(
                text,
                max_length=max_input_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            ).to(DEVICE)

            # Positions of every EOS id in the sequence. Because pad == EOS,
            # padding positions are included as well.
            eos_positions = torch.nonzero(
                input_encodings['input_ids'].view(-1) == self.tokenizer.eos_token_id
            ).flatten()

            # Token types separate the two segments: 0 up to the third EOS match,
            # 1 from there on. With fewer than three matches (e.g. a truncated
            # prompt) the last match is used as the boundary instead.
            # NOTE(review): this counting presumes BOS shares the EOS id, so that
            # match 0 is the BOS token - confirm for other tokenizers.
            eos_index = eos_positions[2].item() if len(eos_positions) >= 3 else eos_positions[-1].item()
            token_type_ids = np.ones(len(input_encodings['input_ids'][0]))
            token_type_ids[:eos_index] = 0

            # Overwrite everything after the fifth EOS match with -100.
            # NOTE(review): under the layout above the fifth match is the first
            # padding token, which therefore stays unmasked - confirm intent.
            if len(eos_positions) >= 5:
                input_encodings['input_ids'][0][eos_positions[4] + 1:] = -100

            self.input_ids.append(input_encodings['input_ids'])
            self.attentions.append(input_encodings["attention_mask"][0])
            self.token_types.append(token_type_ids)


    def __len__(self):
        """Return the number of stored examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, token_type_ids)`` for ``idx``."""
        return (
            self.input_ids[idx],
            self.attentions[idx],
            self.token_types[idx]
        )



In [None]:
# Build one Dataset_GPT2 per split, in train -> validation -> test order.
training_dataset, validation_dataset, testing_dataset = (
    Dataset_GPT2(tokenizer, split_file)
    for split_file in ('train.json', 'val.json', 'test.json')
)

In [None]:
# Inspect the first validation example: (input_ids, attention_mask, token_type_ids).
validation_dataset[0]

(tensor([[50256,    34,   343,  ...,  -100,  -100,  -100]], device='cuda:0'),
 tensor([1, 1, 1,  ..., 0, 0, 0], device='cuda:0'),
 array([0., 0., 0., ..., 1., 1., 1.]))

In [None]:
# Inspect the first training example: (input_ids, attention_mask, token_type_ids).
training_dataset[0]

(tensor([[50256,  1135,   869,  ...,  -100,  -100,  -100]], device='cuda:0'),
 tensor([1, 1, 1,  ..., 0, 0, 0], device='cuda:0'),
 array([0., 0., 0., ..., 1., 1., 1.]))

### Custom  model

In [None]:
from transformers import AutoTokenizer, GPT2Config

class GPT2_Fine_Tuned_Model_Config(GPT2Config):
    """GPT2Config subclass tagged with a custom ``model_type``; adds no new fields."""

    # Identifier string stored on the config class for this custom model.
    model_type = "GPT2_Fine_Tuned_Model"

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to ``GPT2Config.__init__``."""
        super().__init__(**kwargs)

class GPT2_Fine_Tuned_Model(GPT2LMHeadModel):
    """Wrapper around a pretrained ``gpt2`` LM head model for fine-tuning.

    The pretrained network is stored in ``self.model`` and is the only network
    used by ``forward``; the matching tokenizer is stored in ``self.tokenizer``.
    """

    def __init__(self, config):
        """
        GPT2-based fine-tuned model for a specific task.

        Args:
            config: The configuration object for the model.
        """
        # NOTE(review): the parent constructor presumably also builds this class's
        # own GPT-2 modules from `config`; forward() never uses them, only
        # `self.model`. Confirm whether that second set of weights is wanted.
        super().__init__(config)
        self.config = config

        # Load the base GPT2 model
        model_name = "gpt2"
        self.model = GPT2LMHeadModel.from_pretrained(model_name)

        # Use the GPT2 tokenizer for encoding and decoding
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    def forward(self, batch_input_ids, batch_attention_mask, batch_token_types):
        """
        Perform a forward pass of the GPT2-based fine-tuned model.

        Args:
            batch_input_ids: Token ids of the batch. Positions holding -100 are
                treated as "ignore" positions: they stay -100 in the labels but
                are replaced by the EOS id before the embedding lookup.
            batch_attention_mask: The attention masks of the batch.
            batch_token_types: Segment ids of the batch (any numeric dtype);
                cast to ``torch.long`` on the device of ``batch_input_ids``.

        Returns:
            The output of the wrapped model; the loss is its first element
            because labels are supplied.
        """
        # Labels keep -100 so the loss skips those positions.
        labels = batch_input_ids

        # -100 is not a valid embedding index, so the ids fed to the network
        # must not contain it. masked_fill returns a new tensor, leaving the
        # caller's tensor (and the labels) untouched.
        input_ids = labels.masked_fill(labels == -100, self.tokenizer.eos_token_id)

        # Follow the inputs' device instead of a notebook-level global.
        batch_token_types = batch_token_types.to(device=input_ids.device, dtype=torch.long)

        outputs = self.model(
            input_ids,
            labels=labels,
            attention_mask=batch_attention_mask,
            token_type_ids=batch_token_types
        )
        return outputs



## Training Loop

In [None]:
def train_fine_tuned_model_gpt(
    model,
    tokenizer,
    training_dataset,
    validation_dataset,
    epochs,
    learningRate,
    batch_size,
    model_save_root,
    warmup_percent=0.2,
    max_grad_norm=1.0,
    log_every=1200,
):
    """Fine-tune ``model`` and save the checkpoint with the best validation loss.

    Args:
        model: Either a model whose ``forward`` takes
            ``(input_ids, attention_mask, token_type_ids)`` positionally and
            returns the loss first, or a model whose ``forward`` has an
            ``input_ids`` keyword parameter (Hugging Face style), which is then
            called with keywords and explicit ``labels``.
        tokenizer: Tokenizer saved next to the best checkpoint.
        training_dataset: Dataset yielding
            ``(input_ids, attention_mask, token_type_ids)`` items.
        validation_dataset: Dataset passed to ``validation`` after each epoch.
        epochs: Number of passes over ``training_dataset``.
        learningRate: Adam learning rate.
        batch_size: Batch size for training and validation.
        model_save_root: Directory passed to the ``save_pretrained`` calls.
        warmup_percent: Fraction of all optimizer steps used for LR warmup.
        max_grad_norm: Gradient-norm clipping threshold.
        log_every: Print the running epoch loss every this many iterations;
            a falsy value disables the print.

    Returns:
        None. The best model, its config and the tokenizer are written to
        ``model_save_root``.
    """
    import inspect  # only needed here, to look at the forward signature

    # Create the optimizer
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learningRate, betas=(0.9, 0.95))

    # Create a DataLoader for the training dataset
    train_dataloader = DataLoader(
        training_dataset,
        batch_size=batch_size
    )

    # One optimizer step per batch, so the schedule length is batches * epochs
    # (not batch_size * epochs, which made the warmup effectively empty).
    total_steps = len(train_dataloader) * epochs
    warmup_steps = int(warmup_percent * total_steps)

    scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)

    # Decide once how the model has to be called.
    try:
        forward_params = inspect.signature(model.forward).parameters
    except (TypeError, ValueError):
        forward_params = {}
    hf_style_forward = "input_ids" in forward_params

    # Replacement id for -100 positions on the keyword (Hugging Face) path.
    fill_id = getattr(getattr(model, "config", None), "eos_token_id", None)
    if fill_id is None:
        fill_id = 0

    model.zero_grad()
    best_loss = float("inf")
    iteration_count = 0  # Iterations across all epochs

    for epoch in range(epochs):

        train_loss_accum = 0  # Accumulated training loss for the epoch
        epoch_train_step = 0  # Number of training steps in the epoch
        model.train()

        for batch in tqdm(train_dataloader, desc="Training"):
            optimizer.zero_grad()

            input_ids, attention_mask, token_type_ids = batch

            if hf_style_forward:
                # Stock HF models take past_key_values as the second positional
                # argument, so call by keyword. -100 stays in the labels only;
                # the ids fed to the embedding get a valid replacement id.
                labels = input_ids.reshape(-1, input_ids.size(-1))
                clean_ids = labels.masked_fill(labels == -100, fill_id)
                token_types = token_type_ids.to(device=clean_ids.device, dtype=torch.long)
                loss = model(
                    input_ids=clean_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_types,
                    labels=labels,
                )[0]
            else:
                loss = model(input_ids, attention_mask, token_type_ids)[0]

            loss.backward()  # Backpropagation
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping

            optimizer.step()  # Update model parameters
            scheduler.step()  # Update learning rate schedule

            train_loss_accum += loss.item()
            epoch_train_step += 1
            iteration_count += 1

            if log_every and iteration_count % log_every == 0:
                # Average over this epoch's steps: the accumulator resets every epoch.
                print(f"Loss for batch {iteration_count}: {train_loss_accum / epoch_train_step}")

        epoch_train_loss = train_loss_accum / epoch_train_step  # Average training loss for the epoch

        # Perform validation after each epoch
        validation_loss = validation(model, validation_dataset, batch_size)

        print(
            f"Epoch: {epoch} | Training Loss: {epoch_train_loss:.3f} | Validation Loss: {validation_loss:.3f}"
        )

        if validation_loss < best_loss:
            # Save the best model, configuration, and tokenizer
            model.save_pretrained(model_save_root)
            model.config.save_pretrained(model_save_root)
            tokenizer.save_pretrained(model_save_root)

            best_loss = validation_loss  # Update the best validation loss

            print("Model Saved!")

        print("---------------------------------")


### Validation Loop

In [None]:

def validation(model, validation_dataset, batch_size):
    """Return the mean per-batch loss of ``model`` on ``validation_dataset``.

    Args:
        model: Either a model whose ``forward`` takes
            ``(input_ids, attention_mask, token_type_ids)`` positionally and
            returns the loss first, or a model whose ``forward`` has an
            ``input_ids`` keyword parameter (Hugging Face style), which is then
            called with keywords and explicit ``labels``.
        validation_dataset: Dataset yielding
            ``(input_ids, attention_mask, token_type_ids)`` items.
        batch_size: Number of examples per evaluation batch.

    Returns:
        The sum of the batch losses divided by the number of batches.

    Side effects:
        Clears the model's gradients and leaves it in evaluation mode.
    """
    import inspect  # only needed here, to look at the forward signature

    # Create a DataLoader to iterate through the validation dataset in batches
    eval_dataloader = DataLoader(
        validation_dataset,
        batch_size=batch_size
    )

    # Decide once how the model has to be called.
    try:
        forward_params = inspect.signature(model.forward).parameters
    except (TypeError, ValueError):
        forward_params = {}
    hf_style_forward = "input_ids" in forward_params

    # Replacement id for -100 positions on the keyword (Hugging Face) path.
    fill_id = getattr(getattr(model, "config", None), "eos_token_id", None)
    if fill_id is None:
        fill_id = 0

    eval_loss_accum = 0  # Accumulator for the evaluation loss
    eval_step = 0  # Counter for the number of evaluation steps

    model.zero_grad()
    model.eval()  # Put the model in evaluation mode

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Validation"):
            input_ids, attention_mask, token_type_ids = batch

            if hf_style_forward:
                # Stock HF models take past_key_values as the second positional
                # argument, so call by keyword. -100 stays in the labels only;
                # the ids fed to the embedding get a valid replacement id.
                labels = input_ids.reshape(-1, input_ids.size(-1))
                clean_ids = labels.masked_fill(labels == -100, fill_id)
                token_types = token_type_ids.to(device=clean_ids.device, dtype=torch.long)
                loss = model(
                    input_ids=clean_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_types,
                    labels=labels,
                )[0]
            else:
                loss = model(input_ids, attention_mask, token_type_ids)[0]

            eval_loss_accum += loss.item()
            eval_step += 1

    validation_loss = eval_loss_accum / eval_step

    return validation_loss


Fine-tune the custom model

In [None]:

# Define the configuration for the fine-tuned GPT-2 model
config = GPT2_Fine_Tuned_Model_Config()

# Create an instance of the fine-tuned GPT-2 model and move it to the appropriate device
model = GPT2_Fine_Tuned_Model(config).to(DEVICE)

learningRate = 5e-5
model_save_root = 'fine_tuned_gpt2'
warmup_percent = 0.2
max_grad_norm = 1.0
epochs = 1
batch_size = 1

# Call the training function to train the fine-tuned GPT-2 model.
# Arguments are passed by keyword so each value reaches the intended parameter;
# the tokenizer (required as the second parameter) was previously left out,
# which shifted every following positional argument by one.
train_fine_tuned_model_gpt(
    model=model,
    tokenizer=tokenizer,
    training_dataset=training_dataset,
    validation_dataset=validation_dataset,
    epochs=epochs,
    learningRate=learningRate,
    batch_size=batch_size,
    model_save_root=model_save_root,
    warmup_percent=warmup_percent,
    max_grad_norm=max_grad_norm,
)


Second approach (without the custom model)

In [None]:

# Second approach: fine-tune the stock pretrained GPT-2 LM head model directly.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(DEVICE)

# Use the EOS id as the padding id in the model config.
model.config.pad_token_id = model.config.eos_token_id

# Use the GPT2 tokenizer for encoding and decoding
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Hyper-parameters for this run
learningRate = 5e-5
model_save_root = 'fine_tuned_gpt2'
warmup_percent = 0.2
max_grad_norm = 1.0
epochs = 1
batch_size = 1

# Train, validating after every epoch and saving the best checkpoint.
train_fine_tuned_model_gpt(
    model=model,
    tokenizer=tokenizer,
    training_dataset=training_dataset,
    validation_dataset=validation_dataset,
    epochs=epochs,
    learningRate=learningRate,
    batch_size=batch_size,
    model_save_root=model_save_root,
    warmup_percent=warmup_percent,
    max_grad_norm=max_grad_norm,
)


Training: 100%|██████████| 5841/5841 [38:05<00:00,  2.56it/s]
Validation: 100%|██████████| 682/682 [01:19<00:00,  8.56it/s]

Epoch: 0 | Training Loss: 0.795 | Validation Loss: 0.721



