<a href="https://colab.research.google.com/github/Madhavmurari/Fine-Tuning/blob/main/Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Step 1: Setting Up the Environment**

In [None]:
!pip install torch torchtext transformers sentencepiece pandas tqdm datasets

Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)


## **Step 2: Import Important Libraries**

In [None]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

## **Step 3: Load the Datasets**

In [None]:
# Fetch the symptom/disease corpus from the Hugging Face Hub.
dataset = load_dataset(path="prognosis/symptoms_disease_v1")

# Bare expression so the notebook renders the DatasetDict summary.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


combined_disease_prediction_symptom.json:   0%|          | 0.00/3.31M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10110 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'reference', 'output', 'instruction'],
        num_rows: 10110
    })
})

In [None]:
# Build a two-column pandas DataFrame from the train split:
#   'Input'   <- the dataset's 'instruction' field (the question / symptom list)
#   'Disease' <- the dataset's 'output' field (the expected answer)
updated_data = [
    {'Input': instruction, 'Disease': answer}
    for instruction, answer in zip(dataset['train']['instruction'],
                                   dataset['train']['output'])
]
df = pd.DataFrame(updated_data)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Input,Disease
0,What are the symptoms of hypertensive disease?,The following are the symptoms of hypertensive...
1,I am having the following symptoms: pain ches...,The symptoms listed indicates that the patient...
2,What are the symptoms of diabetes?,The following are the symptoms of diabetes: po...
3,"I am having the following symptoms: polyuria, ...",The symptoms listed indicates that the patient...
4,What are the symptoms of depressive disorder?,The following are the symptoms of depressive d...


## **Step 4: Select the device for Model training**

In [None]:
# Train on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## **Step 5: Load the Tokenizer and Pre-trained Model**

In [None]:
# Tokenizer: converts text to token ids and back again.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

# Pre-trained GPT-2 medium with its language-modelling head. The weights are
# loaded first and then moved onto the device selected earlier.
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to(device)

# Bare expression so the notebook prints the module tree.
model

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

## **Step 6: Dataset Preparation and Custom Dataset Class Definition**

In [None]:
# Dataset Prep
class LanguageDataset(Dataset):
    """
    Map-style dataset that turns the rows of a pandas DataFrame into
    tokenized "<first column> | <second column>" training examples.

    The first DataFrame column is used as the prompt and the second as the
    expected completion; any further columns are ignored.

    Args:
        df: pandas DataFrame with at least two columns.
        tokenizer: object exposing ``encode_plus``. Items are requested with
            ``padding='max_length'``, so the tokenizer needs a pad token by
            the time the first item is fetched.
        seq_length: number of tokens each example is padded / truncated to.
            Defaults to 128, the value that was previously hard-coded in
            ``__getitem__``.

    Attributes:
        labels: the column index of ``df``.
        data: list with one ``{column: value}`` dict per row.
        tokenizer: the tokenizer passed in.
        max_length: result of ``fittest_max_length``. NOTE: this is measured
            in characters, not tokens, and is not used for tokenization.
        seq_length: token length actually passed to the tokenizer.
    """
    def __init__(self, df, tokenizer, seq_length=128):
        self.labels = df.columns
        self.data = df.to_dict(orient='records')
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        # Kept for backward compatibility; see the class docstring for why it
        # is not the tokenization length.
        self.max_length = self.fittest_max_length(df)

    def __len__(self):
        """Number of rows (examples) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Tokenize row ``idx`` as "<prompt> | <completion>".

        Returns whatever ``tokenizer.encode_plus`` returns for that text when
        asked for PyTorch tensors padded / truncated to ``self.seq_length``.
        With Hugging Face tokenizers each tensor carries a leading dimension
        of 1, which the training loop removes with ``squeeze(1)``.
        """
        row = self.data[idx]
        prompt = row[self.labels[0]]
        completion = row[self.labels[1]]
        text = f"{prompt} | {completion}"
        return self.tokenizer.encode_plus(
            text,
            return_tensors='pt',
            max_length=self.seq_length,
            padding='max_length',
            truncation=True,
        )

    def fittest_max_length(self, df):
        """
        Smallest power of two (at least 2) that is not smaller than the
        longest value, in characters, found in the first two columns.

        Values are measured via ``len(str(value))`` so non-string cells do
        not raise, and an empty frame yields 2 instead of an error.
        """
        first_col, second_col = self.labels[0], self.labels[1]
        longest = max(
            (len(str(value)) for col in (first_col, second_col) for value in df[col]),
            default=0,
        )
        size = 2
        while size < longest:
            size *= 2
        return size

# Wrap the DataFrame built from the Hugging Face dataset in the
# LanguageDataset class so a DataLoader can consume it.
data_sample = LanguageDataset(df=df, tokenizer=tokenizer)

In [None]:
 # Bare expression: displays the default object repr of the dataset wrapper
 # (the class defines no custom __repr__).
 data_sample

<__main__.LanguageDataset at 0x7e3b39d74fd0>

## **Step 7: Split the Dataset into Training and Validation Sets**

In [None]:
# 80/20 train/validation split. The validation size is the remainder, so the
# two parts always add up to len(data_sample).
train_size = int(0.8 * len(data_sample))
valid_size = len(data_sample) - train_size

# A seeded generator makes the random split reproducible across kernel
# restarts; without it every run trains and validates on different rows.
train_data, valid_data = random_split(
    data_sample,
    [train_size, valid_size],
    generator=torch.Generator().manual_seed(42),
)

In [None]:
# Printing a Subset only shows its default object repr, not its contents.
print(train_data)

<torch.utils.data.dataset.Subset object at 0x7e3b41dcd090>


## **Step 8: Create dataloader**

In [None]:
# Number of passes over the training set
num_epochs = 5

# Number of examples per optimisation step
BATCH_SIZE = 8

# Reuse EOS as the pad token. This has to happen before anything reads
# tokenizer.pad_token_id (the loss below) and before the DataLoaders request
# items padded with padding='max_length'. Previously the loss was constructed
# first and therefore received the pad_token_id from before this assignment.
tokenizer.pad_token = tokenizer.eos_token

# Only the training loader is shuffled; validation order does not matter.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE)

# Training parameters (recorded in the results table for each epoch)
batch_size = BATCH_SIZE
model_name = 'gpt2-medium'
gpu = 0

# NOTE: the training loop in this notebook uses the loss returned by the model
# (outputs.loss) and does not call `criterion`; it is kept for compatibility.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-4)

# Init a results dataframe: one row is appended per epoch by the training loop
results = pd.DataFrame(columns=['epoch', 'transformer', 'batch_size', 'gpu',
                                'training_loss', 'validation_loss', 'epoch_duration_sec'])

## **Step 10: Training and Validation Loop**

In [None]:
def _prepare_batch(batch, device):
    """
    Move one collated batch to `device` and build causal-LM labels.

    Returns:
        (input_ids, attention_mask, labels), each of shape (batch, seq_len)
        after the singleton dimension added by the dataset is squeezed out.

    Padded positions (attention_mask == 0) are set to -100 in `labels` so the
    model's built-in loss skips them; otherwise the loss is dominated by
    trivially predictable padding. The first padded position of each row is
    kept as a target: the pad token is the EOS token in this notebook, so the
    model still learns to end its answer.
    # assumes right-side padding (real tokens first) — TODO confirm if the
    # tokenizer's padding_side is ever changed
    """
    input_ids = batch['input_ids'].squeeze(1).to(device)
    attention_mask = batch['attention_mask'].squeeze(1).to(device)

    labels = input_ids.masked_fill(attention_mask == 0, -100)

    # Index of the first pad token per row == number of real tokens in the row.
    lengths = attention_mask.sum(dim=1)
    rows = torch.nonzero(lengths < input_ids.size(1), as_tuple=True)[0]
    labels[rows, lengths[rows]] = input_ids[rows, lengths[rows]]

    return input_ids, attention_mask, labels


# The training loop: one training pass and one validation pass per epoch,
# with the averaged losses and wall-clock duration appended to `results`.
for epoch in range(num_epochs):
    start_time = time.time()  # Start the timer for the epoch

    # Training
    ## Put the model in training mode (enables dropout)
    model.train()
    epoch_training_loss = 0
    train_iterator = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs} Batch Size: {batch_size}, Transformer: {model_name}")
    for batch in train_iterator:
        optimizer.zero_grad()
        inputs, attention_mask, targets = _prepare_batch(batch, device)
        # The model shifts the labels internally and returns the LM loss.
        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()  # Convert tensor to scalar once per batch
        train_iterator.set_postfix({'Training Loss': batch_loss})
        epoch_training_loss += batch_loss
    avg_epoch_training_loss = epoch_training_loss / len(train_loader)

    # Validation
    model.eval()
    epoch_validation_loss = 0
    valid_iterator = tqdm(valid_loader, desc=f"Validation Epoch {epoch+1}/{num_epochs}")
    with torch.no_grad():  # No gradients needed while evaluating
        for batch in valid_iterator:
            inputs, attention_mask, targets = _prepare_batch(batch, device)
            outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=targets)
            batch_loss = outputs.loss.item()
            valid_iterator.set_postfix({'Validation Loss': batch_loss})
            epoch_validation_loss += batch_loss

    avg_epoch_validation_loss = epoch_validation_loss / len(valid_loader)

    end_time = time.time()  # End the timer for the epoch
    epoch_duration_sec = end_time - start_time  # Calculate the duration in seconds

    new_row = {'transformer': model_name,
               'batch_size': batch_size,
               'gpu': gpu,
               'epoch': epoch+1,
               'training_loss': avg_epoch_training_loss,
               'validation_loss': avg_epoch_validation_loss,
               'epoch_duration_sec': epoch_duration_sec}  # Add epoch_duration to the dataframe

    results.loc[len(results)] = new_row
    print(f"Epoch: {epoch+1}, Validation Loss: {avg_epoch_validation_loss}")

Training Epoch 1/5 Batch Size: 8, Transformer: gpt2-medium: 100%|██████████| 1011/1011 [13:27<00:00,  1.25it/s, Training Loss=0.0561]
Validation Epoch 1/5: 100%|██████████| 253/253 [00:58<00:00,  4.33it/s, Validation Loss=0.0628]


Epoch: 1, Validation Loss: 0.07865695974866864


Training Epoch 2/5 Batch Size: 8, Transformer: gpt2-medium: 100%|██████████| 1011/1011 [13:32<00:00,  1.24it/s, Training Loss=0.0484]
Validation Epoch 2/5: 100%|██████████| 253/253 [00:58<00:00,  4.33it/s, Validation Loss=0.0641]


Epoch: 2, Validation Loss: 0.07475217213861556


Training Epoch 3/5 Batch Size: 8, Transformer: gpt2-medium: 100%|██████████| 1011/1011 [13:32<00:00,  1.25it/s, Training Loss=0.0429]
Validation Epoch 3/5: 100%|██████████| 253/253 [00:58<00:00,  4.34it/s, Validation Loss=0.0528]


Epoch: 3, Validation Loss: 0.07006587989245479


Training Epoch 4/5 Batch Size: 8, Transformer: gpt2-medium: 100%|██████████| 1011/1011 [13:31<00:00,  1.25it/s, Training Loss=0.0466]
Validation Epoch 4/5: 100%|██████████| 253/253 [00:58<00:00,  4.35it/s, Validation Loss=0.0664]


Epoch: 4, Validation Loss: 0.07002728383826173


Training Epoch 5/5 Batch Size: 8, Transformer: gpt2-medium: 100%|██████████| 1011/1011 [13:31<00:00,  1.25it/s, Training Loss=0.035]
Validation Epoch 5/5: 100%|██████████| 253/253 [00:58<00:00,  4.34it/s, Validation Loss=0.0595]

Epoch: 5, Validation Loss: 0.07083901229936615







## **Step 11: Model Testing and Response Validation**

In [None]:
# Prompt sent to the fine-tuned model
input_str = "What are the symptoms of Chicken pox?"

# Tokenize the prompt; the result carries both the token ids and the
# attention mask as PyTorch tensors.
encoded_input = tokenizer.encode_plus(
    input_str,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=50,  # Adjust max_length as needed
)

# Put both tensors on the same device as the model
input_ids, attention_mask = (
    encoded_input[key].to(device) for key in ('input_ids', 'attention_mask')
)

# Generation pads with the tokenizer's end-of-sequence token id
pad_token_id = tokenizer.eos_token_id

# Sample a single continuation. max_length counts the prompt tokens too,
# so the generated part is limited to what is left of the 50 tokens.
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    pad_token_id=pad_token_id,
    max_length=50,  # Adjust max_length as needed
    num_return_sequences=1,
    do_sample=True,
    top_k=8,
    top_p=0.95,
    temperature=0.5,
    repetition_penalty=1.2,
)

# Turn the generated token ids back into text and show it
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

What are the symptoms of Chicken pox? | The following are the symptoms of Chicken pox: itching, skin rash, fatigue, lethargy, high fever, headache, loss of appetite, mild fever, swelled lymph nodes, mala


## **Step 12: Save the Fine-Tuned Model**

In [None]:

import shutil

# Define save directory
save_directory = "fine_tuned_gpt2-medium"

# Save model and tokenizer side by side so the directory can be reloaded
# with from_pretrained(save_directory)
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Zip the model directory (creates "<save_directory>.zip" next to it)
shutil.make_archive(save_directory, 'zip', save_directory)

# Download the zipped model when running in Colab. The import is done last
# and guarded so that, outside Colab, the model is still saved and zipped
# instead of the cell failing before anything is written.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab is not available; archive left at {save_directory}.zip")
else:
    files.download(save_directory + ".zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **Pre-Fine-Tuning Output for the Same Query (Baseline)**

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the original (not fine-tuned) GPT-2 medium tokenizer and model.
# They are bound to their own names so this cell does not overwrite the
# fine-tuned `model` / `tokenizer` that earlier cells created and still use.
base_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
base_model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

# Set the padding token to the end-of-sequence token (common practice for GPT-2-based models)
base_tokenizer.pad_token = base_tokenizer.eos_token

# Define the input query (same prompt as in the fine-tuned test above)
input_query = "What are the symptoms of Chicken pox?"

# Tokenize the input query
input_tokens = base_tokenizer.encode_plus(
    input_query,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=50
)

# Generate a response with the baseline model. Sampling settings match the
# fine-tuned test cell (temperature was 0.7 here versus 0.5 there) so the two
# outputs differ because of the model, not the decoding parameters.
output_tokens = base_model.generate(
    input_ids=input_tokens["input_ids"],
    attention_mask=input_tokens["attention_mask"],
    max_length=50,  # Adjust max_length if needed
    num_return_sequences=1,
    do_sample=True,
    top_k=8,
    top_p=0.95,
    temperature=0.5,
    repetition_penalty=1.2,
    pad_token_id=base_tokenizer.pad_token_id
)

# Decode the generated output to human-readable text
baseline_output = base_tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Print the results
print("Pre-Fine-Tuning Response:")
print(baseline_output)

Pre-Fine-Tuning Response:
What are the symptoms of Chicken pox?
Chickenpox is a disease that causes severe inflammation and swelling in your eyes, ears or mouth. It can also cause serious infection with other infectious diseases such as pneumonia and meningitis (inflammation of
