In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader
from torch.amp import autocast
import os
import sys

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset

# Load the dataset
ds = load_dataset("keivalya/MedQuad-MedicalQnADataset")

# Extract the training and validation subsets
training_data = ds['train'][:14000]
validation_data = ds['train'][14000:15000]

# Convert the dataset columns into dictionary format manually
training_question = training_data['Question']
training_answer = training_data['Answer']
validation_question = validation_data['Question']
validation_answer = validation_data['Answer']

print(f"Fifth training question: {training_question[4]}")
print(f"Fifth training answer: {training_answer[4]}")

Fifth training question: What are the treatments for Lymphocytic Choriomeningitis (LCM) ?
Fifth training answer: Aseptic meningitis, encephalitis, or meningoencephalitis requires hospitalization and supportive treatment based on severity. Anti-inflammatory drugs, such as corticosteroids, may be considered under specific circumstances. Although studies have shown that ribavirin, a drug used to treat several other viral diseases, is effective against LCMV in vitro, there is no established evidence to support its routine use for treatment of LCM in humans.


In [3]:
import torch
from torch.utils.data import Dataset

class makeDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        # Extract the input_ids and attention_mask for the question
        input_ids = self.inputs['input_ids'][idx]
        attention_mask = self.inputs['attention_mask'][idx]

        # Extract the labels (input_ids for the answer)
        labels = self.targets['input_ids'][idx]

        # Return the input and output as a dictionary
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [4]:
# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # Default to GPT small
GPTmodel = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # Rank of the low-rank adaptation matrices
    lora_alpha=32,  # LoRA scaling factor
    lora_dropout=0.1,  # Dropout for LoRA layers
    target_modules = ["c_attn", "c_proj"]
)

# Prepare model for LoRA tuning
model = get_peft_model(GPTmodel, lora_config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

#tokenize dataset
tokenizer.pad_token = tokenizer.eos_token
maxLength = 64
tokenized_training_question = tokenizer(training_question, truncation=True, padding='max_length', return_tensors="pt", max_length = maxLength)
tokenized_training_answer = tokenizer(training_answer, truncation=True, padding=True, return_tensors="pt", max_length = maxLength)
tokenized_validation_question = tokenizer(validation_question, truncation=True, padding='max_length', return_tensors="pt", max_length = maxLength)
tokenized_validation_answer = tokenizer(validation_answer, truncation=True, padding=True, return_tensors="pt", max_length = maxLength)

print(f"Tokenized Training Questions Shape: {tokenized_training_question['input_ids'].shape}")
print(f"Tokenized Training Answers Shape: {tokenized_training_answer['input_ids'].shape}")

# Make sure it's divisible by batch size so last batch works fine
batch_size = 20
# tokenized_training_question = tokenized_training_question[:len(tokenized_training_question) // batch_size * batch_size]
# print("Training Question Tokenized Shape:", tokenized_training_question['input_ids'].shape)
# print("Training Question Tokenized Example:", tokenized_training_question['input_ids'][0])
# tokenized_training_answer = tokenized_training_answer[:len(tokenized_training_answer) // batch_size * batch_size]
# tokenized_validation_question = tokenized_validation_question[:len(tokenized_validation_question) // batch_size * batch_size]
# tokenized_validation_answer = tokenized_validation_answer[:len(tokenized_validation_answer) // batch_size * batch_size]

train_dataset = makeDataset(tokenized_training_question, tokenized_training_answer)
val_dataset = makeDataset(tokenized_validation_question, tokenized_validation_answer)
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

# Define training arguments
num_epochs = 2  # Number of training epochs
training_args = TrainingArguments(
    output_dir='./results',        # Directory to save model checkpoints
    num_train_epochs=num_epochs,            # Number of training epochs
    per_device_train_batch_size=batch_size, # Batch size per device
    per_device_eval_batch_size=batch_size,  # Batch size for evaluation
    warmup_steps=2,              # Number of warmup steps
    weight_decay=0.01,             # Weight decay
    logging_dir='./logs',          # Directory to save logs
    logging_steps=1,              # Log every X steps
)

# Create Trainer instance
trainer = Trainer(
    model=model,                     # The model you are fine-tuning
    args=training_args,              # Training arguments
    train_dataset=train_dataset,     # Your training dataset
    eval_dataset=val_dataset,
)

# Get model sizes
def print_model_size(path):
    size = 0
    for f in os.scandir(path):
        size += os.path.getsize(f)
    print(f"Model size: {(size / 1e6):.2} MB")

def print_trainable_parameters(model, label):
    parameters, trainable = 0, 0    
    for _, p in model.named_parameters():
        parameters += p.numel()
        trainable += p.numel() if p.requires_grad else 0
    print(f"{label} trainable parameters: {trainable:,}/{parameters:,} ({100 * trainable / parameters:.2f}%)")

#Fine-tune the model
print(f"Model is on device: {next(model.parameters()).device}")
print_model_size(training_args.output_dir)
print_trainable_parameters(model, "Before training")
trainer.train()

trainer.save_model("./medLora-model")



Using device: cuda


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Tokenized Training Questions Shape: torch.Size([14000, 64])
Tokenized Training Answers Shape: torch.Size([14000, 64])
Training dataset size: 14000
Validation dataset size: 1000
Model is on device: cuda:0
Model size: 0.016 MB
Before training trainable parameters: 811,008/125,250,816 (0.65%)


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
1,13.9165
2,14.1377
3,13.9125
4,13.6601
5,14.0201
6,14.1708
7,13.9082
8,13.6848
9,13.5881
10,13.746


In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and tokenizer
model_name = "medLora-model"  # Replace with your model's path
#tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Evaluate the model
model.eval()
total_loss = 0
num_batches = 0
batch_size = 8  # Adjust based on your memory constraints

with torch.no_grad():
    for i in range(0, len(val_dataset), batch_size):
        if(i+batch_size >= len(val_dataset)):
            break
        batch = val_dataset[i:i + batch_size]
        # Get input_ids and attention_mask from the batch
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask'] if 'attention_mask' in batch else None  # Optional

        # Pass input_ids as labels for loss calculation
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels= batch['labels'])
        
        loss = outputs.loss
        print(loss)
        total_loss += loss.item()
        num_batches += 1

# Calculate average loss and perplexity
average_loss = total_loss / num_batches
perplexity = torch.exp(torch.tensor(average_loss)).item()

print(f"Average Loss: {average_loss:.4f}")
print(f"Perplexity: {perplexity:.4f}")

tensor(7.3183)
tensor(7.3294)
tensor(7.2995)
tensor(7.3135)
tensor(7.2060)
tensor(7.3806)
tensor(7.0278)
tensor(7.2487)
tensor(6.6319)
tensor(7.2249)
tensor(7.6197)
tensor(7.1879)
tensor(7.6788)
tensor(6.9931)
tensor(7.7484)
tensor(7.5299)
tensor(7.2924)
tensor(6.9563)
tensor(7.0148)
tensor(6.7854)
tensor(7.3179)
tensor(7.3432)
tensor(7.3739)
tensor(7.7498)
tensor(7.3745)
tensor(7.3744)
tensor(7.4357)
tensor(7.4930)
tensor(7.2250)
tensor(7.2402)
tensor(7.3346)
tensor(7.7874)
tensor(7.7306)
tensor(7.5272)
tensor(7.0317)
tensor(7.4222)
tensor(6.9020)
tensor(7.4866)
tensor(7.2764)
tensor(7.5234)
tensor(7.1299)
tensor(7.2833)
tensor(7.4841)
tensor(7.3719)
tensor(7.0758)
tensor(7.8136)
tensor(7.0310)
tensor(7.2850)
tensor(7.0177)
tensor(7.0615)
tensor(7.2285)
tensor(7.3028)
tensor(7.4852)
tensor(7.7306)
tensor(7.2781)
tensor(7.2267)
tensor(7.3286)
tensor(7.5175)
tensor(6.9162)
tensor(7.1635)
tensor(7.3636)
tensor(7.1507)
tensor(7.5057)
tensor(7.0711)
tensor(7.1043)
tensor(7.2963)
tensor(7.3

In [15]:
# Load the default GPT-2 Small model and tokenizer
GPTmodel = "gpt2"  # This points to the default GPT-2 Small model
tokenizer = AutoTokenizer.from_pretrained(GPTmodel)
model = AutoModelForCausalLM.from_pretrained(GPTmodel)

# Evaluate the model
model.eval()
total_loss = 0
num_batches = 0
batch_size = 8  # Adjust based on your memory constraints

with torch.no_grad():
    for i in range(0, len(val_dataset), batch_size):
        if(i+batch_size >= len(val_dataset)):
            break
        batch = val_dataset[i:i + batch_size]
        # Get input_ids and attention_mask from the batch
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask'] if 'attention_mask' in batch else None  # Optional

        # Pass input_ids as labels for loss calculation
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels= batch['labels'])
        
        loss = outputs.loss
        print(loss)
        total_loss += loss.item()
        num_batches += 1

# Calculate average loss and perplexity
average_loss = total_loss / num_batches
perplexity = torch.exp(torch.tensor(average_loss)).item()

print(f"Average Loss: {average_loss:.4f}")
print(f"Perplexity: {perplexity:.4f}")



tensor(14.2888)
tensor(14.9782)
tensor(15.1596)
tensor(14.1524)
tensor(14.4816)
tensor(14.5441)
tensor(14.2112)
tensor(14.7085)
tensor(13.4263)
tensor(14.1028)
tensor(15.1642)
tensor(15.2056)
tensor(14.7975)
tensor(13.5777)
tensor(14.9535)
tensor(15.0056)
tensor(15.0133)
tensor(14.6804)
tensor(14.3289)
tensor(14.1136)
tensor(14.9169)
tensor(14.7038)
tensor(14.5362)
tensor(15.1375)
tensor(14.9126)
tensor(14.8563)
tensor(14.3381)
tensor(14.8082)
tensor(14.2995)
tensor(14.7002)
tensor(14.8756)
tensor(15.0656)
tensor(14.5521)
tensor(14.6520)
tensor(14.3047)
tensor(14.7539)
tensor(14.2916)
tensor(14.9533)
tensor(14.4720)
tensor(15.0784)
tensor(14.3151)
tensor(14.1776)
tensor(14.9997)
tensor(14.7585)
tensor(14.1314)
tensor(15.3629)
tensor(14.5348)
tensor(14.7129)
tensor(14.2549)
tensor(14.2690)
tensor(14.9771)
tensor(14.5972)
tensor(14.8759)
tensor(14.3634)
tensor(14.2969)
tensor(14.1811)
tensor(15.1462)
tensor(14.9558)
tensor(14.5929)
tensor(15.2540)
tensor(15.2891)
tensor(14.1834)
tensor(1