In [3]:
# import necessary packages
import sys, os
import torch 
import numpy as np
import evaluate
from trl import SFTTrainer, setup_chat_format
from transformers import (pipeline,
                          AutoTokenizer,
                          AutoModelForCausalLM,
                          DataCollatorWithPadding,
                          get_scheduler)
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from IPython.display import clear_output

sys.path.append('../')

# custom imports
from utils.GetLowestGPU import GetLowestGPU

device = GetLowestGPU()

Device set to cuda:2


In [4]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Instantiate Model and Dataset

In [5]:
# options
model_path = "meta-llama/Meta-Llama-3-8B-Instruct"
dataset_path = "ruslanmv/ai-medical-chatbot" #test dataset

# load tokenizer and model
pipeline = pipeline(
    "text-generation",
    model=model_path,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

pipeline.tokenizer.pad_token = pipeline.tokenizer.eos_token
pipeline.model.generation_config.pad_token_id = pipeline.tokenizer.eos_token_id

# pipeline.model, pipeline.tokenizer = setup_chat_format(pipeline.model, pipeline.tokenizer)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# load dataset
raw_dataset = load_dataset(dataset_path, split = 'train[:1%]')

# check format of data
raw_dataset = raw_dataset.train_test_split(test_size=0.1)
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['Description', 'Patient', 'Doctor'],
        num_rows: 2312
    })
    test: Dataset({
        features: ['Description', 'Patient', 'Doctor'],
        num_rows: 257
    })
})

# Preprocessing

In [7]:
# preprocess data
# def format_chat(row):
#     row_json_inp = [{'role': 'system', 'content' : 'you are a helpful medical chatbot'},
#                     {'role': 'user', 'content': row["Patient"]}]
#     row_json_out = [{'role': 'assistant', 'content': row["Doctor"]}]
#     row["user"] = pipeline.tokenizer.apply_chat_template(row_json_inp, tokenize=False)
#     row["assistant"] = pipeline.tokenizer.apply_chat_template(row_json_out, tokenize=False)
#     return row

def preprocess_data(examples):
    inp = examples["Patient"]
    out = examples["Doctor"]
    tokenized_data = pipeline.tokenizer(text=inp, 
                               text_target=out,
                               padding='max_length', 
                               truncation=True, 
                               max_length=100)
    return tokenized_data

In [8]:
chat_dataset = raw_dataset.map(format_chat)
chat_dataset['test'][0]

NameError: name 'format_chat' is not defined

In [9]:
# add special tokens to tokenizer
tokenized_dataset = raw_dataset.map(preprocess_data, 
                                    batched=True,
                                    remove_columns=raw_dataset['train'].column_names)
tokenized_dataset.with_format("torch")

# check tokenized dataset output
tokenized_dataset



Map:   0%|          | 0/2312 [00:00<?, ? examples/s]

Map:   0%|          | 0/257 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2312
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 257
    })
})

# Create Dataloaders

In [10]:
# instantiate data collator
data_collator = DataCollatorWithPadding(tokenizer=pipeline.tokenizer)

# options
batch_size = 1

train_dataloader = DataLoader(tokenized_dataset['train'],
                              batch_size=batch_size, 
                              collate_fn=data_collator)

val_dataloader = DataLoader(tokenized_dataset['test'],
                            batch_size=batch_size,
                            collate_fn=data_collator)

In [11]:
# inspect sample batch
batch = next(iter(train_dataloader))
{key: val.shape for key, val in batch.items()}

{'input_ids': torch.Size([1, 100]),
 'attention_mask': torch.Size([1, 100]),
 'labels': torch.Size([1, 100])}

In [12]:
outputs = pipeline.model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(14.8656, grad_fn=<ToCopyBackward0>) torch.Size([1, 100, 128256])


In [13]:
# test pre training
text = [{'role': 'system', 'content': 'you are a helpful medical chatbot'},
        {'role': 'user', 'content': 'I have a headache. What should I do?'}]
print(pipeline(text, max_length=100, truncation=True)[0]['generated_text'])

[{'role': 'system', 'content': 'you are a helpful medical chatbot'}, {'role': 'user', 'content': 'I have a headache. What should I do?'}, {'role': 'assistant', 'content': "Sorry to hear that you're experiencing a headache! As a helpful medical chatbot, I'd be happy to guide you through some steps to help alleviate your discomfort.\n\nFirst, let's try to identify the type of headache you're experiencing:\n\n1. Is it a sharp, stabbing pain or a dull ache?\n2. Is it located on"}]


# Training

In [14]:
# options
optimizer = AdamW(pipeline.model.parameters(), lr=1e-5)
num_epochs = 1

# test after training
text = [{'role': 'system', 'content': 'You are a helpful medical chatbot'},
        {'role': 'user', 'content': 'I have a migraine. What should I do?'}]

# loop
for epoch in range(num_epochs):
    
    print("=====================")
    print(f"Epoch {epoch + 1}")
    print("=====================")

    # set model to train mode
    pipeline.model.train()

    # initialize train loss, val loss
    running_train_loss = 0.0

    # loop through train data
    print("Training...")
    i = 0
    for batch in tqdm(train_dataloader):

        # grab batch and map to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward pass
        outputs = pipeline.model(**batch)
        loss = outputs.loss
        print(f"batch loss: {loss:.4f}\r", end="")

        running_train_loss += loss.item()

        # backward pass
        loss.backward()

        # update optimizer
        optimizer.step()

        # zero gradients
        optimizer.zero_grad()
        
        i += 1

        # if i % 10 == 0:
        print(pipeline(text, max_length=100, truncation=True)[0]['generated_text'])
            

    train_loss = running_train_loss / len(train_dataloader)
    print(f"Avg. Train Loss: {train_loss:.4f}")
        #   , Avg. Val Loss: {val_loss}")
    # print("Evaluation metrics:", metric.compute())

print("Training Complete!")

Epoch 1
Training...


  0%|          | 0/2312 [00:00<?, ?it/s]

batch loss: 14.8656

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU  has a total capacity of 39.39 GiB of which 68.12 MiB is free. Process 1970489 has 2.46 GiB memory in use. Process 2126839 has 4.62 GiB memory in use. Process 2613379 has 416.00 MiB memory in use. Process 2648541 has 26.04 GiB memory in use. Including non-PyTorch memory, this process has 5.76 GiB memory in use. Of the allocated memory 5.16 GiB is allocated by PyTorch, and 95.61 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Prediction

In [15]:
# test after training
text = [{'role': 'system', 'content': 'You are a helpful medical chatbot'},
        {'role': 'user', 'content': 'I have a migraine. What should I do?'}]
print(pipeline(text, max_length=100, truncation=True)[0]['generated_text'])

[{'role': 'system', 'content': 'You are a helpful medical chatbot'}, {'role': 'user', 'content': 'I have a migraine. What should I do?'}, {'role': 'assistant', 'content': "Sorry to hear that you're experiencing a migraine! As a helpful medical chatbot, I'd be happy to guide you through some steps to help alleviate your symptoms.\n\n**Immediate Relief:**\n\n1. **Stay calm**: Take a few deep breaths, and try to relax. This can help reduce stress, which can exacerbate migraines"}]
