In [1]:
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from pathlib import Path
import kagglehub
import pandas as pd
import itertools
from peft import LoraConfig, get_peft_model
import deepspeed

2024-12-31 13:03:57.347258: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735643037.366618   16596 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735643037.372661   16596 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[2024-12-31 13:03:59,725] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/mikael/anaconda3/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
# Run configuration shared by the cells below.
MODEL_NAME = "microsoft/DialoGPT-small"  # checkpoint id handed to from_pretrained
EPOCHS = 2  # number of passes over the training loader
DEEPSPEED_CONFIG = "deepspeed_config.json"  # only referenced by the disabled DeepSpeed setup

# Use the GPU when one is visible, otherwise fall back to the CPU.
# To force CPU while debugging, set DEVICE = torch.device("cpu") instead.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE.type

'cuda'

In [3]:
# Load the base causal LM with 4-bit quantization (float16 compute dtype).
# NOTE(review): device_map="auto" presumably lets the library place the weights,
# which would explain why the explicit .to(DEVICE) below is disabled — confirm.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
)#.to(DEVICE)
# Left padding: batches are padded at the start of each sequence, which is what
# pad_mask() relies on when it looks for a leading run of pad tokens.
# NOTE(review): truncation=True is normally a call-time tokenizer argument — verify it has any effect here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left", truncation=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Fetch the Kaggle dataset via kagglehub and point at the conversations CSV inside it.
path = Path(kagglehub.dataset_download("artemminiailo/medicalconversations2disease")) / "medical_conversations.csv"
# skiprows=1 drops the first line of the file; column names are supplied explicitly instead.
df = pd.read_csv(path, names=('text', 'label'), skiprows=1)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,text,label
0,User: I’ve been sneezing a lot today and my no...,allergy
1,User: I’ve developed a rash after eating some ...,allergy
2,"User: My eyes are swollen and itchy, and I can...",allergy
3,User: I’ve been getting headaches and a stuffy...,allergy
4,"User: Every time I eat nuts, my mouth itches. ...",allergy


Generate chat templates from raw data.

In [5]:
def generate_template(conversation):
    """Expand one raw conversation string into incremental chat histories.

    The conversation is split on "</s>". Each piece is expected to look like
    "<speaker>: <text>", where the speaker (case-insensitive, surrounding
    whitespace ignored) is "user" or "bot"; "bot" is mapped to the
    "assistant" role. Pieces that contain no ":" are skipped. A fixed system
    prompt is always the first message.

    Args:
        conversation: raw conversation text with turns separated by "</s>".

    Returns:
        A list with one entry per assistant message. Each entry is the list
        of message dicts ({"role", "content"}) from the system prompt up to
        and including that assistant message.

    Raises:
        KeyError: if a speaker prefix is neither "user" nor "bot".
    """
    speaker_to_role = {"user": "user", "bot": "assistant"}
    system_message = {
        "role": "system",
        "content": "You are an empathetic doctor chatbot that asks follow-up questions about the patient's symptoms and explains very briefly what the health problem might be.",
    }

    history = [system_message]
    for chunk in conversation.split("</s>"):
        # Split on the FIRST colon only, so colons inside the text are kept.
        speaker, colon, text = chunk.partition(":")
        if not colon:
            continue
        history.append({
            "role": speaker_to_role[speaker.lower().strip()],
            "content": text.strip(),
        })

    # One training sample per assistant turn: everything up to and including it.
    return [
        history[:end]
        for end, message in enumerate(history, start=1)
        if message["role"] == "assistant"
    ]

In [6]:
def pad_mask(input_ids):
    """Build a 0/1 mask that zeroes the leading run of padding tokens.

    Only pad tokens at the START of the sequence are masked. Because the pad
    token is the same id as the end-of-sequence token, later occurrences of
    that id are real content and stay unmasked.

    Args:
        input_ids: 1-D sequence of token ids (tensor or list).

    Returns:
        A 1-D long tensor of the same length: 0 for each leading pad token,
        1 from the first non-pad token onwards. The result lives on the same
        device as the input.
    """
    ids = torch.as_tensor(input_ids)
    # The running count of non-pad tokens is 0 across the leading pad run and
    # positive from the first real token on. This replaces a Python-level loop
    # over individual tensor elements with one vectorized pass.
    seen_content = (ids != tokenizer.pad_token_id).long().cumsum(dim=0) > 0
    return seen_content.long()

def generate_training_data(chat_templates):
    """Tokenize chat histories into next-token-prediction training tensors.

    Every history is rendered and tokenized in full to form the model inputs.
    The labels are built from the last message of each history only, padded
    to the same length as the full sequences, with the padded positions
    replaced by -100. Inputs drop their last token and labels drop their
    first, so position t of the labels lines up with the token that follows
    position t of the inputs.

    Args:
        chat_templates: list of chat histories (lists of message dicts), as
            produced by generate_template.

    Returns:
        A dict with 'input_ids', 'attention_mask' and 'labels' tensors, all
        moved to the CPU.
    """
    full_ids = tokenizer.apply_chat_template(chat_templates, padding=True, return_tensors="pt")
    padded_length = full_ids.shape[1]

    # Model inputs: every token except the final one.
    model_inputs = full_ids[:, :-1]
    input_mask = torch.stack([pad_mask(row) for row in model_inputs])

    # Targets: just the last message of each history, padded out to the
    # length of the full sequences so both tensors have matching widths.
    final_messages = [[history[-1]] for history in chat_templates]
    target_ids = tokenizer.apply_chat_template(
        final_messages,
        padding="max_length",
        max_length=padded_length,
        return_tensors="pt"
    )
    target_mask = torch.stack([pad_mask(row) for row in target_ids])
    # -100 marks positions that must not contribute to the loss
    # (it is the default ignore_index of nn.CrossEntropyLoss).
    targets = torch.where(target_mask == 1, target_ids, -100)
    # Drop the first column so the targets are shifted one step ahead of the inputs.
    shifted_targets = targets[:, 1:]

    return {
        'input_ids': model_inputs.cpu(),
        'attention_mask': input_mask.cpu(),
        'labels': shifted_targets.cpu(),
    }

In [7]:
# Smoke test of the preprocessing on a tiny sample.
# Only the first five histories are kept; a later cell reads chat_templates[0] to build a prompt.
chat_templates = list(itertools.chain(*map(generate_template, df['text'])))[:5] # chain flattens 2d list to 1d

# NOTE(review): `template` is not read again in the visible cells — candidate for removal.
template = chat_templates[:2]
res = generate_training_data(chat_templates[:2])
# The three tensors are expected to share one shape (batch, sequence length - 1).
print(res['input_ids'].shape, res['attention_mask'].shape, res['labels'].shape)
# Decoding the inputs shows the left padding and the rendered conversation text.
print('input_ids', tokenizer.batch_decode(res['input_ids']))
# print('labels', [tokenizer.batch_decode(res['labels'][i, :]) for i in range(2)])
print('mask', res['attention_mask'])
print('ids', res['input_ids'])
print('lbl', res['labels'])

torch.Size([2, 106]) torch.Size([2, 106]) torch.Size([2, 106])
input_ids ["<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>You are an empathetic doctor chatbot that asks follow-up questions about the patient's symptoms and explains very briefly what the health problem might be.<|endoftext|>I’ve been sneezing a lot today and my nose feels congested.<|endoftext|>That sounds like it could be an allergy. Do you know what might be triggering it?", "You are an empathetic doctor chatbot that asks follow-up questions about the patient's symptoms and explains very brief

In [8]:
def calculate_loss(logits, labels):
    """Mean cross-entropy between per-position logits and target token ids.

    All leading dimensions are flattened, so `logits` of shape (..., classes)
    is paired with `labels` of shape (...). Positions whose label is -100 are
    ignored and do not count towards the mean.

    Args:
        logits: float tensor whose last dimension indexes the classes.
        labels: integer tensor with one class index per logits position;
            may be non-contiguous (e.g. a column slice of a larger tensor).

    Returns:
        A scalar tensor holding the mean loss over the non-ignored positions.
        If every label is -100 the mean is taken over nothing and is NaN.
    """
    loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
    # reshape instead of view: view raises on non-contiguous tensors such as
    # labels produced by slicing off a column, reshape copies when it has to.
    flat_logits = logits.reshape(-1, logits.size(-1))
    flat_labels = labels.reshape(-1)
    return loss_fn(flat_logits, flat_labels)

In [9]:
class Conversations(Dataset):
    """Dataset of tokenized chat histories built from a conversations frame.

    Every row of df['text'] is expanded into its incremental chat histories,
    and all histories are tokenized together once at construction time.
    Each item is ((input_ids, attention_mask), labels) for one history.
    """

    def __init__(self, df):
        # One list of histories per conversation, flattened into a single list.
        per_conversation = map(generate_template, df['text'])
        flat_histories = list(itertools.chain.from_iterable(per_conversation))
        self.sequences = generate_training_data(flat_histories)

    def __len__(self):
        """Number of tokenized histories."""
        input_ids = self.sequences['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        """Return ((input_ids, attention_mask), labels) for history `idx`."""
        tensors = self.sequences
        features = (tensors['input_ids'][idx], tensors['attention_mask'][idx])
        return features, tensors['labels'][idx]

In [10]:
# input_ids = training_data['input_ids'][0].unsqueeze(0).to(DEVICE)
# mask = training_data['attention_mask'][0].unsqueeze(0).to(DEVICE)
# labels = training_data['labels'][0].to(DEVICE)
# logits = model(input_ids, attention_mask=mask).logits

In [11]:
# Show the module tree of the loaded model; the layer names printed here
# (e.g. c_attn, c_proj) are the ones the LoRA configuration targets.
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_a

In [12]:
# Attach LoRA adapters to the quantized base model.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    # Module names as they appear in the printed model. 'c_proj' occurs in both
    # the attention and the MLP blocks — presumably both are matched; confirm.
    target_modules=['c_attn', 'c_proj']
)
# NOTE(review): this rebinds `model` to the wrapped model, so re-running the cell
# feeds the already-wrapped model back into get_peft_model. Reload the base model first.
model = get_peft_model(model, lora_config)
# Report how many parameters the adapters leave trainable.
model.print_trainable_parameters()

trainable params: 1,622,016 || all params: 126,061,824 || trainable%: 1.2867


In [13]:
# Tokenize the whole frame once and batch it for training.
dataset = Conversations(df)
# shuffle=True: consecutive samples are growing prefixes of the same
# conversation, so unshuffled batches would be made of near-duplicates and
# every epoch would see them in the same order.
loader = DataLoader(dataset, batch_size=8, shuffle=True)
# Parameters without gradients (the frozen base weights) are skipped by the optimizer step.
optimizer = AdamW(model.parameters(), lr=1e-4)
# DeepSpeed alternative (disabled):
# model_engine, optimizer, _, _ = deepspeed.initialize(
#     model=model,
#     model_parameters=model.parameters(),
#     config_params=DEEPSPEED_CONFIG,
# )

In [None]:
# Fine-tuning loop: one optimizer step per batch, mean batch loss reported per epoch.
for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}\n-------------------------------")
    model.train()
    total_loss = 0
    num_batches = len(loader)
    for i, ((input_ids, attention_masks), labels) in enumerate(loader):
        # '\r' moves back to the start of the line so the counter updates in
        # place. No newline is printed here, otherwise every batch would add
        # its own line to the cell output.
        print(f'Batch {i+1}/{num_batches}', end='\r')
        # outputs = model_engine(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
        outputs = model(input_ids=input_ids.to(DEVICE), attention_mask=attention_masks.to(DEVICE))
        # Labels are already shifted by one position relative to the inputs,
        # so the logits can be compared to them directly.
        loss = calculate_loss(outputs.logits, labels.to(DEVICE))
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # model_engine.backward(loss)
        # model_engine.step()

        # NOTE(review): releasing the allocator cache on every step slows
        # training; kept as-is in case it was added to work around memory limits.
        torch.cuda.empty_cache()

    # Terminate the in-place progress line before printing the epoch summary.
    print()
    train_loss = total_loss / num_batches
    print(f"Train loss: {train_loss:.4f}")


Epoch 1
-------------------------------
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check
check


KeyboardInterrupt: 

In [None]:
# Build an inference prompt from the first chat history with its final
# (assistant) message removed, so the model has to produce that reply itself.
# tokenize=False asks for the rendered prompt text rather than token ids.
# NOTE(review): with tokenize=False the padding, padding_side and return_tensors
# arguments presumably have no effect on the result — confirm and drop them.
text = tokenizer.apply_chat_template(
    chat_templates[0][:-1],
    add_generation_prompt=True,
    tokenize=False,
    padding=True,
    padding_side='left',
    return_tensors='pt'
)
# Tokenize the rendered prompt as a batch of one and move the tensors to DEVICE.
model_inputs = tokenizer([text], return_tensors="pt").to(DEVICE)

In [None]:
# Switch to inference mode first: the training loop leaves the model in
# train() mode, where the dropout layers (base model and LoRA) stay active
# and randomly perturb generation.
model.eval()
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=50
)
# The generated sequences start with the prompt tokens; slice them off so
# only the newly generated continuation is decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

# Decode the single sequence in the batch and show it as the cell output.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response

"<|im_start|>system\nYou are an empathetic doctor chatbot that asks follow-up questions about the patient's symptoms and explains very briefly what the health problem might be.<|im_end|>\n<|im_start|>user\nI’ve been sneezing a lot today and my nose feels congested.<|im_end|>\n<|im_start|>assistant\n"

In [None]:
# Scratch check: size(-1) / shape[-1] is the length of the LAST dimension,
# which is what calculate_loss uses to find the number of classes.
t = torch.arange(1, 4)                 # tensor([1, 2, 3])
t2 = torch.arange(1, 7).reshape(3, 2)  # tensor([[1, 2], [3, 4], [5, 6]])
print(t2.shape[-1])

2
