In [2]:
import pandas as pd

In [3]:
# Load the raw chatbot training data from the CSV file into a DataFrame.
df = pd.read_csv("train_data_chatbot.csv")

In [4]:
import json
def _to_record(row):
    """Build one instruction/input/output record from a single DataFrame row."""
    prompt = f"You are a medical assistant. Answer the following question based on the topic: {row['tags']}.\n\nQuestion: {row['short_question']}"
    return {"instruction": prompt, "input": "", "output": row["short_answer"]}


# One record per CSV row, in the DataFrame's row order.
formatted_data = [_to_record(row) for _, row in df.iterrows()]

# Despite the .json extension, the file holds one JSON object per line.
with open("formatted_dataset.json", "w") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in formatted_data)

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig
import torch

# Read the line-delimited JSON file back in as a single "train" split.
dataset = load_dataset(
    "json",
    data_files="formatted_dataset.json",
    split="train",
)




  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 47603 examples [00:00, 759819.06 examples/s]


In [None]:
import os

from huggingface_hub import login

# Take the Hugging Face access token from the HF_TOKEN environment variable
# instead of pasting it into the notebook, where it could be saved, shared or
# committed by accident. When the variable is unset or empty, token is None
# and login() falls back to prompting for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [7]:
# Base checkpoint to fine-tune.
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# trust_remote_code is deliberately left at its default (False). The Llama
# architecture ships with transformers, so nothing has to be downloaded from
# the Hub repository and executed locally; enabling the flag would only widen
# the attack surface. If model_id is ever switched to a checkpoint that really
# needs custom code, from_pretrained will say so explicitly.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token so the tokenizer can pad.
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.30s/it]


In [8]:
# Ready the quantized base model for k-bit fine-tuning.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings; adapters target the q_proj and v_proj modules only.
lora_settings = {
    "task_type": "CAUSAL_LM",
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "target_modules": ["q_proj", "v_proj"],
}
lora_config = LoraConfig(**lora_settings)

# Wrap the prepared model with the LoRA configuration defined just above.
model = get_peft_model(model, lora_config)

In [9]:
# Tokenize the dataset
def tokenize(example):
    """Render one record as an instruction/input/response prompt and tokenize it.

    Args:
        example: Mapping providing the ``instruction``, ``input`` and
            ``output`` fields that are interpolated into the prompt.

    Returns:
        The tokenizer output (truncated/padded to 512 tokens) with an added
        ``labels`` entry: a copy of ``input_ids`` in which every position
        whose ``attention_mask`` is 0 (padding) is replaced by ``-100`` so it
        is ignored by the loss.
    """
    text = f"""### Instruction:
{example['instruction']}

### Input:
{example['input']}

### Response:
{example['output']}"""

    result = tokenizer(text, truncation=True, padding="max_length", max_length=512)

    input_ids = result["input_ids"]
    if "attention_mask" in result:
        # Mask by attention_mask rather than by token id: the pad token is the
        # same id as the EOS token in this notebook, so masking by id would
        # also hide genuine EOS tokens.
        # NOTE(review): a collator that rebuilds labels from input_ids (as
        # DataCollatorForLanguageModeling does with mlm=False) overwrites
        # these labels; the masking matters for collators that keep them.
        result["labels"] = [
            token_id if is_real else -100
            for token_id, is_real in zip(input_ids, result["attention_mask"])
        ]
    else:
        # No mask available: fall back to a plain copy of the ids.
        result["labels"] = input_ids.copy()
    return result

# Apply tokenize to every example of the loaded dataset.
tokenized_dataset = dataset.map(function=tokenize)

Map: 100%|██████████| 47603/47603 [00:23<00:00, 2032.18 examples/s]


In [10]:
# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # Checkpointing: where checkpoints go, when they are saved, how many are kept.
    output_dir="./llama3-medchatbot",
    save_strategy="epoch",
    save_total_limit=2,
    # Optimisation: 2 samples per device with 4 gradient-accumulation steps.
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    fp16=True,
    # Logging: report the loss every 10 steps, no external tracker.
    logging_steps=10,
    report_to="none",
)


In [11]:
# Batch collator for language modelling with masked-LM mode switched off.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

In [12]:
# Assemble the Trainer from the model, training arguments, tokenized data and collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [13]:
# Start fine-tuning; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,3.4481
20,3.3006
30,3.3001
40,3.0512
50,3.0908
60,2.9908
70,2.6756
80,2.6828
90,2.5772
100,2.4145


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=17850, training_loss=2.1762168715447605, metrics={'train_runtime': 106735.6338, 'train_samples_per_second': 1.338, 'train_steps_per_second': 0.167, 'total_flos': 1.2384583436835226e+18, 'train_loss': 2.1762168715447605, 'epoch': 2.9995798672380474})