In [1]:
%%capture
# Install the released Unsloth package (the %%capture magic above hides pip's output).
!pip install unsloth
# Also get the latest nightly Unsloth!
# --no-deps means only Unsloth itself is reinstalled from the GitHub repository;
# the dependencies installed by the previous command are left as they are.
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
# NOTE(review): no visible cell imports mlflow, and the recorded training log contains
# an mlflow async-logging error — confirm this install is actually wanted.
!pip install mlflow

In [2]:
from unsloth import FastLanguageModel
import torch
# Maximum sequence length; passed to both the model loader and the SFT trainer.
max_seq_length = 2048 
# Forwarded as `dtype` when loading the model.
# NOTE(review): None presumably lets Unsloth choose the precision for the GPU — confirm.
dtype = None 
# Forwarded as `load_in_4bit` when loading the model.
load_in_4bit = True 

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Read the Hugging Face access token from Kaggle's secret store and use it to
# authenticate this session with the Hub.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token=hf_token)

In [4]:
import wandb

# Authenticate with Weights & Biases using the API key stored under the Kaggle
# secret named "wandb", then open a run for this training session.
wb_token = user_secrets.get_secret("wandb")
wandb.login(key=wb_token)

run = wandb.init(
    project="Fine-tune-gemma-7b on Medical COT Dataset",
    job_type="training",
    anonymous="allow",
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjanvi24[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Loading the model and tokenizer

In [30]:
# Load the base checkpoint and its tokenizer from the attached Kaggle input
# directory, using the sequence length / dtype / 4-bit settings defined earlier.
base_model_kwargs = dict(
    model_name="/kaggle/input/gemma/transformers/7b/2",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

==((====))==  Unsloth 2025.2.12: Fast Gemma patching. Transformers: 4.49.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Unsloth: Will load /kaggle/input/gemma/transformers/7b/2 as a legacy tokenizer.


## Model inference before fine-tuning

In [6]:
# Inference-time prompt template with two `{}` slots: the first receives the
# question, the second sits directly after the opening `<think>` tag. The
# inference cells fill the second slot with an empty string so that generation
# continues from the `<think>` tag.
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question. 

### Question:
{}

### Response:
<think>{}"""

In [7]:
import time
# Baseline check before fine-tuning: time a single generation on one sample
# clinical question and print whatever follows the "### Response:" marker.
s = time.time()
question = "A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?"

# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# The second template slot (after <think>) is left empty for the model to fill in.
prompt_text = prompt_style.format(question, "")
inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=1200,
    use_cache=True,
)

response = tokenizer.batch_decode(outputs)
decoded_first = response[0]
# The decoded text still contains the prompt, so keep only the part after the marker.
print(decoded_first.split("### Response:")[1])
print("Time: ", time.time() - s)


<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>
<think>

## LoRA config

In [33]:
# Wrap the loaded model with LoRA adapters (rank 16, alpha 32, no dropout,
# no bias terms) on the projection layers listed below.
lora_target_modules = [
    # attention projections
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    # MLP projections
    "gate_proj",
    "up_proj",
    "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)

## Loading and processing the dataset

In [9]:
# Training-time prompt template with three `{}` slots, filled in order by
# `formatting_prompts_func`: the question, the chain of thought (placed between
# the <think> and </think> tags) and the final response.
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question. 

### Question:
{}

### Response:
<think>
{}
</think>
{}"""

In [10]:
# End-of-sequence string taken from the loaded tokenizer; it is appended to every
# formatted training example.
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """Render a batch of dataset rows into complete training prompts.

    Args:
        examples: Batched mapping holding the parallel sequences
            ``"Question"``, ``"Complex_CoT"`` and ``"Response"``.

    Returns:
        A dict with the single key ``"text"``: one string per row, produced by
        filling ``train_prompt_style`` with the question, chain of thought and
        response (in that order) and appending ``EOS_TOKEN``.
    """
    rows = zip(examples["Question"], examples["Complex_CoT"], examples["Response"])
    return {
        "text": [
            train_prompt_style.format(query, reasoning, answer) + EOS_TOKEN
            for query, reasoning, answer in rows
        ],
    }

In [17]:
from datasets import load_dataset
# Load the "en" configuration of the medical reasoning SFT dataset and hold out
# 20% of it for evaluation.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# dataset repository — confirm it is actually required for this dataset.
dataset = load_dataset(
    "FreedomIntelligence/medical-o1-reasoning-SFT",
    "en",
    split="train",
    trust_remote_code=True,
)
# Fixed seed so the train/test partition is the same on every run; without it a
# re-run could place previously trained-on rows into the evaluation split.
dataset = dataset.train_test_split(test_size=0.2, seed=3407)
train_dataset = dataset["train"]

In [22]:
# Add a "text" column containing the fully formatted prompt for every training
# row, then display the first formatted example as a sanity check.
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
train_dataset[0]["text"]

Map:   0%|          | 0/20296 [00:00<?, ? examples/s]

"Below is an instruction that describes a task, paired with an input that provides further context. \nWrite a response that appropriately completes the request. \nBefore answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n\n### Instruction:\nYou are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \nPlease answer the following medical question. \n\n### Question:\nDescribe the work cycle and identify its four human factors components that are necessary to complete a task.\n\n### Response:\n<think>\nAlright, let's think about the whole idea of getting a task done in terms of a work cycle and what human elements come into play. We start by figuring out what the task actually is, obviously. Then there's the planning, which is like coming up with a strategy or a roadmap. The execution phase is where the rubber meets the road, and finally, we wrap it up by eva

In [36]:
# Keep at most the first 8,000 formatted examples for training. The min() guard
# keeps the cell from failing when fewer rows are available (for example if the
# cell is re-run on an already-reduced dataset or the split size changes).
train_dataset = train_dataset.select(range(min(8000, len(train_dataset))))

## Setting up the training arguments and trainer

In [38]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode once: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation, logging, checkpointing and Hub-upload settings for the fine-tuning run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 sequences per optimizer step on one GPU
    num_train_epochs=1,
    # -1 (the default) lets num_train_epochs decide the run length. A positive
    # value overrides the epoch setting; the previous max_steps=5 equalled
    # warmup_steps, so the whole run was warmup. Set a small positive value
    # only for a quick smoke test.
    max_steps=-1,
    warmup_steps=5,  # consider warmup_ratio instead for full training runs
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    output_dir="outputs",
    save_steps=400,
    push_to_hub=True,
    hub_model_id="gemma-7b-mediqa",
    hub_token=hf_token,
    # The sentinel that disables every logging integration is lowercase "none";
    # the string "None" is treated as the name of an (unknown) integration.
    # Use "wandb" here instead to log to a Weights & Biases run.
    report_to="none",
)

# Supervised fine-tuning trainer over the "text" column of the formatted dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)

Tokenizing train dataset (num_proc=2):   0%|          | 0/8000 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/8000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Model training

In [40]:
# Run the fine-tuning loop; whatever `train()` returns is kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 8,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 50,003,968
2025/02/19 07:43:37 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id 29cbf116b5504eba9af38f3b83889ca6: Failed to log run data: Exception: Changing param values is not allowed. Param with key='logging_dir' was already logged with value='outputs/runs/Feb19_07-35-11_c7f8f5a84906' for run ID='29cbf116b5504eba9af38f3b83889ca6'. Attempted logging new value 'outputs/runs/Feb19_07-42-46_c7f8f5a84906'.


Step,Training Loss
100,1.2641
200,1.232
300,1.2089
400,1.1979
500,1.196
600,1.1785
700,1.1834
800,1.18
900,1.1714
1000,1.156


In [43]:
# Upload the fine-tuned model, then the tokenizer, to the same Hub repository.
hub_repo_name = "gemma-7b-mediqa"
model.push_to_hub(hub_repo_name)
tokenizer.push_to_hub(hub_repo_name)

Saved model to https://huggingface.co/gemma-7b-mediqa


In [44]:
# Upload a merged 16-bit copy of the model (save_method "merged_16bit") together
# with the tokenizer to the "gemma-7b-mediqa-final" repository.
# NOTE(review): no token is passed here, so this presumably relies on the earlier
# Hub login — confirm.
model.push_to_hub_merged("gemma-7b-mediqa-final", tokenizer, save_method = "merged_16bit")

Unsloth: You have 2 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 14.76 out of 31.35 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 29%|██▊       | 8/28 [00:00<00:01, 10.55it/s]
We will save to Disk and not RAM now.
100%|██████████| 28/28 [00:30<00:00,  1.07s/it]


Unsloth: Saving to organization with address JC-24/gemma-7b-mediqa-final
Unsloth: Saving tokenizer... Done.
Unsloth: Saving to organization with address JC-24/gemma-7b-mediqa-final
Unsloth: Saving JC-24/gemma-7b-mediqa-final/pytorch_model-00001-of-00004.bin...
Unsloth: Saving JC-24/gemma-7b-mediqa-final/pytorch_model-00002-of-00004.bin...
Unsloth: Saving JC-24/gemma-7b-mediqa-final/pytorch_model-00003-of-00004.bin...
Unsloth: Saving JC-24/gemma-7b-mediqa-final/pytorch_model-00004-of-00004.bin...
Unsloth: Uploading all files... Please wait...


  0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/None/gemma-7b-mediqa-final


In [45]:
# Close the Weights & Biases run opened by `wandb.init(...)` near the top of the
# notebook so that its history and summary are flushed. Calling this when no run
# is active is harmless.
wandb.finish()

0,1
train/epoch,▁▂▃▃▄▅▆▆▇██
train/global_step,▁▂▃▃▄▅▆▆▇██
train/grad_norm,▁▁▂▁▂▁▂█▃▁
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▂▃▃▂▁

0,1
total_flos,3.132589342937825e+17
train/epoch,1.0
train/global_step,1000.0
train/grad_norm,1.08209
train/learning_rate,0.0
train/loss,1.156
train_loss,1.19681
train_runtime,26219.4105
train_samples_per_second,0.305
train_steps_per_second,0.038


## Model Inference

In [None]:
# Switch the fine-tuned model into Unsloth's inference mode before the
# generation cells that follow.
FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!

In [46]:
import time
# Post-fine-tuning check: time one generation on a sample clinical question and
# print whatever follows the "### Response:" marker.
s = time.time()

question = "A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?"

# The second template slot (after <think>) is left empty for the model to fill in.
prompt_text = prompt_style.format(question, "")
inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=1200,
    use_cache=True,
)

response = tokenizer.batch_decode(outputs)
decoded_first = response[0]
# The decoded text still contains the prompt, so keep only the part after the marker.
print(decoded_first.split("### Response:")[1])
print("\nLatency: ", time.time() - s)


<think>
Alright, let's think about this. We've got a 61-year-old woman who's been dealing with involuntary urine loss, especially when she coughs or sneezes. But she doesn't have any issues at night, which is interesting.

So, what's going on here? This sounds like stress urinary incontinence, which is pretty common in women her age. It's usually because the muscles that hold the bladder in place aren't doing their job well.

Now, she had a gynecological exam and a Q-tip test. The Q-tip test is a quick way to check if the urethra is open, and it's usually done to rule out other issues like urethral strictures.

Given her symptoms and the test results, it seems like stress urinary incontinence is the main suspect here. So, what would a cystometry test show?

Cystometry is a test that helps us understand how the bladder is working. It measures the pressure inside the bladder and tracks how much urine is left behind after a normal void.

In stress urinary incontinence, the bladder usuall

In [48]:
# Held-out 20% split produced by `train_test_split`, used for spot-checking the model.
# NOTE(review): the name is misspelled ("datset"); it is kept as-is because the
# following cell reads it under this exact name.
eval_datset = dataset["test"]

In [51]:
# Compare the fine-tuned model's answer with the reference answer for the first
# held-out example.
first_example = eval_datset[0]
question = first_example["Question"]
actual_res = first_example["Response"]

print("Question: ", question, "\n\n")

# The second template slot (after <think>) is left empty for the model to fill in.
prompt_text = prompt_style.format(question, "")
inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=1200,
    use_cache=True,
)

response = tokenizer.batch_decode(outputs)
model_answer = response[0].split("### Response:")[1]
print("\n\nModel Response:", model_answer)
print("\n------------------------------------------------------------------------\n")
print("\n\nActual Response:", actual_res)

Question:  A 6-year-old boy is brought to the physician by his father for an annual health maintenance examination. His father notes that he has several pigmented areas on his skin and a few fleshy bumps. He has also had some blurred vision in his left eye. He has no history of serious medical illness. He lives at home with both parents and is up-to-date on all his immunizations. His father has similar skin findings. His mother has epilepsy and glaucoma. Vital signs are within normal limits. Visual acuity testing shows 20/50 in the left eye and 20/20 in the right eye. Slit-lamp examination shows pigmented iris nodules. Examination of his skin shows eight brownish macules and numerous soft, non-tender, pedunculated lesions on the back, chest, and abdomen. Which of the following is the most appropriate next step in management?
A. CT scan of the brain
B. MRI of the brain
C. Electroencephalogram
D. B-scan ultrasound
E. Gonioscopy 



<think>
Alright, let's think about this. We've got a 6-y