# Step 1: Environment Setup

In [1]:
import os
os.environ["FLASH_ATTENTION_FORCE_DISABLED"] = "1"
os.environ["DISABLE_TRITON"] = "1"


In [2]:
%%capture

!pip install unsloth # install unsloth

In [3]:
!pip install transformers==4.51.3 trl==0.8.6 bitsandbytes accelerate --no-deps --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25h

### Verify GPU

In [4]:
!nvidia-smi # verify GPU

Mon May 19 08:47:19 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P0             27W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Install Relevent Packages

In [5]:

# Modules for fine-tuning
from unsloth import FastLanguageModel
import torch # Import PyTorch
from trl import SFTTrainer # Trainer for supervised fine-tuning (SFT)
from unsloth import is_bfloat16_supported # Checks if the hardware supports bfloat16 precision
# Hugging Face modules
from huggingface_hub import login # Lets you login to API
from transformers import TrainingArguments # Defines training hyperparameters
from datasets import load_dataset # Lets you load fine-tuning datasets
# Import weights and biases
import wandb
# Import kaggle secrets
from kaggle_secrets import UserSecretsClient

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-05-19 08:47:31.169630: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747644451.358664      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747644451.409215      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


# Step 2: Dataset Preparation

In [6]:
import pandas as pd
from datasets import Dataset

# Load dataset from Hugging Face
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en")

# Convert to pandas DataFrame
df = pd.DataFrame(dataset["train"])

# Check the column names (optional debug)
print("Columns:", df.columns)
print(df.head(2))



README.md:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19704 [00:00<?, ? examples/s]

Columns: Index(['Question', 'Complex_CoT', 'Response'], dtype='object')
                                            Question  \
0  Given the symptoms of sudden weakness in the l...   
1  A 33-year-old woman is brought to the emergenc...   

                                         Complex_CoT  \
0  Okay, let's see what's going on here. We've go...   
1  Okay, let's figure out what's going on here. A...   

                                            Response  
0  The specific cardiac abnormality most likely t...  
1  In this scenario, the most likely anatomical s...  


## Combine columns into a formatted prompt-response format

In [7]:
# Combine columns into a formatted prompt-response format
def format_example(row):
    return {
        "text": f"### Question:\n{row['Question']}\n\n### Reasoning:\n{row['Complex_CoT']}\n\n### Answer:\n{row['Response']}"
    }

formatted_data = df.apply(format_example, axis=1)
formatted_df = pd.DataFrame(formatted_data.tolist())


## Split dataset

In [8]:
# Split dataset
val_df = formatted_df.sample(n=100, random_state=42)
train_df = formatted_df.drop(val_df.index)

# Convert to Hugging Face datasets format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Display example
print(train_dataset[0])

{'text': "### Question:\nGiven the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings?\n\n### Reasoning:\nOkay, let's see what's going on here. We've got sudden weakness in the person's left arm and leg - and that screams something neuro-related, maybe a stroke?\n\nBut wait, there's more. The right lower leg is swollen and tender, which is like waving a big flag for deep vein thrombosis, especially after a long flight or sitting around a lot.\n\nSo, now I'm thinking, how could a clot in the leg end up causing issues like weakness or stroke symptoms?\n\nOh, right! There's this thing called a paradoxical embolism. It can happen if there's some kind of short circuit in the heart - like a hole that shouldn't be there.\n\nLet's put this together: if a blood clot from the leg somehow trave

# Step 3: Load LLaMA 3.2 (3B) & Set Fine-Tuning Strategy Using Unsloth

## 1. Load the Model (4-bit, with LoRA)

In [9]:
from kaggle_secrets import UserSecretsClient
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# Load Hugging Face token securely from Kaggle secrets
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_Tokens")
wandb_token = user_secrets.get_secret("wnb")


# Log in to Weights & Biases
import wandb
wandb.login(key=wandb_token)

# Load base model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",  
    max_seq_length = 2048,
    dtype = None,     # Let Unsloth choose the best dtype (float16, etc.)
    load_in_4bit = True,
    token = hf_token,
)

# Prepare model for training
FastLanguageModel.for_training(model,
    use_gradient_checkpointing = True,
)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mimansha752[0m ([33mimansha752-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


==((====))==  Unsloth 2025.5.6: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,)

## 2. Prepare the Model for Training with LoRA

In [10]:
# Now apply PEFT (LoRA)
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,                 # LoRA Rank
    lora_alpha = 32,        # LoRA Scaling factor
    lora_dropout = 0.0,    # Dropout
    bias = "none"           # No bias tuning
)

Unsloth 2025.5.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


## 3. Tokenize the Dataset

In [11]:
def tokenize(example):
    tokenized = tokenizer(
        example["text"],
        truncation = True,
        padding = "max_length",
        max_length = 2048
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

In [12]:
train_dataset = train_dataset.map(tokenize)
val_dataset = val_dataset.map(tokenize)

Map:   0%|          | 0/19604 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

## 4. Set Training Arguments

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir = "llama3-medical-finetuning",  # Where the model checkpoints will be saved
    per_device_train_batch_size = 2,  # Effective batch size = 2 * gradient_accumulation_steps
    gradient_accumulation_steps = 2,  # Accumulates gradients for more stable training
    max_steps = 60,  # Small number for quick test run
    logging_steps = 1,  # Logs every step for debugging
    save_steps = 10,  # Saves model every 10 steps
    learning_rate = 2e-4,  # A good starting point for PEFT
    num_train_epochs = 1,  # Will be overridden if max_steps is reached first
    fp16 = True,  # You can turn this ON if you want mixed-precision on Colab Pro/Pro+ GPUs
    optim = "adamw_torch",  # Preferable over "paged_adamw_32bit" if that caused issues
    lr_scheduler_type = "cosine",  # Smooth learning rate curve
    warmup_steps = 5,  # Start with low LR for stability
    report_to = "wandb",  # Disable W&B
)

## 5. formatting_func for Your Dataset

In [14]:
print(val_df.columns.tolist())


['text']


In [15]:
print(val_df["text"].iloc[0])


### Question:
A 24-year-old woman has progressively worsening episodes of severe, crampy abdominal pain, nonbloody diarrhea, mild abdominal distension, and a perianal fistula draining stool. Immunohistochemistry reveals dysfunction of the nucleotide oligomerization binding domain 2 (NOD2) protein. Which immunological protein is most likely overactive due to this dysfunction?

### Reasoning:
Hmm, a 24-year-old woman is experiencing these terrible stomach issues, like severe cramps and diarrhea, plus she's got a perianal fistula that's really uncomfortable. It sounds like a tough situation.

From what I know, these symptoms kinda fit the description of Crohn's disease, doesn't it? Crohn's is one of those inflammatory bowel diseases where the immune system kinda goes haywire.

Now, let's think about this NOD2 thing she's got going on. NOD2 is an important protein that helps recognize bacteria, like a bodyguard for our gut. If dysfunctional, it can lead to problems with detecting bacteria 

## Extract Question, Reasoning, and Answer with Regex

In [16]:
import re

def extract_fields(text):
    question_match = re.search(r"### Question:\n(.+?)\n### Reasoning:", text, re.DOTALL)
    reasoning_match = re.search(r"### Reasoning:\n(.+?)\n### Answer:", text, re.DOTALL)
    answer_match = re.search(r"### Answer:\n(.+)", text, re.DOTALL)

    return {
        "Question": question_match.group(1).strip() if question_match else None,
        "Complex_CoT": reasoning_match.group(1).strip() if reasoning_match else None,
        "Response": answer_match.group(1).strip() if answer_match else None,
    }

# Apply to all rows
parsed_df = val_df["text"].apply(extract_fields).apply(pd.Series)

# Merge with original dataframe if needed
val_df = pd.concat([val_df, parsed_df], axis=1)


In [17]:
def formatting_func(example):
    question = example["Question"]
    reasoning = example["Complex_CoT"]
    response = example["Response"]

    return f"### Question:\n{question}\n\n### Reasoning:\n{reasoning}\n\n### Answer:\n{response}"


## SFTTrainer Setup 

In [18]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    args=training_args,
    tokenizer=tokenizer,
    formatting_func=formatting_func,
    packing=True,
)

## ROUGE-L Score Calculation (Before Training (Baseline Score))

In [19]:
print(val_df.columns.tolist())


['text', 'Question', 'Complex_CoT', 'Response']


In [20]:
# Install required packages
!pip install -q evaluate rouge_score

import evaluate
rouge = evaluate.load("rouge")

# Get baseline predictions
def generate_response_baseline(example):
    prompt = formatting_func(example)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=200)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
    
# Apply to validation set
val_df["baseline_pred"] = val_df.apply(generate_response_baseline, axis=1)

# Compute ROUGE-L score
baseline_scores = rouge.compute(predictions=val_df["baseline_pred"].tolist(),
                                 references=val_df["Response"].tolist(),
                                 use_stemmer=True)
print("ROUGE-L Before Fine-Tuning:", baseline_scores["rougeL"])


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE-L Before Fine-Tuning: 0.30526861209694733


## Training

This will:

    Start supervised fine-tuning on your dataset.

    Log metrics (e.g., loss) to the console and to Weights & Biases (since you're using report_to="wandb").

In [21]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 19,604 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 24,313,856/3,000,000,000 (0.81% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,10.5496
2,10.5594
3,9.9056
4,9.6075
5,7.6628
6,7.6184
7,7.0
8,6.5431
9,6.0444
10,5.657


TrainOutput(global_step=60, training_loss=5.236639833450317, metrics={'train_runtime': 1483.6775, 'train_samples_per_second': 0.162, 'train_steps_per_second': 0.04, 'total_flos': 8384528787701760.0, 'train_loss': 5.236639833450317, 'epoch': 0.012242399510304019})

## After Training (Post Fine-Tuning Score)

In [22]:
# Reload fine-tuned model (if necessary) and run predictions again
def generate_response_finetuned(example):
    prompt = formatting_func(example)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=200)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

val_df["finetuned_pred"] = val_df.apply(generate_response_finetuned, axis=1)
finetuned_scores = rouge.compute(predictions=val_df["finetuned_pred"].tolist(),
                                  references=val_df["Response"].tolist(),
                                  use_stemmer=True)
print("ROUGE-L After Fine-Tuning:", finetuned_scores["rougeL"])


ROUGE-L After Fine-Tuning: 0.3052061487262838


## save the model

In [23]:
save_path = "llama3-medical-finetuned"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)



('llama3-medical-finetuned/tokenizer_config.json',
 'llama3-medical-finetuned/special_tokens_map.json',
 'llama3-medical-finetuned/tokenizer.json')

In [26]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
model.push_to_hub("imranmansha/llama3-medical-finetuned")
tokenizer.push_to_hub("imranmansha/llama3-medical-finetuned")


README.md:   0%|          | 0.00/616 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Saved model to https://huggingface.co/imranmansha/llama3-medical-finetuned


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]