In [12]:
%%capture
#  unsloth
!pip install unsloth # install unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git # Also get the latest version Unsloth!

!pip uninstall -y torch torchvision
!pip install torch torchvision

In [13]:
!pip list | grep unsloth

unsloth                            2025.2.12
unsloth_zoo                        2025.2.5


# Imports

In [20]:
# Modules for fine-tuning
from unsloth import FastLanguageModel
import torch # Import PyTorch
from trl import SFTTrainer # Trainer for supervised fine-tuning (SFT)
from unsloth import is_bfloat16_supported # Checks if the hardware supports bfloat16 precision
# Hugging Face modules
from huggingface_hub import login # Lets you login to API
from transformers import TrainingArguments # Defines training hyperparameters
from datasets import load_dataset # Lets you load fine-tuning datasets
# Import weights and biases
import wandb
# Import kaggle secrets
from kaggle_secrets import UserSecretsClient

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# HF and WandB tokens

In [21]:
# Initialize Hugging Face & WnB tokens
user_secrets = UserSecretsClient() # from kaggle_secrets import UserSecretsClient
hugging_face_token = user_secrets.get_secret("HF_token")
wnb_token = user_secrets.get_secret("wandb_token")

# Login to Hugging Face
login(hugging_face_token) # from huggingface_hub import login

# Login to WnB
wandb.login(key=wnb_token) # import wandb
run = wandb.init(
    project='Fine-tune-DeepSeek-R1-Distill-Llama-8B on Medical COT Dataset_YouTube Walkthrough', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmaxwelljohn123123[0m ([33mmaxwelljohn123123-student[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Load deepseek and tokenizer

In [23]:
max_seq_length = 2048
dtype = None
load_in_4bit = True

In [24]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",  # Load the pre-trained DeepSeek R1 model (8B parameter version)
    max_seq_length=max_seq_length, # Ensure the model can process up to 2048 tokens at once
    dtype=dtype, # Use the default data type (e.g., FP16 or BF16 depending on hardware support)
    load_in_4bit=load_in_4bit, # Load the model in 4-bit quantization to save memory
    token=hugging_face_token, # Use hugging face token
)

==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [25]:
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question. 

### Question:
{}

### Response:
<think>{}"""

In [27]:
# Creating a test medical question for inference
question = """A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or 
              sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, 
              what would cystometry most likely reveal about her residual volume and detrusor contractions?"""
# Load model to unsloth fast inference
FastLanguageModel.for_inference(model)

# input tokenization
inputs = tokenizer([prompt_style.format(question, "")], return_tensors = 'pt').to('cuda')

# generate response
outputs  = model.generate(
    input_ids = inputs.input_ids,
    attention_mask = inputs.attention_mask,
    max_new_tokens = 1200,
    use_cache = True,
)

# decode response
response = tokenizer.batch_decode(outputs)

print(response[0].split("### Response:")[1])


<think>
Okay, so I have this medical question about a 61-year-old woman with a history of involuntary urine loss when she coughs or sneezes, but she doesn't leak at night. She's had a gynecological exam and a Q-tip test, and the question is asking what cystometry would most likely show regarding her residual volume and detrusor contractions.

First, I need to break down the information. Involuntary urine loss during activities like coughing or sneezing suggests a possible issue with the lower urinary tract, specifically the bladder. The fact that she doesn't leak at night points towards a possible daytime issue, maybe something related to bladder capacity or control.

Now, she undergoes a gynecological exam and a Q-tip test. The Q-tip test is commonly used to assess urethral function, especially in women. It involves inserting a Q-tip catheter into the urethra and measuring the pressure at the urethral meatus. If the pressure is higher than normal, it might indicate an obstruction or 

# Fine tuning

In [28]:
# Updated training prompt style to add </think> tag 
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question. 

### Question:
{}

### Response:
<think>
{}
</think>
{}"""

## Load dataset

In [29]:
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split = "train[0:500]", trust_remote_code = True)

dataset

README.md:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/74.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25371 [00:00<?, ? examples/s]

Dataset({
    features: ['Question', 'Complex_CoT', 'Response'],
    num_rows: 500
})

In [30]:
dataset[1]

{'Question': 'A 45-year-old man with a history of alcohol use, who has been abstinent for the past 10 years, presents with sudden onset dysarthria, shuffling gait, and intention tremors. Given this clinical presentation and history, what is the most likely diagnosis?',
 'Complex_CoT': "Alright, let’s break this down. We have a 45-year-old man here, who suddenly starts showing some pretty specific symptoms: dysarthria, shuffling gait, and those intention tremors. This suggests something's going wrong with motor control, probably involving the cerebellum or its connections.\n\nNow, what's intriguing is that he's had a history of alcohol use, but he's been off it for the past 10 years. Alcohol can do a number on the cerebellum, leading to degeneration, and apparently, the effects can hang around or even appear long after one stops drinking.\n\nAt first glance, these symptoms look like they could be some kind of chronic degeneration, maybe something like alcoholic cerebellar degeneration, 

In [31]:
EOS_TOKEN = tokenizer.eos_token
EOS_TOKEN

'<｜end▁of▁sentence｜>'

In [36]:
# Define formatting prompt function
def formatting_prompts_func(examples):  
    inputs = examples["Question"]   
    cots = examples["Complex_CoT"] 
    outputs = examples["Response"] 
    
    texts = [] 
    
    
    for input, cot, output in zip(inputs, cots, outputs):  
        text = train_prompt_style.format(input, cot, output) + EOS_TOKEN
        texts.append(text) 

    return {
        "text": texts, 
    }

In [37]:
dataset_finetune = dataset.map(formatting_prompts_func, batched = True)
dataset_finetune["text"][0]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

"Below is an instruction that describes a task, paired with an input that provides further context. \nWrite a response that appropriately completes the request. \nBefore answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n\n### Instruction:\nYou are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \nPlease answer the following medical question. \n\n### Question:\nA 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?\n\n### Response:\n<think>\nOkay, let's think about this step by step. There's a 61-year-old woman here who's been dealing with involuntary urine leakages whenever she's doing something that ups her ab

## LoRA

In [48]:
FastLanguageModel.for_training(model)
model_lora = FastLanguageModel.get_peft_model(
    model, 
    r = 16,
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 1111,
    use_rslora = False,
    loftq_config = None,
)

In [49]:
trainer = SFTTrainer(
    model=model_lora, 
    tokenizer=tokenizer, 
    train_dataset=dataset_finetune, 
    dataset_text_field="text", 
    max_seq_length=max_seq_length,
    dataset_num_proc=2, 
    
    args=TrainingArguments(
        per_device_train_batch_size=2,  
        gradient_accumulation_steps=4,
        num_train_epochs=1, 
        warmup_steps=5,  
        max_steps=60,  
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),  
        bf16=is_bfloat16_supported(),
        logging_steps=10,  
        optim="adamw_8bit", 
        weight_decay=0.01,  
        lr_scheduler_type="linear",
        seed=3407, 
        output_dir="outputs", 
    ),
)

Converting train dataset to ChatML (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [50]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 500 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
10,1.9192
20,1.4619
30,1.4031
40,1.3102
50,1.3446
60,1.3139


In [51]:
wandb.finish()

0,1
train/epoch,▁▂▄▅▇██
train/global_step,▁▂▄▅▇██
train/grad_norm,█▁▂▁▁▁
train/learning_rate,█▇▅▄▂▁
train/loss,█▃▂▁▁▁

0,1
total_flos,1.8014312853602304e+16
train/epoch,0.96
train/global_step,60.0
train/grad_norm,0.25876
train/learning_rate,0.0
train/loss,1.3139
train_loss,1.45884
train_runtime,1251.7514
train_samples_per_second,0.383
train_steps_per_second,0.048


In [52]:
question = """A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing 
              but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, 
              what would cystometry most likely reveal about her residual volume and detrusor contractions?"""


FastLanguageModel.for_inference(model_lora) 


inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")


outputs = model_lora.generate(
    input_ids=inputs.input_ids,     
    attention_mask=inputs.attention_mask, 
    max_new_tokens=1200,            
    use_cache=True,                      
)

response = tokenizer.batch_decode(outputs)

print(response[0].split("### Response:")[1])


<think>
Okay, so let's think about this. We have a 61-year-old woman who's been dealing with involuntary urine loss during things like coughing or sneezing, but she's not leaking at night. That suggests she might have some kind of problem with her pelvic floor muscles or maybe her bladder.

Now, she's got a gynecological exam and a Q-tip test. Let's break that down. The Q-tip test is usually used to check for urethral obstruction. If it's positive, that means there's something blocking the urethra, like a urethral stricture or something else.

Given that she's had a positive Q-tip test, it's likely there's a blockage in her urethra. If there's a urethral stricture, that could lead to problems with urination, like difficulty starting to urinate or having a weak stream. But in her case, it's more about the involuntary loss when she coughs or sneezes.

So, considering her symptoms and the Q-tip test, we can imagine that she might have a urethral obstruction. This could be due to a strict

In [53]:
new_model_local = "DeepSeek-R1-Medical-COT"
model_lora.save_pretrained(new_model_local) 
tokenizer.save_pretrained(new_model_local)

model_lora.save_pretrained_merged(new_model_local, tokenizer, save_method = "merged_16bit",)

Unsloth: You have 2 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 6.0G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 17.32 out of 31.35 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 34%|███▍      | 11/32 [00:00<00:01, 14.62it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:25<00:00,  1.25it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving DeepSeek-R1-Medical-COT/pytorch_model-00001-of-00004.bin...
Unsloth: Saving DeepSeek-R1-Medical-COT/pytorch_model-00002-of-00004.bin...
Unsloth: Saving DeepSeek-R1-Medical-COT/pytorch_model-00003-of-00004.bin...
Unsloth: Saving DeepSeek-R1-Medical-COT/pytorch_model-00004-of-00004.bin...
Done.


In [55]:
new_model_online = "maxwellwe/DeepSeek-R1-distill-Llama8b-Medical-COT"
model_lora.push_to_hub(new_model_online)
tokenizer.push_to_hub(new_model_online)

model_lora.push_to_hub_merged(new_model_online, tokenizer, save_method = "merged_16bit")

README.md:   0%|          | 0.00/628 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/maxwellwe/DeepSeek-R1-distill-Llama8b-Medical-COT


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth: You are pushing to hub in Kaggle environment.
To save memory, we shall move maxwellwe/DeepSeek-R1-distill-Llama8b-Medical-COT to /tmp/DeepSeek-R1-distill-Llama8b-Medical-COT
Unsloth: Will remove a cached repo with size 1.6K


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 17.38 out of 31.35 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:24<00:00,  1.28it/s]


Unsloth: Saving tokenizer...

No files have been modified since last commit. Skipping to prevent empty commit.


 Done.
Unsloth: Saving /tmp/DeepSeek-R1-distill-Llama8b-Medical-COT/pytorch_model-00001-of-00004.bin...
Unsloth: Saving /tmp/DeepSeek-R1-distill-Llama8b-Medical-COT/pytorch_model-00002-of-00004.bin...
Unsloth: Saving /tmp/DeepSeek-R1-distill-Llama8b-Medical-COT/pytorch_model-00003-of-00004.bin...
Unsloth: Saving /tmp/DeepSeek-R1-distill-Llama8b-Medical-COT/pytorch_model-00004-of-00004.bin...


  0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/maxwellwe/DeepSeek-R1-distill-Llama8b-Medical-COT
