### Setup

In [1]:
from IPython.display import clear_output

!pip install transformers datasets trl torch huggingface-hub wandb scikit-learn bitsandbytes accelerate
clear_output(wait=False)

In [2]:
import random
import numpy as np
import torch
import gc

gc.collect()
torch.cuda.empty_cache()

SEED = 4242
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [3]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HF_TOKEN")
login(hf_token)

In [4]:
import wandb

wandb_api = user_secrets.get_secret("WANDB_API")
wandb.login(key=wandb_api)

run = wandb.init(
    project='Deepseek-R1-Qwen-1.5b SFT on medical dataset full 1 epoch v.0',
    job_type="training",
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmiliusha2801[0m ([33mmiliusha2801-innopolis-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
import torch

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
device

'cuda'

### Model loading

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import setup_chat_format


# bnb_config = BitsAndBytesConfig(load_in_8bit=True)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
)

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenizer.pad_token = "<PAD>"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
model.config.pad_token_id = tokenizer.eos_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

finetune_name = "DeepSeek-R1-Distill-Qwen-1.5B-Medical"

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [7]:
print(next(model.parameters()).dtype)

torch.float16


In [8]:
system_prompt = """
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question.\n"""

In [9]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [10]:
prompt = "A 3-year-old child presents with tall stature, developmental delay, joint hypermobility, hyperelastic skin, fair complexion, prominent sternum, and downward lens subluxation in the right eye. Considering these features, what complication is this child most likely to develop?"

messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    # use_cache=True,
    pad_token_id=tokenizer.eos_token_id,
)

print("Output before training:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Output before training:

You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question.
<｜User｜>A 3-year-old child presents with tall stature, developmental delay, joint hypermobility, hyperelastic skin, fair complexion, prominent sternum, and downward lens subluxation in the right eye. Considering these features, what complication is this child most likely to develop? 
Please provide a detailed explanation of your reasoning.
</think>

The child presented with several clinical signs that could indicate a developmental delay and may suggest an underlying neurological condition. Here's a detailed analysis of the possible complication:

1. **Joint Hypermobility and Downward Lens Subluxation**: The right eye shows downward lens subluxation, a hallmark of a congenital malformation. This suggests a neurological issue, possibly a congenital malformation of the eye.

2. **Tall Stature and Developmental

### Dataset loading and preparing

In [11]:
from datasets import load_dataset

# split="train[0:500]"
ds = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train[:20000]", trust_remote_code=True)

README.md:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/74.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25371 [00:00<?, ? examples/s]

In [12]:
ds

Dataset({
    features: ['Question', 'Complex_CoT', 'Response'],
    num_rows: 20000
})

In [13]:
ds[0]

{'Question': 'A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?',
 'Complex_CoT': "Okay, let's think about this step by step. There's a 61-year-old woman here who's been dealing with involuntary urine leakages whenever she's doing something that ups her abdominal pressure like coughing or sneezing. This sounds a lot like stress urinary incontinence to me. Now, it's interesting that she doesn't have any issues at night; she isn't experiencing leakage while sleeping. This likely means her bladder's ability to hold urine is fine when she isn't under physical stress. Hmm, that's a clue that we're dealing with something related to pressure rather than a bladder muscle problem. \n\nThe fact that she underwent a Q-tip test is intriguing too. This 

In [14]:
train_prompt_style = """
### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question. 

### Question:
{}

### Response:
<think>
{}
</think>
{}
"""

In [15]:
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    questions = examples["Question"]
    thoughts = examples["Complex_CoT"]
    responses = examples["Response"]
    texts = []
    for question, thought, response in zip(questions, thoughts, responses):
        text = train_prompt_style.format(question, thought, response) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

In [16]:
ds_formatted = ds.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=["Question", "Complex_CoT", "Response"]
)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [17]:
ds_formatted[0]["text"]

"\n### Instruction:\nYou are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \nPlease answer the following medical question. \n\n### Question:\nA 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?\n\n### Response:\n<think>\nOkay, let's think about this step by step. There's a 61-year-old woman here who's been dealing with involuntary urine leakages whenever she's doing something that ups her abdominal pressure like coughing or sneezing. This sounds a lot like stress urinary incontinence to me. Now, it's interesting that she doesn't have any issues at night; she isn't experiencing leakage while sleeping. This likely means her bladder's ability to hold urine is fine when she isn't under physic

In [18]:
from datasets import *

ds_splitted = ds_formatted.train_test_split(test_size=0.05, seed=SEED)

In [19]:
ds_splitted

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 19000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})

### Setup training config

In [20]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [21]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.gradient_checkpointing_enable()

In [22]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir=finetune_name,
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    save_steps=200,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # optim="paged_adamw_32bit",
    optim="adamw_torch_fused", 
    lr_scheduler_type="cosine",
    warmup_steps=300,
    learning_rate=1e-4,
    num_train_epochs=1,
    weight_decay=0.01,
    seed=SEED,
    report_to="wandb",
    fp16=True,
    bf16=False,
    tf32=False,
    hub_model_id=finetune_name,
    gradient_checkpointing=True
)

In [23]:
from trl import SFTConfig, SFTTrainer

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=ds_splitted["train"],
    eval_dataset=ds_splitted["test"],
    peft_config=peft_config,
    args=args,
)

Converting train dataset to ChatML:   0%|          | 0/19000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/19000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/19000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/19000 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

### Train model

In [24]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [25]:
trainer.train()



Step,Training Loss,Validation Loss
100,8.631,1.872467
200,7.3708,1.779906
300,7.1969,1.744969
400,6.9716,1.722984
500,6.9826,1.708936
600,6.8707,1.697711
700,6.8017,1.688394
800,6.8036,1.681025
900,6.7167,1.674167
1000,6.8052,1.668206


TrainOutput(global_step=2375, training_loss=6.8573771458675985, metrics={'train_runtime': 39914.7107, 'train_samples_per_second': 0.476, 'train_steps_per_second': 0.06, 'total_flos': 1.333256288507351e+17, 'train_loss': 6.8573771458675985})

### Save fine-tuned adapter and merged model on HF Hub

In [26]:
trainer.push_to_hub()

adapter_model.safetensors:   0%|          | 0.00/73.9M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MilyaShams/DeepSeek-R1-Distill-Qwen-1.5B-Medical/commit/0a291a7717383b2998fc1152905566f2f170890f', commit_message='End of training', commit_description='', oid='0a291a7717383b2998fc1152905566f2f170890f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MilyaShams/DeepSeek-R1-Distill-Qwen-1.5B-Medical', endpoint='https://huggingface.co', repo_type='model', repo_id='MilyaShams/DeepSeek-R1-Distill-Qwen-1.5B-Medical'), pr_revision=None, pr_num=None)

In [32]:
merged_model = trainer.model.merge_and_unload()

merged_model.push_to_hub(trainer.args.hub_model_id, use_temp_dir=False)
trainer.tokenizer.push_to_hub(trainer.args.hub_model_id, use_temp_dir=False)



model.safetensors:   0%|          | 0.00/2.54G [00:00<?, ?B/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MilyaShams/DeepSeek-R1-Distill-Qwen-1.5B-Medical/commit/4c7fcd8369053a012c2b53e977f30eecfa2b9e26', commit_message='Upload tokenizer', commit_description='', oid='4c7fcd8369053a012c2b53e977f30eecfa2b9e26', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MilyaShams/DeepSeek-R1-Distill-Qwen-1.5B-Medical', endpoint='https://huggingface.co', repo_type='model', repo_id='MilyaShams/DeepSeek-R1-Distill-Qwen-1.5B-Medical'), pr_revision=None, pr_num=None)

In [33]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [None]:
from transformers import pipeline

question = "What type of cement bonds to tooth structure, provides an anticariogenic effect, has a degree of translucency, and is non-irritating to the pulp?"
generator = pipeline("text-generation", model="MilyaShams/DeepSeek-R1-Distill-Qwen-1.5B-Medical", device="cuda")
output = generator([{"role": "user", "content": question}], max_new_tokens=2000, return_full_text=False)[0]
print(output["generated_text"])