In [1]:
%%capture
# Install Unsloth into the kernel's environment; %%capture keeps the
# installer's stdout/stderr out of the notebook.
# NOTE(review): the version is not pinned. The model-loading cell's log
# reports Unsloth 2025.2.15 — consider pinning to it for reproducibility.
%pip install unsloth

In [2]:
import torch

# GPU sanity check: first the CUDA version this torch build targets,
# then whether a CUDA device is actually usable (one value per line).
print(torch.version.cuda, torch.cuda.is_available(), sep="\n")

12.4
True


#### There are two ways to download the DeepSeek model locally

#### Option 1: Using transformers to Download and Save Locally

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Hub repository to fetch, and the local directory that will hold the copy
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
local_model_path = "./deepseek_r1_model"

# Load the tokenizer and the model weights (as float16) by repository name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# Write both artifacts into the same directory, tokenizer first
for artifact in (tokenizer, model):
    artifact.save_pretrained(local_model_path)

print(f"Model and tokenizer saved to {local_model_path}")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Model and tokenizer saved to ./deepseek_r1_model


#### Option 2: Using huggingface_hub to Download

In [3]:
from huggingface_hub import snapshot_download

# Hub repository to mirror, and the directory that receives its files
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
local_model_path = "./deepseek_r1_model"

# Fetch the repository snapshot straight into local_model_path
snapshot_download(
    local_dir=local_model_path,
    repo_id=model_name,
)

print(f"Model downloaded to {local_model_path}")

  from .autonotebook import tqdm as notebook_tqdm
Fetching 9 files: 100%|██████████| 9/9 [00:05<00:00,  1.77it/s]

Model downloaded to ./deepseek_r1_model





In [1]:
from unsloth import FastLanguageModel

# Same directory the download cells wrote to
local_model_path = "./deepseek_r1_model"

# Loading options, gathered in one place before the call
load_options = dict(
    model_name=local_model_path,  # a local path rather than a Hub repository name
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

print("Model and tokenizer loaded successfully!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA GeForce GTX 1650. Max memory: 3.806 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.


./deepseek_r1_model does not have a padding token! Will use pad_token = <|vision_pad|>.
Model and tokenizer loaded successfully!


In [2]:
# Prompt template with two positional "{}" slots: the first receives the
# question, the second whatever text should follow the opening <think> tag.
# Built line by line; joining with "\n" yields the exact multi-line template.
prompt_style = "\n".join([
    "Below is an instruction that describes a task, paired with an input that provides further context.",
    "Write a response that appropriately completes the request.",
    "Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.",
    "",
    "### Instruction:",
    "You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.",
    "Please answer the following medical question.",
    "",
    "### Question:",
    "{}",
    "",
    "### Response:",
    "<think>{}",
])

#### Tokenizer: converts the input string into a list of integer token IDs that are fed to the language model.

In [3]:
question = "A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?"

# Switch the loaded model to inference mode via Unsloth before building inputs
FastLanguageModel.for_inference(model)

# Fill the template (second slot left empty), tokenize it as a batch of one
# into PyTorch tensors, and move those tensors to the GPU
prompt_batch = [prompt_style.format(question, "")]
inputs = tokenizer(prompt_batch, return_tensors="pt").to("cuda")

#### Print the shape of the tensor of integer token IDs

In [4]:
# Shape of the token-ID tensor the tokenizer produced for the prompt
print(inputs.input_ids.shape)
# Each integer is the ID of one token of the prompt; the bare expression
# on the last line makes the notebook display the tensor
inputs.input_ids

torch.Size([1, 152])


tensor([[151646,  38214,    374,    458,   7600,    429,  16555,    264,   3383,
             11,  34426,    448,    458,   1946,    429,   5707,   4623,   2266,
            624,   7985,    264,   2033,    429,  34901,  44595,    279,   1681,
            624,  10227,  35764,     11,   1744,  15516,    911,    279,   3405,
            323,   1855,    264,   3019,  14319,  29208,   8781,    315,  11303,
            311,   5978,    264,  19819,    323,  13382,   2033,    382,  14374,
          29051,    510,   2610,    525,    264,   6457,   6203,    448,  10847,
           6540,    304,  14490,  32711,     11,  49418,     11,    323,   6380,
           9115,    624,   5501,   4226,    279,   2701,   6457,   3405,    382,
          14374,  15846,    510,     32,    220,     21,     16,   4666,   6284,
           5220,    448,    264,   1293,   3840,    315,  90505,  39235,   4709,
           2337,   7488,   1075,  39600,    287,    476,  20760,  10125,    287,
            714,    902,  80

#### Let's convert it back to text (decoding)

In [5]:
# Decode each row of token IDs back into a string (a list with one entry per row)
tokenizer.batch_decode(inputs.input_ids)

['<｜begin▁of▁sentence｜>Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\nBefore answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n\n### Instruction:\nYou are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.\nPlease answer the following medical question.\n\n### Question:\nA 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?\n\n### Response:\n<think>']

#### Attention Mask

In [6]:
# Mask returned alongside input_ids (all ones in the recorded output for this single prompt)
inputs.attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')

##### Let's give the prompt to the model

In [7]:
# Generate a completion for the tokenized prompt.
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,  # upper bound on the number of generated tokens
    use_cache=True,
)
# Decode the full sequences (prompt followed by the completion) back to text.
response = tokenizer.batch_decode(outputs)

# Print everything after the prompt's "### Response:" marker. maxsplit=1 cuts
# only at the first marker (the one from the prompt template), so a completion
# that itself repeats "### Response:" is printed in full instead of being
# truncated at the repeat.
print(response[0].split("### Response:", 1)[1])


<think>
Okay, I need to figure out what the response should be for this medical question. The woman is 61 and has a long history of involuntary urine loss during activities but not at night. She's undergoing a gynecological exam and a Q-tip test. The question is about what cystometry would reveal regarding her residual volume and detrusor contractions.

First, let's think about the Q-tip test. The Q-tip is a dye that comes off when there's a leak. If it's not coming off, it might indicate a residual volume. But wait, sometimes the dye can stay if the leak is small. So maybe the Q-tip test is not showing a leakage, but the cystometry might.

Cystometry measures residual volume and also looks for detrusor contractions. Residual volume is the amount of fluid that remains in the lungs. If the woman has a lot of urine, that suggests a large residual volume. Detrusor contractions are when the diaphragm contracts, causing gas to come out. If the woman has these contractions, it might indicat