In [None]:
%%capture
# Environment setup for Colab; %%capture hides the installer output of this cell.
# NOTE(review): none of these installs are version-pinned, `transformers` is
# installed twice below, and a later cell re-pins it to 4.37.2 — consider
# consolidating into one pinned install cell.

!pip install unsloth # install unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git # Also get the latest version Unsloth!


# Packages used by the dataset and model cells
!pip install spacy
!pip install datasets
!pip install torch
!pip install transformers
!python -m spacy download en_core_web_sm
!pip install transformers accelerate bitsandbytes

In [None]:
import pprint

In [None]:
# Modules for fine-tuning
from unsloth import FastLanguageModel
import torch # Import PyTorch
from trl import SFTTrainer # Trainer for supervised fine-tuning (SFT)
from unsloth import is_bfloat16_supported # Checks if the hardware supports bfloat16 precision
# Hugging Face modules
from huggingface_hub import login # Lets you login to API
from transformers import TrainingArguments # Defines training hyperparameters
from datasets import load_dataset, Dataset, concatenate_datasets # Lets you load fine-tuning datasets
import json
# Import weights and biases
import wandb
# Import Colab secrets
from google.colab import userdata

In [None]:
def transform_MC1_dataset(dataset=None):
    """Convert MedQA records into the notebook's multiple-choice question schema.

    Args:
        dataset: Optional mapping with ``"train"``, ``"validation"`` and ``"test"``
            entries, each an iterable of records that expose ``answer_idx``,
            ``options`` (a sequence of dicts with a ``"value"`` entry) and
            ``question``. When omitted, ``bigbio/med_qa`` is loaded with
            ``load_dataset``.

    Returns:
        list[dict]: One dict per record, in train → validation → test order, with
        the keys ``correct_answer``, ``options``, ``question``, ``source`` and
        ``type``. ``options`` maps the letters "A", "B", ... (by position) to the
        option texts, so records with fewer or more than five options are kept.

    Raises:
        Exception: Any error raised while loading or transforming is printed and
            then re-raised unchanged, instead of silently returning ``None``.
    """
    try:
        if dataset is None:
            # Load dataset from Hugging Face
            dataset = load_dataset("bigbio/med_qa")

        transformed_data_MC1 = []
        # Visit the splits one after another; no language filter is applied.
        for split_name in ("train", "validation", "test"):
            for item in dataset[split_name]:
                transformed_data_MC1.append({
                    # answer_idx is stored as-is (the preview shows a letter such as "E")
                    "correct_answer": item["answer_idx"],
                    # Label the option texts A, B, C, ... by their position
                    "options": {
                        chr(ord("A") + position): option["value"]
                        for position, option in enumerate(item["options"])
                    },
                    "question": item["question"],
                    # Placeholder provenance: this data has no book source
                    "source": {
                        "isbn": "000-0000000000",
                        "page": 0,
                        "paragraph_id": "000-0000000000-p00-para00"
                    },
                    "type": "multiple_choice"
                })
        return transformed_data_MC1
    except Exception as e:
        # Report the problem, then fail loudly so callers never receive None
        print(f"Unexpected error: {e}")
        raise
# Build the transformed question list once and preview the first three records as JSON
transformed_MC1_data = transform_MC1_dataset()
print(json.dumps(transformed_MC1_data[:3], indent=4))

[
    {
        "correct_answer": "E",
        "options": {
            "A": "Ampicillin",
            "B": "Ceftriaxone",
            "C": "Ciprofloxacin",
            "D": "Doxycycline",
            "E": "Nitrofurantoin"
        },
        "question": "A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7\u00b0F (36.5\u00b0C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?",
        "source": {
            "isbn": "000-0000000000",
            "page": 0,
            "paragraph_id": "000-0000000000-p00-para00"
        },
 

In [None]:
# Read the Hugging Face and Weights & Biases tokens from the Colab secrets 'HF' and 'WAND'
hugging_face_token = userdata.get('HF')
wnb_token = userdata.get('WAND')


# Login to Hugging Face
login(hugging_face_token) # `login` is provided by huggingface_hub

# Login to WnB and open a run that stays active until wandb.finish() is called
# NOTE(review): the project name mentions DeepSeek-R1-Distill-Llama-8B, but the
# visible cells load deepseek-coder-7b-instruct / deepseek-llm-7b-base — confirm.
wandb.login(key=wnb_token) # import wandb
run = wandb.init(
    project='DeepSeek-R1-Distill-Llama-8B baseline model for ClinIQ',
    job_type="validation",
    anonymous="allow"
)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkonstantinwehmeyer[0m ([33mkonstantinwehmeyer-university-of-st-gallen-student-union[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Baseline checkpoint loaded with plain transformers (no Unsloth wrapper here).
# NOTE(review): this is a code-instruct checkpoint used for medical QA — confirm it is the intended baseline.
model_id = "deepseek-ai/deepseek-coder-7b-instruct"  # or any other DeepSeek model
tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" leaves device placement of the weights to the library
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# Prompt template for the multiple-choice evaluation.
# - `{_question_var_}` is the question placeholder substituted by build_prompt().
# - The trailing `{}` is the (empty) answer slot.
# - The JSON example uses a quoted letter so that it is valid JSON.
# NOTE: because of the literal JSON braces this template cannot be passed
# through str.format(); use build_prompt() instead.
prompt_style = """
### Instruction:

Please choose the correct answer out of the options A, B, C, D or E. Your answer MUST consist of exactly one JSON object.

{"answer": "A"}

If the correct answer is option A.

DO NOT allow any explanations, comments, or additional text.

---------

### Question:
"{_question_var_}"

### Answer:
{}"""

In [None]:
# Index of the record used as the worked example
i = 0

# Render the five options as "A. <text> B. <text> ... E. <text>"
options = ' '.join(
    label + '. ' + transformed_MC1_data[i].get('options').get(label)
    for label in ('A', 'B', 'C', 'D', 'E')
)

# Question stem followed by the rendered options
question = transformed_MC1_data[i].get('question') + ' Options: ' + options
pprint.pp(question)

('A 23-year-old pregnant woman at 22 weeks gestation presents with burning '
 'upon urination. She states it started 1 day ago and has been worsening '
 'despite drinking more water and taking cranberry extract. She otherwise '
 'feels well and is followed by a doctor for her pregnancy. Her temperature is '
 '97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, '
 'respirations are 19/min, and oxygen saturation is 98% on room air. Physical '
 'exam is notable for an absence of costovertebral angle tenderness and a '
 'gravid uterus. Which of the following is the best treatment for this '
 'patient? Options: A. Ampicillin B. Ceftriaxone C. Ciprofloxacin D. '
 'Doxycycline E. Nitrofurantoin')


In [None]:
def build_prompt(question: str, template=None):
    """Insert ``question`` into the prompt template and return the final prompt.

    Args:
        question: Question text (including the rendered options) that replaces
            the ``{_question_var_}`` placeholder.
        template: Optional template string. Defaults to the notebook-level
            ``prompt_style``.

    Returns:
        str: The template with the question substituted. A trailing ``{}``
        answer slot in the template is removed first, so the literal braces are
        not sent to the model as part of the prompt. Only the template is
        trimmed; braces inside ``question`` are preserved.
    """
    if template is None:
        template = prompt_style
    # The answer slot is always left empty for inference, so drop it
    if template.endswith("{}"):
        template = template[:-2]
    # Plain replace (not str.format) because the template contains literal JSON braces
    return template.replace("{_question_var_}", question)



In [None]:
# Preview the full prompt built for the current example question
pprint.pp(build_prompt(question))

('\n'
 '### Instruction:\n'
 '\n'
 'Please choose the correct answer out of the options A, B, C, D or E. Your '
 'answer MUST consist of exactly one JSON object.\n'
 '\n'
 '{"answer": A}\n'
 '\n'
 'If the correct answer is option A.\n'
 '\n'
 'DO NOT allow any explanations, comments, or additional text.\n'
 '\n'
 '---------\n'
 '\n'
 '### Question:\n'
 '"A 23-year-old pregnant woman at 22 weeks gestation presents with burning '
 'upon urination. She states it started 1 day ago and has been worsening '
 'despite drinking more water and taking cranberry extract. She otherwise '
 'feels well and is followed by a doctor for her pregnancy. Her temperature is '
 '97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, '
 'respirations are 19/min, and oxygen saturation is 98% on room air. Physical '
 'exam is notable for an absence of costovertebral angle tenderness and a '
 'gravid uterus. Which of the following is the best treatment for this '
 'patient? Options: A. Ampicillin B. C

In [None]:
# Pin transformers to 4.37.2.
# NOTE(review): this runs after transformers was already imported and a model was
# loaded above; presumably a runtime restart is needed for the pin to take effect — confirm.
! pip install transformers==4.37.2


Collecting transformers==4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/129.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.2)
  Downloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[

In [None]:
# Confirm which transformers version is active in this kernel
import transformers
print(transformers.__version__)


4.37.2


In [None]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a text-generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
# Run the example prompt with the pipeline's default generation settings
response = pipe(build_prompt(question))

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


In [None]:
# Inspect the pipeline result (a list of dicts with a 'generated_text' entry)
pprint.pp(response)

[{'generated_text': '\n'
                    '### Instruction:\n'
                    '\n'
                    'Please choose the correct answer out of the options A, B, '
                    'C, D or E. Your answer MUST consist of exactly one JSON '
                    'object.\n'
                    '\n'
                    '{"answer": A}\n'
                    '\n'
                    'If the correct answer is option A.\n'
                    '\n'
                    'DO NOT allow any explanations, comments, or additional '
                    'text.\n'
                    '\n'
                    '---------\n'
                    '\n'
                    '### Question:\n'
                    '"A 23-year-old pregnant woman at 22 weeks gestation '
                    'presents with burning upon urination. She states it '
                    'started 1 day ago and has been worsening despite drinking '
                    'more water and taking cranberry extract. She otherwise '
      

In [None]:
# Switch the model to Unsloth's inference mode.
# NOTE(review): in the visible cells `model` was loaded with AutoModelForCausalLM,
# not through Unsloth — confirm that for_inference applies to it.
FastLanguageModel.for_inference(model)

# Build the prompt for the current question and tokenize it
inputs = tokenizer([build_prompt(question)], return_tensors="pt").to("cuda")  # PyTorch tensors moved to the GPU

# Generate a response using the model
outputs = model.generate(
    input_ids=inputs.input_ids, # Tokenized prompt
    attention_mask=inputs.attention_mask, # Attention mask to handle padding
    max_new_tokens=200, # Generate at most 200 new tokens
    # logits_processor=logits_processor,
    use_cache=True, # Enable caching for faster inference
)

# Decode the full output sequence (prompt plus continuation) into text
response = tokenizer.batch_decode(outputs)
pprint.pp(response)

['<｜begin▁of▁sentence｜>\n'
 '### Instruction:\n'
 '\n'
 'Please choose the correct answer out of the options A, B, C, D or E. Your '
 'answer MUST consist of exactly one JSON object.\n'
 '\n'
 '{"answer": A}\n'
 '\n'
 'If the correct answer is option A.\n'
 '\n'
 'DO NOT allow any explanations, comments, or additional text.\n'
 '\n'
 '---------\n'
 '\n'
 '### Question:\n'
 '"A 23-year-old pregnant woman at 22 weeks gestation presents with burning '
 'upon urination. She states it started 1 day ago and has been worsening '
 'despite drinking more water and taking cranberry extract. She otherwise '
 'feels well and is followed by a doctor for her pregnancy. Her temperature is '
 '97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, '
 'respirations are 19/min, and oxygen saturation is 98% on room air. Physical '
 'exam is notable for an absence of costovertebral angle tenderness and a '
 'gravid uterus. Which of the following is the best treatment for this '
 'patient? Option

In [None]:
import re

# NOTE(review): `letter` is not defined in any visible cell; it must hold the
# generated answer text before this cell runs.
text = letter
# First standalone option letter. The questions offer options A–E, so E must
# be matched as well.
match = re.search(r'\b[A-E]\b', text)

if match:
    print("First choice found:", match.group())  # prints the matched letter
else:
    print("No choice found")


First choice found: A


In [None]:
# Collect the decoded model output for the first five questions
predictions = []

for i in range(5):
    # Render the answer options as "A. <text> B. <text> ..." in label order
    options = ' '.join(
        f"{label}. {text}"
        for label, text in transformed_MC1_data[i].get('options').items()
    )

    question = transformed_MC1_data[i].get('question') + ' Options: ' + options

    # prompt_style contains literal JSON braces, so str.format() raises KeyError on it;
    # build_prompt() substitutes the question placeholder instead.
    inputs = tokenizer([build_prompt(question)], return_tensors="pt").to("cuda")  # PyTorch tensors moved to the GPU

    # Generate a response using the model
    outputs = model.generate(
        input_ids=inputs.input_ids, # Tokenized prompt
        attention_mask=inputs.attention_mask, # Attention mask to handle padding
        max_new_tokens=50, # Generate at most 50 new tokens
        # logits_processor=logits_processor,
        use_cache=True, # Enable caching for faster inference
    )

    # Decode the full output sequence (prompt plus continuation) into text
    response = tokenizer.batch_decode(outputs)
    predictions.append(response)


In [None]:
# Inspect the collected outputs (one single-element list of decoded text per question)
pprint.pp(predictions)

[['<｜begin▁of▁sentence｜>\n'
  '### Instruction:\n'
  'You are a medical expert with advanced knowledge in clinical reasoning, '
  'diagnostics, and treatment planning.\n'
  'Select one of the medical answer options A, B, C, D or E based on your best '
  'knowledge. Provide exactly one answer.\n'
  '\n'
  'This is a medical example question with five options:\n'
  '\n'
  '---------\n'
  '\n'
  'Question: A 23-year-old pregnant woman at 22 weeks gestation presents with '
  'burning upon urination. She states it started 1 day ago and has been '
  'worsening despite drinking more water and taking cranberry extract. She '
  'otherwise feels well and is followed by a doctor for her pregnancy. Her '
  'temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is '
  '80/min, respirations are 19/min, and oxygen saturation is 98% on room air. '
  'Physical exam is notable for an absence of costovertebral angle tenderness '
  'and a gravid uterus. Which of the following is the best tr

In [None]:
# Enable optimized inference mode for Unsloth models (improves speed and efficiency)
FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!

# prompt_style contains literal JSON braces, so str.format() raises KeyError on it;
# build_prompt() substitutes the question placeholder instead.
inputs = tokenizer([build_prompt(question)], return_tensors="pt").to("cuda")  # PyTorch tensors moved to the GPU

# Generate a response using the model
outputs = model.generate(
    input_ids=inputs.input_ids, # Tokenized prompt
    attention_mask=inputs.attention_mask, # Attention mask to handle padding
    max_new_tokens=3, # Generate at most 3 new tokens (just enough for a short answer)
    use_cache=True, # Enable caching for faster inference
)

# The output sequence starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded. This does not depend on any marker text
# being present in the prompt template.
prompt_length = inputs.input_ids.shape[1]
response = tokenizer.batch_decode(outputs[:, prompt_length:])

# Print only the model's continuation
print(response[0])


The correct answer


In [None]:
# We need to format the dataset to fit our prompt training style
EOS_TOKEN = tokenizer.eos_token  # End-of-sequence token; appended to each training text to mark where it ends
EOS_TOKEN

'<｜end▁of▁sentence｜>'

In [None]:
# Define formatting prompt function
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into training texts.

    Reads the "Question", "Complex_CoT" and "Response" columns of ``examples``,
    fills each row positionally into ``train_prompt_style`` and appends
    ``EOS_TOKEN``.

    Returns:
        dict: ``{"text": [...]}`` with one rendered string per row.
    """
    rows = zip(examples["Question"], examples["Complex_CoT"], examples["Response"])
    rendered = [
        train_prompt_style.format(question_text, reasoning, answer) + EOS_TOKEN
        for question_text, reasoning, answer in rows
    ]
    return {"text": rendered}

In [None]:
# Apply the prompt formatting to every row, in batches, adding a "text" column.
# NOTE(review): `dataset` is not defined at notebook level in the visible cells
# (only as a local inside transform_MC1_dataset); presumably a fine-tuning
# dataset with Question / Complex_CoT / Response columns is loaded elsewhere — confirm.
dataset_finetune = dataset.map(formatting_prompts_func, batched = True)
dataset_finetune["text"][0]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

"Below is an instruction that describes a task, paired with an input that provides further context. \nWrite a response that appropriately completes the request. \nBefore answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n\n### Instruction:\nYou are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \nPlease answer the following medical question. \n\n### Question:\nA 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?\n\n### Response:\n<think>\nOkay, let's think about this step by step. There's a 61-year-old woman here who's been dealing with involuntary urine leakages whenever she's doing something that ups her ab

In [None]:
# Wrap the model with LoRA (Low-Rank Adaptation) adapters for fine-tuning
model_lora = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank: size of the trainable adapters (higher = more trainable parameters)
    target_modules=[  # Names of the projection layers that receive LoRA adapters
        "q_proj",   # Query projection in the self-attention mechanism
        "k_proj",   # Key projection in the self-attention mechanism
        "v_proj",   # Value projection in the self-attention mechanism
        "o_proj",   # Output projection from the attention layer
        "gate_proj",  # Gate projection in the feed-forward (MLP) block
        "up_proj",    # Up projection in the feed-forward (MLP) block
        "down_proj",  # Down projection in the feed-forward (MLP) block
    ],
    lora_alpha=16,  # Scaling factor for LoRA updates (higher values give the adapters more influence)
    lora_dropout=0,  # Dropout rate for LoRA layers (0 means no dropout)
    bias="none",  # Do not train any bias terms
    use_gradient_checkpointing="unsloth",  # Saves memory by recomputing activations instead of storing them
    random_state=3407,  # Seed for reproducibility
    use_rslora=False,  # Rank-stabilized LoRA disabled: the default lora_alpha / r scaling is used
    loftq_config=None,  # No LoftQ (LoRA-fine-tuning-aware quantization) initialization
)

Unsloth 2025.3.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Initialize the fine-tuning trainer — Imported using from trl import SFTTrainer
# NOTE(review): `max_seq_length` is only assigned in a later cell of this
# notebook; on a fresh top-to-bottom run it would be undefined here.
trainer = SFTTrainer(
    model=model_lora,  # The LoRA-wrapped model to be fine-tuned
    tokenizer=tokenizer,  # Tokenizer to process text inputs
    train_dataset=dataset_finetune,  # Dataset used for training
    dataset_text_field="text",  # Column of the dataset that holds the training text
    max_seq_length=max_seq_length,  # Maximum sequence length for inputs
    dataset_num_proc=2,  # Number of worker processes for dataset preprocessing

    # Define training arguments
    args=TrainingArguments(
        per_device_train_batch_size=2,  # Number of examples processed per device (GPU) at a time
        gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps before updating weights
        num_train_epochs=1, # Overridden by max_steps below (the logged run stops at 60 steps, epoch 0.96)
        warmup_steps=5,  # Gradually increases learning rate for the first 5 steps
        max_steps=60,  # Hard cap of 60 optimizer steps; takes precedence over num_train_epochs
        learning_rate=2e-4,  # Learning rate for weight updates
        fp16=not is_bfloat16_supported(),  # Use FP16 when BF16 is not supported
        bf16=is_bfloat16_supported(),  # Use BF16 when the hardware supports it
        logging_steps=10,  # Logs training progress every 10 steps
        optim="adamw_8bit",  # 8-bit AdamW optimizer
        weight_decay=0.01,  # Weight decay regularization
        lr_scheduler_type="linear",  # Linear learning rate schedule
        seed=3407,  # Fixed seed for reproducibility
        output_dir="outputs",  # Directory for training outputs / checkpoints
    ),
)


Tokenizing to ["text"] (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Start the fine-tuning process and keep the returned training summary
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/4,670,623,744 (0.90% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.9188
20,1.4615
30,1.4023
40,1.3088
50,1.3443
60,1.314


In [None]:
# Close the Weights & Biases run.
# NOTE(review): this does not save the fine-tuned model; no save call is visible in this notebook.
wandb.finish()

0,1
train/epoch,▁▂▄▅▇██
train/global_step,▁▂▄▅▇██
train/grad_norm,█▂▂▁▂▂
train/learning_rate,█▇▅▄▂▁
train/loss,█▃▂▁▁▁

0,1
total_flos,1.8014312853602304e+16
train/epoch,0.96
train/global_step,60.0
train/grad_norm,0.26014
train/learning_rate,0.0
train/loss,1.314
train_loss,1.45829
train_runtime,1272.0314
train_samples_per_second,0.377
train_steps_per_second,0.047


In [None]:
# Free-text test question (the indentation inside the string is part of the prompt)
question = """A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing
              but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings,
              what would cystometry most likely reveal about her residual volume and detrusor contractions?"""

# Switch the LoRA model to Unsloth's inference mode
FastLanguageModel.for_inference(model_lora)  # Unsloth has 2x faster inference!

# Tokenize the prompt and move it to the GPU.
# NOTE(review): this cell expects a prompt_style with two positional `{}` slots and
# a "### Response:" marker. The prompt_style defined in the visible cells has
# literal JSON braces and no such marker, so .format() and the split below would
# fail with it — confirm which template is meant.
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

# Generate a response with the LoRA fine-tuned model
outputs = model_lora.generate(
    input_ids=inputs.input_ids,          # Tokenized input IDs
    attention_mask=inputs.attention_mask, # Attention mask for padding handling
    max_new_tokens=1200,                  # Upper bound on newly generated tokens
    use_cache=True,                        # Enable cache for efficient generation
)

# Decode the full output sequence into text
response = tokenizer.batch_decode(outputs)

# Print only the part after the "### Response:" marker
print(response[0].split("### Response:")[1])


<think>
Okay, so let's think about this. We have a 61-year-old woman who's been dealing with involuntary urine loss during things like coughing or sneezing, but she's not leaking at night. That suggests she might have some kind of problem with her pelvic floor muscles or maybe her bladder.

Now, she's got a gynecological exam and a Q-tip test. Let's break that down. The Q-tip test is usually used to check for urethral obstruction. If it's positive, that means there's something blocking the urethra, like a urethral stricture or something else.

If she's experiencing involuntary loss during activities, like coughing, it might mean her pelvic floor muscles aren't working properly. They might not be contracting when they should to support the bladder. This could lead to a problem with the urethral sphincter, which controls the release of urine.

But let's not jump to conclusions. It's important to look at what's happening in the bladder. We need to know about her residual volume and detru

In [None]:
# Second free-text test question (the indentation inside the string is part of the prompt)
question = """A 59-year-old man presents with a fever, chills, night sweats, and generalized fatigue,
              and is found to have a 12 mm vegetation on the aortic valve. Blood cultures indicate gram-positive, catalase-negative,
              gamma-hemolytic cocci in chains that do not grow in a 6.5% NaCl medium.
              What is the most likely predisposing factor for this patient's condition?"""

# Tokenize the prompt and move it to the GPU.
# NOTE(review): same template assumption as the previous inference cell — this needs
# a prompt_style with two positional slots and a "### Response:" marker, which the
# prompt_style defined in the visible cells does not provide — confirm.
# NOTE(review): this cell duplicates the previous one except for the question;
# a shared helper function would remove the copy.
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

# Generate a response with the LoRA fine-tuned model
outputs = model_lora.generate(
    input_ids=inputs.input_ids,          # Tokenized input IDs
    attention_mask=inputs.attention_mask, # Attention mask for padding handling
    max_new_tokens=1200,                  # Upper bound on newly generated tokens
    use_cache=True,                        # Enable cache for efficient generation
)

# Decode the full output sequence into text
response = tokenizer.batch_decode(outputs)

# Print only the part after the "### Response:" marker
print(response[0].split("### Response:")[1])


<think>
Okay, let's see. We have a 59-year-old man with some classic symptoms: fever, chills, night sweats, and fatigue. That's pretty telling. And he's got this vegetation on his aortic valve. Hmm, that's interesting. Vegetations are usually associated with endocarditis, right? 

Now, the blood culture result is really key here. The cultures are showing gram-positive, catalase-negative, gamma-hemolytic cocci in chains. Those characteristics are telling us it's a specific type of bacteria, and they're in chains, which suggests they're growing in a particular pattern. And importantly, they don't grow in a 6.5% NaCl medium. That’s a clue because some bacteria can't grow in high salt concentrations. 

Putting this all together, it seems like we're dealing with Enterococcus faecalis. That's a known cause of endocarditis, especially in older patients. Now, why would this happen? Let's think about what could have gone wrong. 

One possibility is that this patient might have an underlying co

In [None]:
# Set parameters
# NOTE(review): `max_seq_length` is also used by the SFTTrainer cell that appears
# earlier in the notebook, and this cell rebinds `model` / `tokenizer` — this
# cell presumably belongs before the fine-tuning cells; confirm the intended order.
max_seq_length = 2048 # Maximum number of tokens per sequence
dtype = None # No explicit dtype requested; passed through to from_pretrained
load_in_4bit = True # Enables 4 bit quantization — a memory saving optimization

# Load model and tokenizer using unsloth — imported using: from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="deepseek-ai/deepseek-llm-7b-base",  # Checkpoint actually loaded here (not unsloth/DeepSeek-R1-Distill-Llama-8B)
    max_seq_length=max_seq_length, # Allow sequences of up to 2048 tokens
    dtype=dtype, # None, see above
    load_in_4bit=load_in_4bit, # Load the model in 4-bit quantization to save memory
    token=hugging_face_token, # Use hugging face token
)

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


pytorch_model.bin.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.6k [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

deepseek-ai/deepseek-llm-7b-base does not have a padding token! Will use pad_token = <|PAD_TOKEN|>.


In [None]:
from transformers import LogitsProcessorList, LogitsProcessor

class MultipleChoiceOnlyLogitsProcessor(LogitsProcessor):
    """Logits processor that restricts generation to a fixed set of answer tokens.

    Every vocabulary entry other than the allowed tokens has its score replaced
    by ``-inf``; the scores of the allowed tokens are passed through unchanged.

    Args:
        tokenizer: Tokenizer providing ``convert_tokens_to_ids``.
        allowed_tokens: Token strings that may be generated. Defaults to the
            letters A–E, matching the five answer options used in the prompt.

    Raises:
        ValueError: If an allowed token has no id of its own in the vocabulary
            (the tokenizer returns ``None`` or its unknown-token id).
    """

    def __init__(self, tokenizer, allowed_tokens=("A", "B", "C", "D", "E")):
        unknown_id = getattr(tokenizer, "unk_token_id", None)
        allowed_token_ids = []
        for token in allowed_tokens:
            token_id = tokenizer.convert_tokens_to_ids(token)
            # Refuse to silently allow the unknown token in place of a letter
            if token_id is None or token_id == unknown_id:
                raise ValueError(f"Token {token!r} is not in the tokenizer vocabulary")
            allowed_token_ids.append(token_id)
        self.allowed_token_ids = allowed_token_ids

    def __call__(self, input_ids, scores):
        """Return ``scores`` with every non-allowed vocabulary entry set to ``-inf``."""
        mask = torch.full_like(scores, float("-inf"))  # Start with -inf everywhere
        # Copy the scores of all allowed token ids in one indexed assignment
        mask[:, self.allowed_token_ids] = scores[:, self.allowed_token_ids]
        return mask

In [None]:
# Processor list that can be passed to generate(logits_processor=...) to
# restrict the output to the allowed answer letters
logits_processor = LogitsProcessorList([
    MultipleChoiceOnlyLogitsProcessor(tokenizer)
])