In [2]:
!pip install unsloth

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting unsloth
  Downloading unsloth-2025.6.11-py3-none-any.whl.metadata (48 kB)
Collecting unsloth_zoo>=2025.6.7 (from unsloth)
  Downloading unsloth_zoo-2025.6.7-py3-none-any.whl.metadata (8.1 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.31-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.3.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.5 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.25-py3-none-any.whl.metadata (12 kB)
Collecting transformers!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,>=4.51.3 (from unsloth)
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting datasets>=3.4.1 (from unsloth)
  Downloading datasets-3.6.0-py3-none

In [3]:
!pip install xformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [4]:
!pip install -U transformers datasets accelerate trl peft loralib nvidia-ml-py3

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-ml-py3
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading loralib-0.1.2-py3-none-any.whl (10 kB)
Building wheels for collected packages: nvidia-ml-py3
  Building wheel for nvidia-ml-py3 (setup.py) ... [?25ldone
[?25h  Created wheel for nvidia-ml-py3: filename=nvidia_ml_py3-7.352.0-py3-none-any.whl size=19171 sha256=5b41329fb55eda204389181a10572555bfed9c0b926ab4617bc6c46d8bc2de08
  Stored in directory: /tmp/pip-ephem-wheel-cache-6gv1r0ko/wheels/5c/d8/c0/46899f8be7a75a2ffd197a23c8797700ea858b9b34819fbf9e
Successfully built nvidia-ml-py3
Installing collected packages: nvidia-ml-py3, loralib
Successfully installed loralib-0.1.2 nvidia-ml-py3-7.352.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[

In [5]:
!pip install --upgrade pip

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.2
    Uninstalling pip-24.2:
      Successfully uninstalled pip-24.2
Successfully installed pip-25.1.1
[0m

In [4]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from trl import SFTTrainer
from datasets import load_dataset
import torch

# 🛠 Setup model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=False,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# 🧠 Setup chat template
def get_reasoning_llama_template(tok):
    tok.chat_template = '''<|begin_of_text|>{% for message in messages %}{% if message["role"]=="user" %}<|start_header_id|>user<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="reasoning" %}<|start_header_id|>reasoning<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="assistant" %}<|start_header_id|>assistant<|end_header_id|>
{{message["content"]}}<|eot_id|>{% endif %}{% endfor %}'''
    return tok

tokenizer = get_reasoning_llama_template(tokenizer)

# 📚 Load & format dataset
dataset = load_dataset(
    "FreedomIntelligence/medical-o1-reasoning-SFT",
    "en",
    split="train",
    trust_remote_code=True
)

def formatting_prompts_func(examples):
    texts = []
    for q, cot, resp in zip(examples["Question"], examples["Complex_CoT"], examples["Response"]):
        conv = [
            {"role": "user", "content": q},
            {"role": "reasoning", "content": cot},
            {"role": "assistant", "content": resp},
        ]
        texts.append(tokenizer.apply_chat_template(conv, tokenize=False))
    return {"text": texts}

dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=["Question", "Complex_CoT", "Response"]
)

# 🏋 Training
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        per_device_train_batch_size=8,
        gradient_accumulation_steps=2,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_torch",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    )
)

# 🧠 Train
trainer.train()

# 💾 Save model
model.save_pretrained("llama32_1b_reasoning")
tokenizer.save_pretrained("llama32_1b_reasoning")

print("Training Completed Successfully 🎉")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


Unsloth: If you want to finetune Gemma 2, upgrade flash-attn to version 2.6.3 or higher!
Newer versions support faster and less memory usage kernels for Gemma 2's attention softcapping!
To update flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.8: Fast Llama patching. Transformers: 4.53.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.394 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.6.8 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.
Generating train split: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19704/19704 [00:00<00:00, 23331.20 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19704/19704 [00:00<00:00, 20328.03 examples/s]


Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"]: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19704/19704 [00:07<00:00, 2814.57 examples/s]
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 19,704 | Num Epochs = 1 | Total steps = 1,232
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 11,272,192/1,247,086,592 (0.90% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.0167
2,2.0985
3,2.1125
4,1.9209
5,1.9934
6,1.9516
7,1.9806
8,1.8295
9,1.8907
10,1.7314


Training Completed Successfully 🎉


In [6]:
!pip install peft==0.10.0

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting peft==0.10.0
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.10.0-py3-none-any.whl (199 kB)
Installing collected packages: peft
  Attempting uninstall: peft
    Found existing installation: peft 0.15.2
    Uninstalling peft-0.15.2:
      Successfully uninstalled peft-0.15.2
Successfully installed peft-0.10.0
[0m

In [10]:
!pip install unsloth
import unsloth
from unsloth import FastLanguageModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("llama32_1b_reasoning")
tokenizer = AutoTokenizer.from_pretrained("llama32_1b_reasoning")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting unsloth
  Downloading unsloth-2025.6.8-py3-none-any.whl.metadata (46 kB)
Collecting unsloth_zoo>=2025.6.4 (from unsloth)
  Downloading unsloth_zoo-2025.6.6-py3-none-any.whl.metadata (8.1 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth)
  Downloading trl-0.19.0-py3-none-any.whl.metadata (10 kB)
Downloading unsloth-2025.6.8-py3-none-any.whl (280 kB)
Downloading trl-0.19.0-py3-none-any.whl (375 kB)
Downloading unsloth_zoo-2025.6.6-py3-none-any.whl (154 kB)
Installing collected packages: trl, unsloth_zoo, unsloth
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [unsloth]m1/3[0m [unsloth_zoo]
[1A[2KSuccessfully installed trl-0.19.0 unsloth-2025.6.8 unsloth_zoo-2025.6.6
[0m


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
cannot import name 'check_torch_load_is_safe' from 'transformers.utils' (/usr/local/lib/python3.10/dist-packages/transformers/utils/__init__.py)

In [15]:

# 🔁 Load fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("llama32_1b_reasoning")
tokenizer = AutoTokenizer.from_pretrained("llama32_1b_reasoning")

# 🛠 Apply custom chat template (optional, if needed for your model)
tokenizer.chat_template = '''<|begin_of_text|>{% for message in messages %}{% if message["role"]=="user" %}<|start_header_id|>user<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="reasoning" %}<|start_header_id|>reasoning<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="assistant" %}<|start_header_id|>assistant<|end_header_id|>
{{message["content"]}}<|eot_id|>{% endif %}{% endfor %}'''

# 🚀 Build inference pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# 📥 Example input
messages = [
    {"role": "user", "content": "What causes high blood pressure?"},
    {"role": "reasoning", "content": "High blood pressure, or hypertension, is often caused by lifestyle factors such as a high-salt diet, obesity, lack of exercise, stress, and smoking. These factors increase the resistance in blood vessels, forcing the heart to pump harder."}
]


# 🧠 Create prompt from messages
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# 🎯 Run inference
output = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)

# 📤 Display response
print("\n🧠 Assistant Response:\n")
print(output[0]["generated_text"])


Device set to use cuda:0



🧠 Assistant Response:

<|begin_of_text|><|start_header_id|>user<|end_header_id|>
What causes high blood pressure?<|eot_id|><|start_header_id|>reasoning<|end_header_id|>
High blood pressure, or hypertension, is often caused by lifestyle factors such as a high-salt diet, obesity, lack of exercise, stress, and smoking. These factors increase the resistance in blood vessels, forcing the heart to pump harder.<|eot_id|>assistant
High blood pressure, or hypertension, is a condition where the pressure of the blood in the arteries is consistently higher than it should be. This is usually due to various factors that affect the way the body regulates blood pressure. One of the main causes is lifestyle factors. These include consuming a high-salt diet, which can increase blood volume and blood pressure. Obesity is another factor, as excess body weight can lead to higher blood pressure due to increased vascular resistance. There's also the lack of exercise, which can lead to increased blood pressu

In [11]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# 🔁 Load fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("llama32_1b_reasoning")
tokenizer = AutoTokenizer.from_pretrained("llama32_1b_reasoning")

# 🛠 Chat template with reasoning and final output roles
tokenizer.chat_template = '''<|begin_of_text|>{% for message in messages %}{% if message["role"]=="user" %}<|start_header_id|>user<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="reasoning" %}<|start_header_id|>reasoning<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="final" %}<|start_header_id|>final<|end_header_id|>
{{message["content"]}}<|eot_id|>{% endif %}{% endfor %}'''

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# 🔍 Your actual query
query = [
    {"role": "user", "content": "What are the symptoms of AIDS?"}
]

# Combine prompt with few-shot for better generation
messages = few_shot + query + [
    {"role": "reasoning", "content": ""},
    {"role": "final", "content": ""}
]

# Generate
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
output = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)[0]["generated_text"]

# Extract parts
import re
reasoning_match = re.search(r"<\|start_header_id\|>reasoning<\|end_header_id\|>\n(.*?)<\|eot_id\|>", output, re.DOTALL)
final_match = re.search(r"<\|start_header_id\|>final<\|end_header_id\|>\n(.*?)<\|eot_id\|>", output, re.DOTALL)

reasoning = reasoning_match.group(1).strip() if reasoning_match else "⚠️ Reasoning not found"
final_output = final_match.group(1).strip() if final_match else "⚠️ Final output not found"

# Print
print("\n🧠 Reasoning:\n", reasoning)
print("\n✅ Final Output:\n", final_output)


Device set to use cuda:0



🧠 Reasoning:
 High blood pressure, or hypertension, is often caused by lifestyle factors such as a high-salt diet, obesity, lack of exercise, stress, and smoking. These factors increase the resistance in blood vessels, forcing the heart to pump harder.

✅ Final Output:
 High blood pressure is primarily caused by unhealthy lifestyle habits such as poor diet, obesity, and stress.


In [20]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# 🔁 Load fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("llama32_1b_reasoning")
tokenizer = AutoTokenizer.from_pretrained("llama32_1b_reasoning")

# 🛠 Define chat template with reasoning and final output roles
tokenizer.chat_template = '''<|begin_of_text|>{% for message in messages %}{% if message["role"]=="user" %}<|start_header_id|>user<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="reasoning" %}<|start_header_id|>reasoning<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="final" %}<|start_header_id|>final<|end_header_id|>
{{message["content"]}}<|eot_id|>{% endif %}{% endfor %}'''

# 🚀 Create pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# 📥 Prompt 1: Cough
messages_cough = [
    {"role": "user", "content": "What causes a persistent cough?"},
    {"role": "reasoning", "content": "A persistent cough can be caused by various underlying issues such as postnasal drip, asthma, acid reflux (GERD), or chronic infections like bronchitis. In some cases, it may be due to environmental irritants like smoke or dust. The cough reflex is triggered when these irritants or conditions stimulate the airway lining, leading to ongoing coughing."},
    {"role": "final", "content": "Persistent cough is usually caused by conditions like postnasal drip, acid reflux, or respiratory infections."}
]

# 📥 Prompt 2: Fever
messages_fever = [
    {"role": "user", "content": "What causes fever in the human body?"},
    {"role": "reasoning", "content": "Fever occurs when the body raises its internal temperature in response to infection, inflammation, or disease. This is controlled by the hypothalamus in the brain, which reacts to signals like cytokines released during an immune response. Common causes include viral or bacterial infections, autoimmune disorders, or even heat exhaustion. The raised temperature helps the body fight off pathogens."},
    {"role": "final", "content": "Fever is typically caused by infections or inflammation and is the body’s way of fighting off illness."}
]

# 📥 Prompt 3: Headache
messages_headache = [
    {"role": "user", "content": "What causes headaches?"},
    {"role": "reasoning", "content": "Headaches can be caused by multiple factors such as stress, dehydration, poor posture, eye strain, or changes in sleep patterns. Tension headaches are the most common and are often linked to muscle tightness in the neck and scalp. Other causes include migraines, which may be triggered by hormonal changes, certain foods, or sensory stimuli."},
    {"role": "final", "content": "Headaches are commonly caused by stress, dehydration, tension, or underlying conditions like migraines."}
]

# 🧠 Run inference
for topic, messages in [("Cough", messages_cough), ("Fever", messages_fever), ("Headache", messages_headache)]:
    print(f"\n🔎 {topic} Prompt Output:")
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    output = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)
    print(output[0]["generated_text"])


Device set to use cuda:0



🔎 Cough Prompt Output:
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
What causes a persistent cough?<|eot_id|><|start_header_id|>reasoning<|end_header_id|>
A persistent cough can be caused by various underlying issues such as postnasal drip, asthma, acid reflux (GERD), or chronic infections like bronchitis. In some cases, it may be due to environmental irritants like smoke or dust. The cough reflex is triggered when these irritants or conditions stimulate the airway lining, leading to ongoing coughing.<|eot_id|><|start_header_id|>final<|end_header_id|>
Persistent cough is usually caused by conditions like postnasal drip, acid reflux, or respiratory infections.<|eot_id|>assistant
A persistent cough can be triggered by various factors, including postnasal drip, acid reflux, and respiratory infections. These conditions can lead to irritation of the airway, resulting in a persistent cough. Environmental irritants, such as smoke or dust, can also cause a persistent cough.

🔎 Fe

In [35]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import re

# 🔁 Load fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained("llama32_1b_reasoning")
tokenizer = AutoTokenizer.from_pretrained("llama32_1b_reasoning")

# 🛠 Chat template with reasoning and final output roles
tokenizer.chat_template = '''<|begin_of_text|>{% for message in messages %}{% if message["role"]=="user" %}<|start_header_id|>user<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="reasoning" %}<|start_header_id|>reasoning<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="final" %}<|start_header_id|>final<|end_header_id|>
{{message["content"]}}<|eot_id|>{% endif %}{% endfor %}'''

# 🚀 Create inference pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# ✅ Few-shot example
few_shot = [
    {"role": "user", "content": "What causes high blood pressure?"},
    {"role": "reasoning", "content": "High blood pressure, or hypertension, is often caused by lifestyle factors such as a high-salt diet, obesity, lack of exercise, stress, and smoking. These factors increase the resistance in blood vessels, forcing the heart to pump harder."},
    {"role": "final", "content": "High blood pressure is primarily caused by unhealthy lifestyle habits such as poor diet, obesity, and stress."},
]

# ❓ Actual Query: Cough
query = [
    {"role": "user", "content": "What causes a persistent cough?"}
]

# 🧠 Construct full prompt with placeholders
messages = few_shot + query + [
    {"role": "reasoning", "content": ""},
    {"role": "final", "content": ""}
]

# 🔡 Generate prompt text
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# 🔍 Run inference
output = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)[0]["generated_text"]

# 📤 Extract structured output
reasoning_match = re.search(r"<\|start_header_id\|>reasoning<\|end_header_id\|>\n(.*?)<\|eot_id\|>", output, re.DOTALL)
final_match = re.search(r"<\|start_header_id\|>final<\|end_header_id\|>\n(.*?)<\|eot_id\|>", output, re.DOTALL)

reasoning = reasoning_match.group(1).strip() if reasoning_match else "⚠️ Reasoning not found"
final_output = final_match.group(1).strip() if final_match else "⚠️ Final output not found"

# 📃 Print results
print("\n🧠 Reasoning:\n", reasoning)
print("\n✅ Final Output:\n", final_output)


Device set to use cuda:0



🧠 Reasoning:
 High blood pressure, or hypertension, is often caused by lifestyle factors such as a high-salt diet, obesity, lack of exercise, stress, and smoking. These factors increase the resistance in blood vessels, forcing the heart to pump harder.

✅ Final Output:
 High blood pressure is primarily caused by unhealthy lifestyle habits such as poor diet, obesity, and stress.


In [23]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import re

# 🔁 Load fine-tuned model
fine_model = AutoModelForCausalLM.from_pretrained("llama32_1b_reasoning")
fine_tokenizer = AutoTokenizer.from_pretrained("llama32_1b_reasoning")

# 🔁 Load base (non-fine-tuned) model — replace with appropriate base model
base_model_id = "unsloth/Llama-3.2-1B-Instruct"  # Example
base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
base_tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# 🛠 Apply chat template (for fine-tuned model only)
fine_tokenizer.chat_template = '''<|begin_of_text|>{% for message in messages %}{% if message["role"]=="user" %}<|start_header_id|>user<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="reasoning" %}<|start_header_id|>reasoning<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="final" %}<|start_header_id|>final<|end_header_id|>
{{message["content"]}}<|eot_id|>{% endif %}{% endfor %}'''

# 🔧 Inference pipelines
pipe_fine = pipeline("text-generation", model=fine_model, tokenizer=fine_tokenizer)
pipe_base = pipeline("text-generation", model=base_model, tokenizer=base_tokenizer)

# 📥 Input prompt
few_shot = [
    {"role": "user", "content": "What causes high blood pressure?"},
    {"role": "reasoning", "content": "High blood pressure, or hypertension, is often caused by lifestyle factors such as a high-salt diet, obesity, lack of exercise, stress, and smoking. These factors increase the resistance in blood vessels, forcing the heart to pump harder."},
    {"role": "final", "content": "High blood pressure is primarily caused by unhealthy lifestyle habits such as poor diet, obesity, and stress."},
]
query = [{"role": "user", "content": "What causes a persistent cough?"}]
messages = few_shot + query + [
    {"role": "reasoning", "content": ""},
    {"role": "final", "content": ""}
]

# 🔡 Prepare prompts
prompt_fine = fine_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
prompt_base = query[-1]["content"]  # Use just question for base model

# 🔮 Run inference
output_fine = pipe_fine(prompt_fine, max_new_tokens=512, do_sample=True, temperature=0.7)[0]["generated_text"]
output_base = pipe_base(prompt_base, max_new_tokens=512, do_sample=True, temperature=0.7)[0]["generated_text"]

# 🧠 Extract reasoning and final (from fine-tuned only)
reasoning_match = re.search(r"<\|start_header_id\|>reasoning<\|end_header_id\|>\n(.*?)<\|eot_id\|>", output_fine, re.DOTALL)
final_match = re.search(r"<\|start_header_id\|>final<\|end_header_id\|>\n(.*?)<\|eot_id\|>", output_fine, re.DOTALL)
reasoning = reasoning_match.group(1).strip() if reasoning_match else "⚠️ Not found"
final_output = final_match.group(1).strip() if final_match else "⚠️ Not found"

# 📊 Compare outputs
print("\n=== 🔬 Fine-Tuned Model Output ===")
print("\n🧠 Reasoning:\n", reasoning)
print("\n✅ Final Output:\n", final_output)

print("\n=== 🧪 Base Model Output ===")
print(output_base)


Device set to use cuda:0
Device set to use cuda:0



=== 🔬 Fine-Tuned Model Output ===

🧠 Reasoning:
 High blood pressure, or hypertension, is often caused by lifestyle factors such as a high-salt diet, obesity, lack of exercise, stress, and smoking. These factors increase the resistance in blood vessels, forcing the heart to pump harder.

✅ Final Output:
 High blood pressure is primarily caused by unhealthy lifestyle habits such as poor diet, obesity, and stress.

=== 🧪 Base Model Output ===
What causes a persistent cough? Persistent coughing is a common symptom of various respiratory and non-respiratory conditions. Here are some potential causes of persistent coughing:

**Respiratory Conditions:**

1. **Asthma**: A chronic inflammation of the airways, which can cause coughing, wheezing, and shortness of breath.
2. **Bronchitis**: Inflammation of the airways, often caused by a viral or bacterial infection, which can lead to coughing and mucus production.
3. **Pneumonia**: An infection that inflames the air sacs in the lungs, which can 

In [30]:
!pip install evaluate bert-score -q
!pip install nltk rouge_score bert-score evaluate
!pip uninstall -y transformers
!pip cache purge  # optional: to remove old versions
!pip install transformers==4.51.3 --force-reinstall



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: transformers 4.53.0
Uninstalling transformers-4.53.0:
  Successfully uninstalled transformers-4.53.0
[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR: pip cache commands can not function since cache is disabled.[0m[31m
[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting transformers==4.51.3
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting filelock (from transformers==4.51.3)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers==4.51.3)
  Downloading huggingface_hub-0.33.1-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers==4.51.3)
  Downloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting packaging>=20.0 (from transformers==4.51.3)
  Downloading packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pyyaml>=5.1 (from transformers==4.51.3)
  Downloading PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers==4.51.3)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.me

In [33]:
!pip install evaluate bert-score datasets --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [36]:


import evaluate

# ✅ Example outputs
fine_tuned_output = "High blood pressure is primarily caused by unhealthy lifestyle habits such as poor diet, obesity, and stress."
base_model_output = "Persistent coughing is a common symptom of various respiratory and non-respiratory conditions. Causes include asthma, bronchitis, pneumonia, GERD, allergies, medications, and nutritional deficiencies."

# ✅ Load metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

# ✅ Compute scores
rouge_result = rouge.compute(predictions=[fine_tuned_output], references=[base_model_output])
bleu_result = bleu.compute(predictions=[fine_tuned_output], references=[base_model_output])
bertscore_result = bertscore.compute(predictions=[fine_tuned_output], references=[base_model_output], lang="en")

# ✅ Print results
print("📊 Evaluation Results")
print(f"🔹 ROUGE-L: {rouge_result['rougeL']:.4f}")
print(f"🔹 BLEU: {bleu_result['bleu']:.4f}")
print(f"🔹 BERTScore (F1): {bertscore_result['f1'][0]:.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📊 Evaluation Results
🔹 ROUGE-L: 0.0976
🔹 BLEU: 0.0000
🔹 BERTScore (F1): 0.8641


In [54]:
!pip uninstall -y tensorflow tensorflow-cpu tensorflow-gpu bleurt protobuf


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: tensorflow 2.12.0
Uninstalling tensorflow-2.12.0:
  Successfully uninstalled tensorflow-2.12.0
[0mFound existing installation: BLEURT 0.0.2
Uninstalling BLEURT-0.0.2:
  Successfully uninstalled BLEURT-0.0.2
Found existing installation: protobuf 3.20.3
Uninstalling protobuf-3.20.3:
  Successfully uninstalled protobuf-3.20.3
[0m

In [7]:
!pip install tensorflow==2.12.0 protobuf==3.20.3
!pip install git+https://github.com/google-research/bleurt.git
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting tensorflow==2.12.0
  Downloading tensorflow-2.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting flatbuffers>=2.0 (from tensorflow==2.12.0)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.12.0)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting google-pasta>=0.1.1 (from tensorflow==2.12.0)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=2.9.0 (from tensorflow==2.12.0)
  Downloading h5py-3.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting jax>=0.3.15 (from tensorflow==2.12.0)
  Downloading jax-0.6.2-py3-none-any.whl.metadata (13 kB)
Collecting keras<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Downloading keras-2.12.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting libclang>=13.0.0 (fr

In [57]:
import evaluate

bertscore = evaluate.load("bertscore")
results = bertscore.compute(
    predictions=["High blood pressure is caused by poor diet and stress."],
    references=["Hypertension is often due to unhealthy eating and anxiety."],
    lang="en"
)
print("BERTScore F1:", results["f1"][0])


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.9186269044876099


In [8]:
# 📦 Install required packages
!pip install evaluate sacrebleu -q

[0m

In [81]:
!pip install evaluate sacrebleu

import evaluate

# Load metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")
ter = evaluate.load("ter")       # Requires: pip install sacrebleu
chrf = evaluate.load("chrf")
wer = evaluate.load("wer")

# Example prediction and reference
predictions = ["High blood pressure is caused by poor diet and stress."]
references = ["Hypertension is often due to unhealthy eating and anxiety."]

# Compute scores
rouge_result = rouge.compute(predictions=predictions, references=references)
bleu_result = bleu.compute(predictions=predictions, references=[references])
meteor_result = meteor.compute(predictions=predictions, references=references)
ter_result = ter.compute(predictions=predictions, references=references)
chrf_result = chrf.compute(predictions=predictions, references=references)
wer_result = wer.compute(predictions=predictions, references=references)

# Print the correct keys
print("📊 Evaluation Metrics:")
print(f"🔹 ROUGE-L: {rouge_result['rougeL']:.4f}")
print(f"🔹 BLEU:    {bleu_result['bleu']:.4f}")
print(f"🔹 METEOR:  {meteor_result['meteor']:.4f}")
print(f"🔹 TER:     {ter_result['score']:.4f}")     # 🔧 fix: use 'score'
print(f"🔹 CHRF:    {chrf_result['score']:.4f}")
print(f"🔹 WER:     {wer_result:.4f}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


📊 Evaluation Metrics:
🔹 ROUGE-L: 0.2105
🔹 BLEU:    0.0000
🔹 METEOR:  0.1485
🔹 TER:     100.0000
🔹 CHRF:    12.9966
🔹 WER:     1.0000


In [65]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import re

# 🔁 Load fine-tuned model
fine_model = AutoModelForCausalLM.from_pretrained("llama32_1b_reasoning")
fine_tokenizer = AutoTokenizer.from_pretrained("llama32_1b_reasoning")

# 🔁 Load base (non-fine-tuned) model — replace with appropriate base model
base_model_id = "unsloth/Llama-3.2-1B-Instruct"
base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
base_tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# 🛠 Apply chat template (for fine-tuned model only)
fine_tokenizer.chat_template = '''<|begin_of_text|>{% for message in messages %}{% if message["role"]=="user" %}<|start_header_id|>user<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="reasoning" %}<|start_header_id|>reasoning<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="final" %}<|start_header_id|>final<|end_header_id|>
{{message["content"]}}<|eot_id|>{% endif %}{% endfor %}'''

# 🔧 Inference pipelines
pipe_fine = pipeline("text-generation", model=fine_model, tokenizer=fine_tokenizer)
pipe_base = pipeline("text-generation", model=base_model, tokenizer=base_tokenizer)

# 📥 Input prompt (few-shot)
few_shot = [
    {"role": "user", "content": "What causes high blood pressure?"},
    {"role": "reasoning", "content": "High blood pressure, or hypertension, is often caused by lifestyle factors such as a high-salt diet, obesity, lack of exercise, stress, and smoking. These factors increase the resistance in blood vessels, forcing the heart to pump harder."},
    {"role": "final", "content": "High blood pressure is primarily caused by unhealthy lifestyle habits such as poor diet, obesity, and stress."},
]

# ✅ New query: Dengue Fever
query = [{"role": "user", "content": "What causes dengue fever?"}]
messages = few_shot + query + [
    {"role": "reasoning", "content": ""},
    {"role": "final", "content": ""}
]

# 🔡 Prepare prompts
prompt_fine = fine_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
prompt_base = query[-1]["content"]

# 🔮 Run inference
output_fine = pipe_fine(prompt_fine, max_new_tokens=512, do_sample=True, temperature=0.7)[0]["generated_text"]
output_base = pipe_base(prompt_base, max_new_tokens=512, do_sample=True, temperature=0.7)[0]["generated_text"]

# 🧠 Extract reasoning and final (from fine-tuned only)
reasoning_match = re.search(r"<\|start_header_id\|>reasoning<\|end_header_id\|>\n(.*?)<\|eot_id\|>", output_fine, re.DOTALL)
final_match = re.search(r"<\|start_header_id\|>final<\|end_header_id\|>\n(.*?)<\|eot_id\|>", output_fine, re.DOTALL)
reasoning = reasoning_match.group(1).strip() if reasoning_match else "⚠️ Not found"
final_output = final_match.group(1).strip() if final_match else "⚠️ Not found"

# 📊 Compare outputs
print("\n=== 🔬 Fine-Tuned Model Output ===")
print("\n🧠 Reasoning:\n", reasoning)
print("\n✅ Final Output:\n", final_output)

print("\n=== 🧪 Base Model Output ===")
print(output_base)


Device set to use cuda:0
Device set to use cuda:0



=== 🔬 Fine-Tuned Model Output ===

🧠 Reasoning:
 High blood pressure, or hypertension, is often caused by lifestyle factors such as a high-salt diet, obesity, lack of exercise, stress, and smoking. These factors increase the resistance in blood vessels, forcing the heart to pump harder.

✅ Final Output:
 High blood pressure is primarily caused by unhealthy lifestyle habits such as poor diet, obesity, and stress.

=== 🧪 Base Model Output ===
What causes dengue fever? Dengue fever is caused by the dengue virus, which is transmitted through the bite of an infected female Aedes mosquito. The virus is primarily found in tropical and subtropical regions of the world.
The virus is transmitted when an infected mosquito bites a person and then feeds on their blood. The virus then multiplies in the mosquito's body and is transmitted to other mosquitoes through a process called "mosquito vector-borne transmission". The virus is not transmitted through casual contact, such as touching or sharing 

In [67]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Load your model
model = AutoModelForCausalLM.from_pretrained("llama32_1b_reasoning")
tokenizer = AutoTokenizer.from_pretrained("llama32_1b_reasoning")

# Reuse your chat template
tokenizer.chat_template = '''<|begin_of_text|>{% for message in messages %}{% if message["role"]=="user" %}<|start_header_id|>user<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="reasoning" %}<|start_header_id|>reasoning<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="assistant" %}<|start_header_id|>assistant<|end_header_id|>
{{message["content"]}}<|eot_id|>{% endif %}{% endfor %}'''

# Sample question
custom_messages = [
    {"role": "user", "content": "Why do people get flu in winter?"},
    {"role": "reasoning", "content": ""},
    {"role": "assistant", "content": ""}
]

# Generate
prompt = tokenizer.apply_chat_template(custom_messages, tokenize=False, add_generation_prompt=True)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

output = pipe(prompt, max_new_tokens=300, do_sample=True, temperature=0.7)[0]["generated_text"]
print("Generated Output:\n", output)


Device set to use cuda:0


Generated Output:
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
Why do people get flu in winter?<|eot_id|><|start_header_id|>reasoning<|end_header_id|>
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<|eot_id|>user
So, why do people get the flu, especially in the winter? Let's dive into that.

First, what exactly is the flu? It's a type of viral infection, and it's caused by the influenza virus, which is a bit tricky because it can be spread through various means. So, how does it spread? Hmm, it's not just a one-way street; it can spread through the air, through direct contact with infected people, and even through the air we breathe if someone's respiratory droplets are in the air.

Now, why does it get more intense in the winter? Well, the winter season is a key factor here. It's like a big temperature swing from cold to warm, and this change can really make people more susceptible to infections. 

Oh, and the weather! During winter, there's often more wind, whic

In [69]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score

# Example reference and prediction
reference = "Flu spreads in winter due to people staying indoors more and low humidity helping the virus survive."
prediction = "People get the flu in winter because cold weather keeps them indoors and dry air helps the virus spread."

# Tokenize
reference_tokens = reference.split()
prediction_tokens = prediction.split()

# BLEU with smoothing
smooth = SmoothingFunction().method4
bleu = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smooth)
print("BLEU Score (smoothed):", bleu)

# METEOR with tokenized inputs
meteor = meteor_score([reference_tokens], prediction_tokens)
print("METEOR Score:", meteor)


BLEU Score (smoothed): 0.04862844018361942
METEOR Score: 0.44573643410852715


In [75]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# 🔁 Load your fine-tuned model
model = AutoModelForCausalLM.from_pretrained("llama32_1b_reasoning")
tokenizer = AutoTokenizer.from_pretrained("llama32_1b_reasoning")

# 🧠 Set the same chat template
tokenizer.chat_template = '''<|begin_of_text|>{% for message in messages %}{% if message["role"]=="user" %}<|start_header_id|>user<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="reasoning" %}<|start_header_id|>reasoning<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="final" %}<|start_header_id|>final<|end_header_id|>
{{message["content"]}}<|eot_id|>{% endif %}{% endfor %}'''

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Example evaluation
messages = [
    {"role": "user", "content": "Why do people get dengue fever?"},
    {"role": "reasoning", "content": ""},
    {"role": "final", "content": ""},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

output = pipe(prompt, max_new_tokens=300, temperature=0.7)[0]["generated_text"]
print(output)


Device set to use cuda:0


<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Why do people get dengue fever?<|eot_id|><|start_header_id|>reasoning<|end_header_id|>
<|eot_id|><|start_header_id|>final<|end_header_id|>
<|eot_id|>assistant
Dengue fever, a mosquito-borne illness, is caused by the dengue virus. It's a viral infection that affects millions of people worldwide. So, what's behind this? Let's dive into the details.

First off, the virus is spread through the bite of an infected Aedes mosquito. These mosquitoes are the main vectors for the virus. They carry it in their saliva during feeding. When you bite these mosquitoes, they inject the virus into your bloodstream. That's how the infection can start.

Now, why does this happen? Well, it's because the virus has a few tricks up its sleeve. It's a type of RNA virus, which means it can replicate itself and make multiple copies of itself. This is why it spreads so quickly, especially in crowded areas like cities and tropical regions. The virus is also

In [94]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Load your model
model = AutoModelForCausalLM.from_pretrained("llama32_1b_reasoning")
tokenizer = AutoTokenizer.from_pretrained("llama32_1b_reasoning")

# Reuse your chat template
tokenizer.chat_template = '''<|begin_of_text|>{% for message in messages %}{% if message["role"]=="user" %}<|start_header_id|>user<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="reasoning" %}<|start_header_id|>reasoning<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="assistant" %}<|start_header_id|>assistant<|end_header_id|>
{{message["content"]}}<|eot_id|>{% endif %}{% endfor %}'''

# Sample question
custom_messages = [
    {"role": "user", "content": "Why do children get diarrhea after drinking tap water?"},
    {"role": "reasoning", "content": ""},
    {"role": "assistant", "content": ""}
]


# Generate
prompt = tokenizer.apply_chat_template(custom_messages, tokenize=False, add_generation_prompt=True)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

output = pipe(prompt, max_new_tokens=300, do_sample=True, temperature=0.7)[0]["generated_text"]
print("Generated Output:\n", output)


Device set to use cuda:0


OutOfMemoryError: CUDA out of memory. Tried to allocate 1002.00 MiB. GPU 0 has a total capacity of 39.39 GiB of which 18.81 MiB is free. Process 797165 has 20.15 GiB memory in use. Process 933684 has 19.21 GiB memory in use. Of the allocated memory 19.65 GiB is allocated by PyTorch, and 12.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [10]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1
[0m

In [12]:
!pip install --upgrade transformers peft

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.15.2-py3-none-any.whl (411 kB)
Installing collected packages: peft
  Attempting uninstall: peft
    Found existing installation: peft 0.10.0
    Uninstalling peft-0.10.0:
      Successfully uninstalled peft-0.10.0
Successfully installed peft-0.15.2
[0m

In [22]:
# !pip install rouge_score
# !pip install jiwer
!pip install bert_score

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13
[0m

In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Load your model
model = AutoModelForCausalLM.from_pretrained("llama32_1b_reasoning")
tokenizer = AutoTokenizer.from_pretrained("llama32_1b_reasoning")

# Reuse your chat template
tokenizer.chat_template = '''<|begin_of_text|>{% for message in messages %}{% if message["role"]=="user" %}<|start_header_id|>user<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="reasoning" %}<|start_header_id|>reasoning<|end_header_id|>
{{message["content"]}}<|eot_id|>{% elif message["role"]=="assistant" %}<|start_header_id|>assistant<|end_header_id|>
{{message["content"]}}<|eot_id|>{% endif %}{% endfor %}'''

# Sample question
custom_messages = [
    {"role": "user", "content": "Why do children get diarrhea after drinking tap water?"},
    {"role": "reasoning", "content": ""},
    {"role": "assistant", "content": ""}
]


# Generate
prompt = tokenizer.apply_chat_template(custom_messages, tokenize=False, add_generation_prompt=True)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

output = pipe(prompt, max_new_tokens=300, do_sample=True, temperature=0.7)[0]["generated_text"]
print("Generated Output:\n", output)


ModuleNotFoundError: No module named 'transformers'