In [1]:
!pip install bitsandbytes accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl (60.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.2


In [2]:
from google.colab import userdata
token = userdata.get('HF_TOKEN')

In [3]:
import getpass
from huggingface_hub import login

# Hugging Face login
if token is None:
    token = getpass.getpass("Enter HF token (with write access): " )
login(token=token)

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# 1. Define the 4-bit configuration (Crucial for Speed & Low RAM)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_id = "Ismailea04/medgemma-4b-cardiology-merged" # Replace with your actual path

# 2. Load Model with Quantization and Map to GPU
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config, # <--- This fixes the RAM issue
    device_map="auto",               # <--- This puts it on the GPU automatically
    torch_dtype=torch.float32        # <--- Explicitly set torch_dtype
)

Loading model...


config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/883 [00:00<?, ?it/s]

In [5]:
# 3. Define the Inference Function
def generate_response(prompt):
    # Format the prompt exactly like the training data (Alpaca style)
    formatted_prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a cardiology expert. Answer the following question truthfully.

### Input:
{prompt}

### Response:
"""

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")

    # Ensure pad_token is set for generation
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token
        else:
            # Fallback if neither pad_token nor eos_token is available
            # This might require adding a special token if not present
            # For simplicity, assuming eos_token is usually available or can be set.
            # If a model truly has neither, this needs a more complex solution
            # e.g., tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            pass # Or raise an error, or set a dummy

    # Generate (Adjust max_new_tokens for longer answers)
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id, # Use the now-guaranteed pad_token_id
        )

    # Decode and print only the new part
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Clean up the output to show only the 'Response' part
    return response.split('### Response:')[-1]
# 4. Test it
print(generate_response("What are the main symptoms of Atrial Fibrillation?"))


The main symptoms of Atrial Fibrillation (AFib) are:
*   **Irregular heartbeat (palpitations):** AFib causes the upper chambers of the heart (atria) to beat irregularly and rapidly. This irregular rhythm can cause you to feel like your heart is skipping beats, fluttering, or pounding.
*   **Shortness of breath:** AFib can cause the heart to not pump enough blood to the body. This can lead to shortness of breath, especially during exertion or when lying down.
*   **Fatigue:** AFib can cause the heart to not pump enough blood to the body. This can lead to fatigue.
*   **Chest pain:** AFib can cause the heart to not pump enough blood to the body. This can lead to chest pain.
*   **Dizziness or lightheadedness:** AFib can cause the heart to not pump enough blood to the body. This can lead to dizziness or lightheadedness.
*   **Confusion:** AFib can cause the heart to not pump enough blood to the body. This can lead to confusion.

It is important to seek medical attention for any symptoms 