In [None]:
from google.colab import userdata

from huggingface_hub import login
# Authenticate with the Hugging Face Hub using the 'HF_TOKEN' Colab secret and
# also store the token as a git credential.
login(
    token=userdata.get('HF_TOKEN'),
    add_to_git_credential=True,
)

In [None]:
# Source of the sample function every prompt asks the model to document.
# Note that it already carries a plain (non-Google-style) docstring.
function_code = """
from typing import List

def below_zero(operations: List[int]) -> bool:
    \"\"\"
    You're given a list of deposit and withdrawal operations
    on a bank account that starts with zero balance. Your task is to
    detect if at any point the balance of account falls below zero,
    and at that point function should return True.
    \"\"\"
    balance = 0
    for op in operations:
        balance += op
        if balance < 0:
            return True
    return False
"""



# Longest prompt variant: polite, role-setting wording with repeated
# "code only" output constraints. `function_code` is interpolated verbatim.
complete_prompt_text = f"""
Please act as an expert Python software engineer. Given the python function below:
{function_code}
I would appreciate it if you could generate a complete and professional Google-style docstring.
The docstring should not include any extra commentary, strictly limited to include the docstring itself and the original function code.
CODE ONLY. Use standard Python indentation. Thank you.
Do not add explanations, notes, or text outside of the code.
Return only the function code with its docstring, without markdown fences or extra text before or after.
"""


# Medium-length variant: same task and output constraints, without the
# role-setting and courtesy phrases.
concise_prompt_text = f"""
Generate COMPLETE GOOGLE style docstring for the following Python function:
{function_code}
Output the docstring with the function code. Do not include explanations, notes, or text outside the code.
Return only the function code with its docstring, without markdown fences or extra text.
"""

# Shortest variant: a single instruction line plus a one-line output constraint.
# Unlike the other two, this string has no leading or trailing newline.
ultra_concise_prompt_text = f"""Add GOOGLE style docstring to function:
{function_code}
Output code only, no text."""


# Maps each prompt variant's name to its fully rendered prompt text.
PROMPTS = {
    "complete_prompt_text": complete_prompt_text,
    "concise_prompt_text": concise_prompt_text,
    "ultra_concise_prompt_text": ultra_concise_prompt_text
}

In [None]:
# %%
import gc
import math
import os, time, random
from typing import Dict, List

import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed

# Hugging Face Hub identifier of the model being benchmarked.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

# Where to save the CSV
OUT_DIR = "./mistral_benchmark_out"
# Create the output directory up front; no error if it already exists.
os.makedirs(OUT_DIR, exist_ok=True)
CSV_PATH = os.path.join(OUT_DIR, "mistral7b_quant_benchmark.csv")

# Generation settings (tweak as needed)
MAX_NEW_TOKENS = 512  # upper bound on tokens generated per call
TEMPERATURE    = 1.0  # sampling temperature passed to generate()

# Epoch count
# Number of repeated generations per (quantization, prompt) pair; this is a
# repetition count for the benchmark, not a training epoch.
EPOCHS = 5

# Device
# "cuda" when a CUDA device is visible to torch, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"



In [None]:
# %%
def load_model_and_tokenizer(quantization: str):
    """Load the tokenizer and the model for MODEL_ID in the requested precision.

    Args:
        quantization: One of "unquantized", "8bit" or "4bit".
            "unquantized" loads float16 weights when ``device`` is "cuda" and
            float32 otherwise; "8bit" uses bitsandbytes 8-bit loading; "4bit"
            uses NF4 with double quantization.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``quantization`` is not one of the supported names.
            The tokenizer has already been loaded by the time this is raised.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

    def _compute_dtype():
        # Half precision only makes sense on GPU; fall back to float32 on CPU.
        return torch.float16 if device == "cuda" else torch.float32

    # Collect only the arguments that differ between loading modes; everything
    # else is shared by the single from_pretrained call below.
    if quantization == "unquantized":
        mode_kwargs = {"torch_dtype": _compute_dtype()}
    elif quantization == "8bit":
        mode_kwargs = {
            "quantization_config": BitsAndBytesConfig(load_in_8bit=True),
        }
    elif quantization == "4bit":
        mode_kwargs = {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=_compute_dtype(),
                bnb_4bit_use_double_quant=True,
            ),
        }
    else:
        raise ValueError(f"Unknown quantization: {quantization}")

    loaded_model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        **mode_kwargs,
    )
    return loaded_model, tokenizer


In [None]:
def count_tokens(text: str, tok: AutoTokenizer) -> int:
    """Return how many token ids ``tok`` produces for ``text``.

    Special tokens are not added, so only the tokens that come from the text
    itself are counted.
    """
    encoded = tok(text, add_special_tokens=False)
    token_ids = encoded["input_ids"]
    return len(token_ids)


In [None]:
@torch.inference_mode()
def generate_once(model, tok, prompt_text: str, seed: int = None):
    """Run one sampled generation and report its timing and token counts.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tok: Tokenizer used to encode the prompt and decode the output.
        prompt_text: Prompt string, already formatted for the model.
        seed: Seed passed to ``set_seed`` before sampling. When None, a random
            seed between 1 and 10_000_000 is drawn and reported back.

    Returns:
        A dict with keys:
            generated_text: Decoded continuation only (prompt removed), stripped.
            inference_time_s: Wall-clock seconds spent inside ``model.generate``.
            T_in: Token count of ``prompt_text`` as given by ``count_tokens``.
            T_out: Token count of the decoded continuation, obtained by
                re-tokenizing the unstripped text, so it is approximate.
            T_total: ``T_in + T_out``.
            seed: The seed actually used.
    """
    if seed is None:
        seed = random.randint(1, 10_000_000)
    set_seed(seed)

    # Prepare inputs
    enc = tok(prompt_text, return_tensors="pt", add_special_tokens=True)
    input_ids = enc["input_ids"].to(model.device)
    # Pass the mask explicitly: pad and eos share an id here, so generate()
    # cannot infer it from the input and would otherwise emit a warning.
    attention_mask = enc.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(model.device)
    prompt_len = input_ids.shape[-1]

    # Measure inference time; synchronize so pending CUDA work is not
    # attributed to (or omitted from) the timed region.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    out = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=TEMPERATURE,
        pad_token_id=tok.eos_token_id,
        eos_token_id=tok.eos_token_id,
    )
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t1 = time.perf_counter()

    # Decode only the newly generated ids (everything after the prompt).
    generated = tok.decode(out[0][prompt_len:], skip_special_tokens=True)

    # Token counts
    T_in = count_tokens(prompt_text, tok)
    T_out = count_tokens(generated, tok)
    T_tot = T_in + T_out

    return {
        "generated_text": generated.strip(),
        "inference_time_s": t1 - t0,
        "T_in": T_in,
        "T_out": T_out,
        "T_total": T_tot,
        "seed": seed,
    }


In [None]:
def format_inst(prompt: str, system: str = None) -> str:
    """Wrap ``prompt`` in ``[INST] ... [/INST]`` instruction markers.

    Args:
        prompt: The user instruction text.
        system: Optional text placed before the prompt, separated from it by a
            blank line. Ignored when None or empty.

    Returns:
        The wrapped prompt string.
    """
    # Guard clause: no (or empty) system text means the prompt is wrapped alone.
    if not system:
        return f"[INST] {prompt} [/INST]"
    return f"[INST] {system}\n\n{prompt} [/INST]"

In [None]:
!nvidia-smi
!pip -q install -U bitsandbytes accelerate transformers

Mon Oct  6 06:20:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   70C    P8             11W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import bitsandbytes as bnb, torch
# Report the library versions in use, one line per entry; each tuple is
# printed with its fields separated by single spaces.
version_report = [
    ("bitsandbytes:", bnb.__version__),
    ("Torch:", torch.__version__, "CUDA:", torch.version.cuda),
]
for fields in version_report:
    print(*fields)

bitsandbytes: 0.48.1
Torch: 2.8.0+cu126 CUDA: 12.6


In [None]:
# %%
# Benchmark driver: for every quantization mode, load the model once, then run
# each prompt EPOCHS times and collect one record per generation.
results: List[Dict] = []
quantizations = ["unquantized", "8bit", "4bit"]

# Loop-invariant short model name, e.g. "Mistral-7B-Instruct-v0.3".
model_name = MODEL_ID.split("/")[-1]

for quant in quantizations:
    print(f"\n=== Loading model ({quant}) ===")
    model, tok = load_model_and_tokenizer(quant)
    model.eval()

    for prompt_name, prompt_text in PROMPTS.items():
        inst = format_inst(prompt_text)  # add [INST] ... [/INST]
        for epoch in range(1, EPOCHS + 1):
            # Seed by epoch so each repetition is reproducible across runs.
            rec = generate_once(model, tok, inst, seed=epoch)
            results.append({
                "Model": model_name,
                "Quantization": quant,
                "Prompt": prompt_name,                # key of PROMPTS
                "Epoch": epoch,
                "Output": rec["generated_text"],
                "T_in": rec["T_in"],
                "T_out": rec["T_out"],
                "T_total": rec["T_total"],
                "InferenceTime": rec["inference_time_s"],
                "Accuracy": True,                     # placeholder: outputs are not evaluated here
                "Hardware": "Cloud",                  # fixed label for this run
            })
            print(f"[{quant} | {prompt_name} | epoch {epoch}] "
                  f"T_in={rec['T_in']} T_out={rec['T_out']} T_total={rec['T_total']} "
                  f"time={rec['inference_time_s']:.3f}s")

    # Free memory between quantizations. Dropping the name is not enough on its
    # own: run the garbage collector so the model's tensors are actually
    # released before asking the CUDA allocator to return its cached blocks.
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()



=== Loading model (unquantized) ===


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[unquantized | complete_prompt_text | epoch 1] T_in=241 T_out=202 T_total=443 time=78.288s
[unquantized | complete_prompt_text | epoch 2] T_in=241 T_out=291 T_total=532 time=99.957s
[unquantized | complete_prompt_text | epoch 3] T_in=241 T_out=121 T_total=362 time=41.378s
[unquantized | complete_prompt_text | epoch 4] T_in=241 T_out=165 T_total=406 time=56.381s
[unquantized | complete_prompt_text | epoch 5] T_in=241 T_out=150 T_total=391 time=51.173s
[unquantized | concise_prompt_text | epoch 1] T_in=184 T_out=130 T_total=314 time=44.403s
[unquantized | concise_prompt_text | epoch 2] T_in=184 T_out=96 T_total=280 time=33.140s
[unquantized | concise_prompt_text | epoch 3] T_in=184 T_out=102 T_total=286 time=34.794s
[unquantized | concise_prompt_text | epoch 4] T_in=184 T_out=91 T_total=275 time=31.148s
[unquantized | concise_prompt_text | epoch 5] T_in=184 T_out=91 T_total=275 time=31.205s
[unquantized | ultra_concise_prompt_text | epoch 1] T_in=139 T_out=143 T_total=282 time=48.488s
[u

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[8bit | complete_prompt_text | epoch 1] T_in=241 T_out=159 T_total=400 time=29.576s
[8bit | complete_prompt_text | epoch 2] T_in=241 T_out=213 T_total=454 time=39.207s
[8bit | complete_prompt_text | epoch 3] T_in=241 T_out=121 T_total=362 time=22.642s
[8bit | complete_prompt_text | epoch 4] T_in=241 T_out=238 T_total=479 time=44.509s
[8bit | complete_prompt_text | epoch 5] T_in=241 T_out=150 T_total=391 time=27.806s
[8bit | concise_prompt_text | epoch 1] T_in=184 T_out=98 T_total=282 time=18.486s
[8bit | concise_prompt_text | epoch 2] T_in=184 T_out=96 T_total=280 time=17.617s
[8bit | concise_prompt_text | epoch 3] T_in=184 T_out=102 T_total=286 time=19.175s
[8bit | concise_prompt_text | epoch 4] T_in=184 T_out=91 T_total=275 time=16.789s
[8bit | concise_prompt_text | epoch 5] T_in=184 T_out=95 T_total=279 time=17.981s
[8bit | ultra_concise_prompt_text | epoch 1] T_in=139 T_out=115 T_total=254 time=21.111s
[8bit | ultra_concise_prompt_text | epoch 2] T_in=139 T_out=106 T_total=245 time

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[4bit | complete_prompt_text | epoch 1] T_in=241 T_out=203 T_total=444 time=15.100s
[4bit | complete_prompt_text | epoch 2] T_in=241 T_out=143 T_total=384 time=10.848s
[4bit | complete_prompt_text | epoch 3] T_in=241 T_out=156 T_total=397 time=11.981s
[4bit | complete_prompt_text | epoch 4] T_in=241 T_out=149 T_total=390 time=11.400s
[4bit | complete_prompt_text | epoch 5] T_in=241 T_out=207 T_total=448 time=15.459s
[4bit | concise_prompt_text | epoch 1] T_in=184 T_out=126 T_total=310 time=9.672s
[4bit | concise_prompt_text | epoch 2] T_in=184 T_out=96 T_total=280 time=6.937s
[4bit | concise_prompt_text | epoch 3] T_in=184 T_out=161 T_total=345 time=12.111s
[4bit | concise_prompt_text | epoch 4] T_in=184 T_out=142 T_total=326 time=10.741s
[4bit | concise_prompt_text | epoch 5] T_in=184 T_out=102 T_total=286 time=7.979s
[4bit | ultra_concise_prompt_text | epoch 1] T_in=139 T_out=154 T_total=293 time=11.533s
[4bit | ultra_concise_prompt_text | epoch 2] T_in=139 T_out=114 T_total=253 time

In [None]:
# %%
# Column order of the exported benchmark table.
result_columns = [
    "Model", "Quantization", "Prompt", "Epoch", "Output",
    "T_in", "T_out", "T_total", "InferenceTime", "Accuracy", "Hardware",
]

# Build the table from the collected records and write it to CSV_PATH.
df = pd.DataFrame(results, columns=result_columns)
df.to_csv(CSV_PATH, index=False)

# Display the output path together with a preview of the first rows.
CSV_PATH, df.head()


('./mistral_benchmark_out/mistral7b_quant_benchmark.csv',
                       Model Quantization                Prompt  Epoch  \
 0  Mistral-7B-Instruct-v0.3  unquantized  complete_prompt_text      1   
 1  Mistral-7B-Instruct-v0.3  unquantized  complete_prompt_text      2   
 2  Mistral-7B-Instruct-v0.3  unquantized  complete_prompt_text      3   
 3  Mistral-7B-Instruct-v0.3  unquantized  complete_prompt_text      4   
 4  Mistral-7B-Instruct-v0.3  unquantized  complete_prompt_text      5   
 
                                               Output  T_in  T_out  T_total  \
 0  ```python\nfrom typing import List\n\n"""\nGiv...   241    202      443   
 1  ```python\n"""\nDetects if the balance of a ba...   241    291      532   
 2  ```python\nfrom typing import List\n\ndef belo...   241    121      362   
 3  ```python\nfrom typing import List\n\n"""\nDet...   241    165      406   
 4  ```python\nfrom typing import List\n\n'''\nDet...   241    150      391   
 
    InferenceTime  A