In [None]:
import os
# Hardware selection: edit both values to match the machine before running.
# CUDA_VISIBLE_DEVICES is written into this process's environment here, ahead
# of the model-library imports in the cells below.
os.environ.update(CUDA_VISIBLE_DEVICES="0")
num_gpus = 1  # presumably the number of GPUs exposed above; keep the two in sync

In [None]:
import json
import tempfile

# Third-party imports keep their original relative order (peft, unsloth,
# lm_eval) in case any of them patch other libraries at import time.
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from unsloth.chat_templates import get_chat_template 
import lm_eval
from lm_eval import evaluator, tasks
from lm_eval.utils import setup_logging
# Call lm-eval's logging helper with the "INFO" verbosity string.
setup_logging("INFO") 


In [None]:
# Comma-separated model-argument strings, one per Gemma 3 checkpoint.
# These are later passed as `model_args=` to lm-eval's `simple_evaluate`.
# The 12B and 27B checkpoints additionally request 4-bit NF4 loading.

_BNB_NF4_ARGS = (
    "load_in_4bit=True,"
    "bnb_4bit_quant_type=nf4,"
    "bnb_4bit_compute_dtype=bfloat16"
)


def _gemma3_args(repo_id, quantized=False):
    """Build the model-args string for a single Hugging Face repo id.

    Args:
        repo_id: Repository used for both `pretrained=` and `tokenizer=`.
        quantized: When True, append the 4-bit NF4 bitsandbytes options.

    Returns:
        The argument string. The unquantized form ends with a trailing comma,
        the quantized form does not.
    """
    common = (
        f"pretrained={repo_id},"
        f"tokenizer={repo_id},"
        "dtype=bfloat16,"
        "trust_remote_code=True,"
    )
    if quantized:
        return common + _BNB_NF4_ARGS
    return common


# Small checkpoints: loaded in plain bfloat16.
# Note the 270m pretrained repo has no "-pt" suffix.
gemma3_270m_pt_args = _gemma3_args("google/gemma-3-270m")
gemma3_270m_it_args = _gemma3_args("google/gemma-3-270m-it")
gemma3_1b_pt_args = _gemma3_args("google/gemma-3-1b-pt")
gemma3_1b_it_args = _gemma3_args("google/gemma-3-1b-it")
gemma3_4b_pt_args = _gemma3_args("google/gemma-3-4b-pt")
gemma3_4b_it_args = _gemma3_args("google/gemma-3-4b-it")

# Large checkpoints: same options plus 4-bit NF4 quantization.
gemma3_12b_pt_args = _gemma3_args("google/gemma-3-12b-pt", quantized=True)
gemma3_12b_it_args = _gemma3_args("google/gemma-3-12b-it", quantized=True)
gemma3_27b_pt_args = _gemma3_args("google/gemma-3-27b-pt", quantized=True)
gemma3_27b_it_args = _gemma3_args("google/gemma-3-27b-it", quantized=True)

In [None]:
# Every configured checkpoint, as (display label, model-args string) pairs.
# The labels are used as dictionary keys and in result paths; the repository
# that is actually loaded is whatever the args string names.
_GEMMA3_LABELLED_ARGS = (
    ("google/gemma-3-270m-pt", gemma3_270m_pt_args),
    ("google/gemma-3-270m-it", gemma3_270m_it_args),
    ("google/gemma-3-1b-pt", gemma3_1b_pt_args),
    ("google/gemma-3-1b-it", gemma3_1b_it_args),
    ("google/gemma-3-4b-pt", gemma3_4b_pt_args),
    ("google/gemma-3-4b-it", gemma3_4b_it_args),
    ("google/gemma-3-12b-pt", gemma3_12b_pt_args),
    ("google/gemma-3-12b-it", gemma3_12b_it_args),
    ("google/gemma-3-27b-pt", gemma3_27b_pt_args),
    ("google/gemma-3-27b-it", gemma3_27b_it_args),
)
gemma3_models_full = dict(_GEMMA3_LABELLED_ARGS)

# Subset that the evaluation loop iterates over: only the two 270m entries.
_ACTIVE_LABELS = ("google/gemma-3-270m-pt", "google/gemma-3-270m-it")
gemma3_models = {label: gemma3_models_full[label] for label in _ACTIVE_LABELS}

In [None]:
# TODO add adapters evaluation
# Adapter repositories per base model and task family. Entries that still hold
# one of the two placeholder strings below have no published adapter yet.
_ADAPTER_TODO_CLS = "LINK_TO_HF_SAVED_LORA_ADAPTER_CLS"
_ADAPTER_TODO_QA = "LINK_TO_HF_SAVED_LORA_ADAPTER_QA"

gemma3_lora_adapters = {
    "google/gemma-3-270m-it": dict(
        classification="Mhara/google_gemma-3-270m-it_ft_ag_news",
        qa="Mhara/google_gemma-3-270m-it_ft_squad_v2",
    ),
    "google/gemma-3-1b-it": dict(
        classification="Mhara/google_gemma-3-1b-it_ft_ag_news",
        qa="Mhara/google_gemma-3-1b-it_ft_squad_v2",
    ),
    "google/gemma-3-4b-it": dict(
        classification="Mhara/google_gemma-3-4b-it_ft_ag_news",
        qa=_ADAPTER_TODO_QA,
    ),
    "google/gemma-3-12b-it": dict(
        classification=_ADAPTER_TODO_CLS,
        qa=_ADAPTER_TODO_QA,
    ),
    "google/gemma-3-27b-it": dict(
        classification=_ADAPTER_TODO_CLS,
        qa=_ADAPTER_TODO_QA,
    ),
}

def load_adapter(base_model_id, adapter_id):
    """Load a base causal LM with a PEFT adapter attached, plus its tokenizer.

    Args:
        base_model_id: Fallback base-model repo id. It is used only when the
            adapter's own config does not record `base_model_name_or_path`.
        adapter_id: Repo id (or path) of the saved PEFT adapter.

    Returns:
        A `(model, tokenizer)` tuple: the `PeftModel` switched to eval mode,
        and the tokenizer returned by `get_chat_template(..., "gemma3")`.
    """
    cfg = PeftConfig.from_pretrained(adapter_id)
    # The base model recorded in the adapter config wins over the argument.
    base_id = cfg.base_model_name_or_path or base_model_id

    raw_tok = AutoTokenizer.from_pretrained(base_id, use_fast=True, trust_remote_code=True)
    base = AutoModelForCausalLM.from_pretrained(
        base_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    model = PeftModel.from_pretrained(base, adapter_id)
    model.eval()

    # Return the tokenizer that get_chat_template hands back. The previous
    # code returned the raw tokenizer and dropped this result, which is only
    # equivalent if the helper mutates its argument in place (not verified).
    tok = get_chat_template(raw_tok, chat_template="gemma3")

    return model, tok

In [None]:
# Task names to evaluate, grouped by task family. The family name decides
# which metric the evaluation loop uses to compare few-shot settings.
datasets_to_evaluate_on = dict(
    question_answering=[
        "squadv2",     # SQuAD v2
        "triviaqa",
        "nq_open",     # Natural Questions; original note: tests long-context evaluation
        "boolq",       # BoolQ
        "social_iqa",  # Social IQa
    ],
    classification=[
        "ag_news",     # AG News
        "sst2",
        "hellaswag",   # HellaSwag
        "arc_easy",
        "piqa",        # PIQA
    ],
)

In [None]:
# In-context-learning sweep configuration:
#   k_shot            - few-shot counts tried for every task
#   decoding_strategy - named sets of generation keyword arguments; the
#                       evaluation loop passes one set per run as `gen_kwargs`
_MAX_GEN_TOKS = 125  # generation budget shared by every strategy

icl_variants = {
    "k_shot": [0, 5, 10, 25],
    "decoding_strategy": {
        "default": dict(
            temperature=1,
            top_p=0.95,
            top_k=64,
            max_gen_toks=_MAX_GEN_TOKS,
            do_sample=True,
        ),
        "greedy": dict(
            temperature=0,
            do_sample=False,
            max_gen_toks=_MAX_GEN_TOKS,
        ),
        "beam": dict(
            num_beams=5,
            temperature=0,
            do_sample=False,
            max_gen_toks=_MAX_GEN_TOKS,
        ),
        "top_p": dict(
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            max_gen_toks=_MAX_GEN_TOKS,
        ),
    },
}


In [None]:
def clean_gpu():
    """Best-effort GPU cleanup between evaluation runs.

    Steps, in order:
      1. Release memory cached by this process (garbage collection, plus
         `torch.cuda.empty_cache()` when CUDA has been initialised).
      2. Send SIGTERM to processes whose command line matches "vllm",
         "engine_core" or "torchrun".
      3. Wait two seconds, then SIGKILL every process that `fuser` reports as
         holding a `/dev/nvidia*` device.

    The current process and its parent are never signalled. The previous
    shell-script version had two problems here:
      * `pkill -f "vllm"` ran inside `sh -c "<script>"`; that shell's own
        command line contains "vllm", so pkill could terminate the script
        before the later commands ran.
      * `fuser -k /dev/nvidia*` would also kill the calling kernel once it
        holds a CUDA context (e.g. after an in-process model evaluation).

    WARNING: step 3 targets every NVIDIA device node, not only the devices in
    CUDA_VISIBLE_DEVICES. Do not use on a machine shared with other GPU jobs.

    Missing tools (`pgrep`, `fuser`) and processes that cannot be signalled
    are silently skipped. Returns None.
    """
    import gc
    import glob
    import os
    import signal
    import subprocess
    import time

    protected = {os.getpid(), os.getppid()}

    def _pids_from(argv):
        """Run `argv` and return the PIDs it prints to stdout (empty set on failure)."""
        try:
            proc = subprocess.run(
                argv,
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
                text=True,
                check=False,
            )
        except OSError:  # tool not installed / not executable
            return set()
        return {int(tok) for tok in (proc.stdout or "").split() if tok.isdigit()}

    def _signal_all(pids, sig):
        """Send `sig` to each PID except the protected ones; ignore failures."""
        for pid in sorted(pids - protected):
            try:
                os.kill(pid, sig)
            except OSError:  # already gone, or not ours to signal
                pass

    print("Cleaning up vLLM and CUDA contexts")

    # 1. Free what this process itself is holding.
    gc.collect()
    try:
        import torch

        if torch.cuda.is_initialized():
            torch.cuda.empty_cache()
    except (ImportError, RuntimeError):
        pass

    # 2. Politely stop stray helper processes (pkill's default signal is SIGTERM).
    for pattern in ("vllm", "engine_core", "torchrun"):
        _signal_all(_pids_from(["pgrep", "-f", pattern]), signal.SIGTERM)

    time.sleep(2)

    # 3. Force-kill anything else still attached to a GPU device node
    #    (fuser -k's default signal is SIGKILL). fuser prints PIDs on stdout.
    devices = sorted(glob.glob("/dev/nvidia*"))
    if devices:
        _signal_all(_pids_from(["fuser"] + devices), signal.SIGKILL)
# Run the cleanup once as soon as this cell executes.
clean_gpu()

In [None]:
def _result_path(model_id: str, task: str, n_shot: int, ds_name: str) -> str:
    model_id = model_id.replace("/", "_")
    out_dir = os.path.join("results",model_id , task)
    return os.path.join(out_dir, f"{model_id}_{n_shot}shot_{ds_name}.json")

def _result_exists_and_valid(path: str) -> bool:
    if not os.path.exists(path):
        return False
    try:
        with open(path, "r") as f:
            data = json.load(f)
    except Exception:
        return False

    metric_keys = ("acc", "acc_norm", "em", "f1",
                   "acc,none", "acc_norm,none", "em,none", "f1,none")
    return any(k in data for k in metric_keys)


def _safe_save_json(path: str, obj: dict):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with tempfile.NamedTemporaryFile("w", delete=False, dir=os.path.dirname(path), suffix=".tmp") as tmp:
        json.dump(obj, tmp, indent=2)
        tmp_path = tmp.name
    os.replace(tmp_path, path)

In [None]:
model_name = "hf"  # lm-eval model backend, passed as `model=` to simple_evaluate
os.makedirs("results", exist_ok=True)

# Metric base names, in priority order, used to compare few-shot settings.
# A base name matches the bare key ("acc"), the ",none" key ("acc,none") and
# any other filter-suffixed key ("exact_match,remove_whitespace").
# The first entries reproduce the original lookup order. The later ones are
# fallbacks for tasks that report a differently named score (e.g. exact-match
# tasks, or accuracy-scored tasks listed under question answering).
_CLASSIFICATION_METRICS = ("acc_norm", "acc")
_QA_METRICS = ("em", "f1", "exact_match", "exact", "acc")


def _pick_metric(metrics, task_type):
    """Return the score used to rank runs of one task, or None if absent.

    Args:
        metrics: Metrics mapping for a single task (fresh or loaded from disk).
        task_type: "classification" selects the classification metric list;
            any other value selects the question-answering list.

    Returns:
        The first numeric value found in priority order. A score of 0.0 is a
        valid result (the earlier `a or b or c` chain skipped it).
    """
    if not isinstance(metrics, dict):
        return None
    names = _CLASSIFICATION_METRICS if task_type == "classification" else _QA_METRICS
    for name in names:
        keys = [f"{name},none", name]
        keys.extend(sorted(
            key for key in metrics
            if isinstance(key, str) and key.startswith(f"{name},") and key not in keys
        ))
        for key in keys:
            value = metrics.get(key)
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                return value
    return None


def _evaluate_or_load(model_id, model_args_str, task, n_shot, strategy, gen_kwargs):
    """Return metrics for one (model, task, k-shot, strategy) run.

    A valid result file from an earlier run is reused; otherwise the task is
    evaluated with lm-eval, the metrics are saved, and the GPU is cleaned up
    (also when the evaluation raises).
    """
    out_path = _result_path(model_id, task, n_shot, strategy)

    if _result_exists_and_valid(out_path):
        print(f"Skip (already done): {out_path}")
        with open(out_path, "r") as f:
            return json.load(f)

    print(f"\n🔹 Evaluating model: {model_id} | task={task} | {n_shot}-shot | strategy={strategy}")
    try:
        results = evaluator.simple_evaluate(
            model=model_name,
            model_args=model_args_str,
            tasks=[task],
            num_fewshot=n_shot,
            gen_kwargs=dict(gen_kwargs),  # copy: keep the shared config untouched
            batch_size="auto",
            device="auto",
        )["results"]

        task_metrics = results[task]
        _safe_save_json(out_path, task_metrics)
        print(f"Saved to {out_path}")
    finally:
        # Only needed after a real evaluation; cached runs never touch the GPU.
        clean_gpu()
    return task_metrics


for model_id, model_args in gemma3_models.items():
    # Only labels ending in "it" are evaluated; all others are skipped.
    if not model_id.endswith("it"):
        continue
    if isinstance(model_args, (list, tuple)):
        model_args_str = ",".join(str(part) for part in model_args if part)
    else:
        model_args_str = str(model_args)

    for task_type, datasets in datasets_to_evaluate_on.items():
        for task in datasets:
            k_shot_variants = icl_variants["k_shot"]
            default_ds = icl_variants["decoding_strategy"]["default"]
            ds_name = "default"

            # Phase 1: sweep the few-shot count with the default decoding.
            best_k_shot = 10  # used when no run yields a comparable metric
            best_k_perf = None

            for n_shot in k_shot_variants:
                metrics = _evaluate_or_load(
                    model_id, model_args_str, task, n_shot, ds_name, default_ds
                )
                metric_val = _pick_metric(metrics, task_type)

                if metric_val is not None and (best_k_perf is None or metric_val > best_k_perf):
                    best_k_perf = metric_val
                    best_k_shot = n_shot

            print(f"Best performance with {best_k_shot}-shot(s): {best_k_perf} | type={task_type} | task={task}")

            # Phase 2: rerun the best few-shot count with every other strategy.
            for decoding_strategy, ds_kwargs in icl_variants["decoding_strategy"].items():
                if decoding_strategy == "default":
                    continue

                metrics = _evaluate_or_load(
                    model_id, model_args_str, task, best_k_shot, decoding_strategy, ds_kwargs
                )
