In [1]:
import os
# GPUs this notebook is allowed to use -- edit the list for your machine before running.
# NOTE(review): this presumably has to execute before any library initializes CUDA
# (i.e. right after a kernel restart) for the restriction to take effect -- confirm.
visible_gpu_ids = "1,2,3,4"
os.environ["CUDA_VISIBLE_DEVICES"] = visible_gpu_ids

# Derived from the list above so the count can never drift out of sync with it.
num_gpus = len([gpu_id for gpu_id in visible_gpu_ids.split(",") if gpu_id.strip()])

In [2]:
import json
import lm_eval
from lm_eval import evaluator, tasks
from lm_eval.utils import setup_logging
# Configure logging through lm_eval's own helper at DEBUG level, so the
# evaluation runs below emit DEBUG records in addition to INFO ones.
setup_logging("DEBUG") 


INFO 10-09 21:59:01 [__init__.py:216] Automatically detected platform cuda.


In [3]:
def _gemma3_hf_args(repo_id):
    """Build the comma-separated lm_eval ``model_args`` string for one checkpoint.

    Args:
        repo_id: Hugging Face repository id, used for both the weights
            (``pretrained=``) and the tokenizer (``tokenizer=``).

    Returns:
        A ``key=value,key=value`` string with no trailing comma.
    """
    return ",".join(
        (
            f"pretrained={repo_id}",
            f"tokenizer={repo_id}",
            "dtype=bfloat16",
            # Applied to every checkpoint: the 1b-it and 4b-it entries used to
            # omit this flag (and ended in a dangling comma), unlike the rest.
            "trust_remote_code=True",
        )
    )


# One args string per checkpoint. Note the 270m base repo carries no "-pt" suffix.
gemma3_270m_pt_args = _gemma3_hf_args("google/gemma-3-270m")
gemma3_270m_it_args = _gemma3_hf_args("google/gemma-3-270m-it")

gemma3_1b_pt_args = _gemma3_hf_args("google/gemma-3-1b-pt")
gemma3_1b_it_args = _gemma3_hf_args("google/gemma-3-1b-it")

gemma3_4b_pt_args = _gemma3_hf_args("google/gemma-3-4b-pt")
gemma3_4b_it_args = _gemma3_hf_args("google/gemma-3-4b-it")

gemma3_12b_pt_args = _gemma3_hf_args("google/gemma-3-12b-pt")
gemma3_12b_it_args = _gemma3_hf_args("google/gemma-3-12b-it")

gemma3_27b_pt_args = _gemma3_hf_args("google/gemma-3-27b-pt")
gemma3_27b_it_args = _gemma3_hf_args("google/gemma-3-27b-it")

In [4]:
# Label -> lm_eval model_args string, in ascending model size with the
# pretrained ("-pt") entry listed before its instruction-tuned ("-it") sibling.
gemma3_models = dict(
    [
        ("google/gemma-3-270m-pt", gemma3_270m_pt_args),
        ("google/gemma-3-270m-it", gemma3_270m_it_args),
        ("google/gemma-3-1b-pt", gemma3_1b_pt_args),
        ("google/gemma-3-1b-it", gemma3_1b_it_args),
        ("google/gemma-3-4b-pt", gemma3_4b_pt_args),
        ("google/gemma-3-4b-it", gemma3_4b_it_args),
        ("google/gemma-3-12b-pt", gemma3_12b_pt_args),
        ("google/gemma-3-12b-it", gemma3_12b_it_args),
        ("google/gemma-3-27b-pt", gemma3_27b_pt_args),
        ("google/gemma-3-27b-it", gemma3_27b_it_args),
    ]
)

In [5]:
# LoRA adapter locations per instruction-tuned model, one entry per task family.
# The values are placeholders to be replaced with the real adapter links.
gemma3_lora_adapters = {
    "google/gemma-3-270m-it": dict(
        classification="LINK_TO_HF_SAVED_LORA_ADAPTER_CLS",
        qa="LINK_TO_HF_SAVED_LORA_ADAPTER_QA",
    ),
    "google/gemma-3-1b-it": dict(
        classification="LINK_TO_HF_SAVED_LORA_ADAPTER_CLS",
        qa="LINK_TO_HF_SAVED_LORA_ADAPTER_QA",
    ),
    "google/gemma-3-4b-it": dict(
        classification="LINK_TO_HF_SAVED_LORA_ADAPTER_CLS",
        qa="LINK_TO_HF_SAVED_LORA_ADAPTER_QA",
    ),
    "google/gemma-3-12b-it": dict(
        classification="LINK_TO_HF_SAVED_LORA_ADAPTER_CLS",
        qa="LINK_TO_HF_SAVED_LORA_ADAPTER_QA",
    ),
    "google/gemma-3-27b-it": dict(
        classification="LINK_TO_HF_SAVED_LORA_ADAPTER_CLS",
        qa="LINK_TO_HF_SAVED_LORA_ADAPTER_QA",
    ),
}



In [18]:
# Task names handed to lm_eval, grouped by task family.
# NOTE(review): confirm every name below is a task registered in the installed
# lm_eval version before launching a full sweep.
datasets_to_evaluate_on = dict(
    question_answering=[
        "squadv2",     # SQuAD
        "triviaqa",
        "nq_open",     # Natural Questions
        "boolq",       # BoolQ
        "social_iqa",  # Social IQa
    ],
    classification=[
        "ag_news",     # AG News
        "sst2",
        "hellaswag",   # HellaSwag
        "arc_easy",
        "piqa",        # PIQA
    ],
)

In [19]:
# In-context-learning sweep settings: the few-shot counts to try and the
# generation kwargs for each named decoding strategy.
icl_variants = dict(
    k_shot=[0, 5, 10, 25],
    decoding_strategy=dict(
        # Sampling with temperature 1, nucleus 0.95 and top-k 64.
        default=dict(
            temperature=1,
            top_p=0.95,
            top_k=64,
            max_gen_toks=125,
            do_sample=True,
        ),
        # Deterministic decoding, no sampling.
        greedy=dict(
            temperature=0,
            do_sample=False,
            max_gen_toks=125,
        ),
        # Beam search with five beams, no sampling.
        beam=dict(
            num_beams=5,
            temperature=0,
            do_sample=False,
            max_gen_toks=125,
        ),
        # Nucleus sampling with a tighter nucleus and lower temperature.
        top_p=dict(
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            max_gen_toks=125,
        ),
    ),
)


In [20]:
def clean_gpu():
    """Best-effort GPU cleanup between evaluation runs.

    Steps, in order:
      1. Send SIGTERM to processes whose command line matches "vllm",
         "engine_core" or "torchrun".
      2. Wait two seconds for them to exit.
      3. Send SIGKILL to every other process holding a ``/dev/nvidia*`` node.
      4. Release this process's own unreferenced GPU memory (garbage
         collection, plus ``torch.cuda.empty_cache()`` when torch is loaded).

    The current process and its parent are never signalled. The previous
    version ran everything through one ``os.system`` shell script, which had
    two problems: ``pkill -f`` matches full command lines, and the shell's own
    command line contained the patterns, so the first ``pkill`` could terminate
    the script before the remaining steps ran; and ``fuser -k /dev/nvidia*``
    also kills the calling kernel once it has opened a GPU itself.

    NOTE(review): step 3 targets every NVIDIA device node on the machine, not
    only the GPUs listed in CUDA_VISIBLE_DEVICES -- confirm that is acceptable
    on a shared host.

    Missing ``pgrep``/``fuser`` binaries and processes that cannot be signalled
    are skipped silently, mirroring the ``|| true`` guards of the old script.
    """
    import gc
    import glob
    import os
    import signal
    import subprocess
    import sys
    import time

    print("Cleaning up vLLM and CUDA contexts")

    # Never signal ourselves or the process that launched us.
    protected_pids = {os.getpid(), os.getppid()}

    def _list_pids(cmd):
        """Run `cmd` without a shell and return the PIDs printed on its stdout."""
        try:
            completed = subprocess.run(cmd, check=False, capture_output=True, text=True)
        except FileNotFoundError:
            # Tool not installed: nothing to report.
            return set()
        return {int(token) for token in completed.stdout.split() if token.isdigit()}

    def _signal_all(pids, sig):
        """Send `sig` to each PID that is not protected, ignoring stale/forbidden ones."""
        for pid in pids - protected_pids:
            try:
                os.kill(pid, sig)
            except (ProcessLookupError, PermissionError):
                pass

    # SIGTERM is what `pkill` sends by default.
    for pattern in ("vllm", "engine_core", "torchrun"):
        _signal_all(_list_pids(["pgrep", "-f", pattern]), signal.SIGTERM)

    time.sleep(2)

    # SIGKILL is what `fuser -k` sends by default. fuser prints only PIDs on stdout.
    device_nodes = glob.glob("/dev/nvidia*")
    if device_nodes:
        _signal_all(_list_pids(["fuser", *device_nodes]), signal.SIGKILL)

    # Free memory held by this process. torch is looked up in sys.modules so
    # that this function does not import it on its own.
    gc.collect()
    torch = sys.modules.get("torch")
    if torch is not None and torch.cuda.is_available():
        torch.cuda.empty_cache()
# Run the cleanup once now, before any evaluation is started.
clean_gpu()

Cleaning up vLLM and CUDA contexts


In [21]:
# lm_eval backend used for every run ("hf" = Hugging Face transformers model).
model_name = "hf"
os.makedirs("results", exist_ok=True)

# Metric names to look for, in order of preference, for each task family.
# NOTE(review): "exact_match", "exact" and the "acc" fallback for QA are
# assumptions about what the listed tasks report -- verify against each
# task's lm_eval config.
_PREFERRED_METRICS = {
    "classification": ("acc_norm", "acc"),
    "question_answering": ("em", "exact_match", "exact", "f1", "acc"),
}


def _pick_metric(task_results, preferred):
    """Return the first preferred metric found in `task_results`, else None.

    Result keys are matched on the part before the first comma, so both
    "acc_norm,none" and "acc_norm" match the name "acc_norm" while
    "acc_norm_stderr,none" does not. Non-numeric values are skipped. A score
    of 0.0 is a valid result and is returned as such.
    """
    for wanted in preferred:
        for key, value in task_results.items():
            is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
            if key.split(",", 1)[0] == wanted and is_number:
                return value
    return None


def _evaluate_and_save(model, model_args_str, task, n_shot, gen_kwargs, tag):
    """Evaluate one (model, task, n_shot, decoding) combination and save it.

    Args:
        model: Label of the model; used to build the output path.
        model_args_str: lm_eval ``model_args`` string for the model.
        task: Name of the single task to evaluate.
        n_shot: Number of few-shot examples.
        gen_kwargs: Generation kwargs for this decoding strategy.
        tag: Decoding-strategy name embedded in the output file name.

    Returns:
        The results mapping of `task`.

    Raises:
        KeyError: if the returned results contain no entry named `task`.
    """
    results = evaluator.simple_evaluate(
        model=model_name,
        model_args=model_args_str,
        tasks=[task],
        num_fewshot=n_shot,
        gen_kwargs=gen_kwargs,
        batch_size="auto",
        device="auto",
    )["results"]
    task_results = results[task]

    model_slug = model.replace("/", "_")
    out_dir = os.path.join("results", model_slug, task)
    out_name = os.path.join(out_dir, f"{model_slug}_{n_shot}shot_{tag}_ds.json")
    os.makedirs(out_dir, exist_ok=True)

    with open(out_name, "w") as f:
        # default=str keeps a non-JSON-native value from discarding a finished run.
        json.dump(task_results, f, indent=2, default=str)
    print(f"Saved to {out_name}")
    return task_results


for model, model_args_str in gemma3_models.items():
    # Only the instruction-tuned checkpoints are evaluated.
    if not model.endswith("it"):
        continue

    for task_type, datasets in datasets_to_evaluate_on.items():
        preferred_metrics = _PREFERRED_METRICS.get(task_type, ("acc",))

        for task in datasets:
            # Phase 1: sweep the few-shot count using the default decoding strategy.
            default_ds = icl_variants['decoding_strategy']['default']
            best_k_shot = None
            best_k_perf = float("-inf")

            for n_shot in icl_variants['k_shot']:
                print(f"\n🔹 Evaluating model  : {model} on task {task} with ({n_shot}-shots), default strategy")
                try:
                    task_results = _evaluate_and_save(
                        model, model_args_str, task, n_shot, default_ds, "default"
                    )
                finally:
                    # Free the GPUs whether or not the run succeeded.
                    clean_gpu()

                metric_val = _pick_metric(task_results, preferred_metrics)
                if metric_val is None:
                    print(f"No metric among {preferred_metrics} reported for task {task}; "
                          f"{n_shot}-shot run not ranked")
                    continue
                if metric_val > best_k_perf:
                    best_k_perf = metric_val
                    best_k_shot = n_shot

            if best_k_shot is None:
                # Without a ranked run there is no shot count to reuse below.
                print(f"No rankable result for task {task}; skipping decoding-strategy sweep")
                continue
            print(f"Best performance with {best_k_shot}-shots, performance : {best_k_perf} on  {task_type}, task {task}")

            # Phase 2: at the best few-shot count, try the remaining decoding strategies.
            for decoding_strategy, ds_kwargs in icl_variants['decoding_strategy'].items():
                if decoding_strategy == "default":
                    continue
                print(f"\n🔹 Evaluating model  : {model} on task {task} with ({best_k_shot}-shots), decoding_strategy : {decoding_strategy}")
                try:
                    _evaluate_and_save(
                        model, model_args_str, task, best_k_shot, ds_kwargs, decoding_strategy
                    )
                finally:
                    clean_gpu()
                

2025-10-09:22:50:40 INFO     [evaluator:202] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-10-09:22:50:40 INFO     [evaluator:240] Initializing hf model, with arguments: {'pretrained': 'google/gemma-3-270m-it', 'tokenizer': 'google/gemma-3-270m-it', 'dtype': 'bfloat16',
        'trust_remote_code': True}
2025-10-09:22:50:40 INFO     [models.huggingface:155] Device not specified
2025-10-09:22:50:40 INFO     [models.huggingface:156] Cuda Available? True



ðŸ”¹ Evaluating model  : google/gemma-3-270m-it on task squad_qa with (0-shots), default strategy


2025-10-09:22:50:41 DEBUG    [models.huggingface:528] Using model type 'causal'
2025-10-09:22:50:42 INFO     [models.huggingface:414] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda'}
2025-10-09:22:50:43 INFO     [models.huggingface:254] Model type is 'gemma3_text', part of the Gemma family--a BOS token will be used as Gemma underperforms without it.


KeyError: 'squad_qa'