In [1]:
# !pip install lm_eval
# !pip install -U bitsandbytes

In [2]:
import os
import json
import argparse
from datetime import datetime
import subprocess
import sys
import torch

In [3]:
# Security: never hardcode access tokens in a notebook. The token is read from
# the HF_TOKEN environment variable (export it before starting Jupyter, or run
# `hf auth login`). Any token previously committed in this cell must be treated
# as leaked and revoked in the Hugging Face account settings.
HF_API_KEY = os.environ.get("HF_TOKEN")

if not HF_API_KEY:
    # Public models still download without a token; gated/private ones will not.
    print("HF_TOKEN is not set; gated or private Hugging Face models may fail to download.")

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# !git config --global credential.helper store

In [5]:
# !hf auth login

In [6]:
# !git clone https://huggingface.co/MohamedAhmedAE/Llama-3.2-3B-Instruct-Medical-Finetune-v4

In [7]:
# Base model to evaluate, given as a Hugging Face model ID.
model_name = "m42-health/Llama3-Med42-8B"
# Optional PEFT/LoRA adapter to apply on top of the base model; None means the
# base model is evaluated without an adapter.
adapter_path = None

In [8]:
# adapter_path= "MohamedAhmedAE/Llama-3.1-8B-Instruct-Medical-Finetune-v3" 
# model_name = "meta-llama/Llama-3.1-8B-Instruct"

# adapter_path= "MohamedAhmedAE/Llama-3.2-3B-Instruct-Medical-Finetune-v4" 
# model_name = "meta-llama/Llama-3.2-3B-Instruct"

# adapter_path= "MohamedAhmedAE/Llama-3.2-1B-Instruct-Medical-Finetune-v4" 
# model_name = "meta-llama/Llama-3.2-1B-Instruct"



In [9]:
# !ls ~/.cache/huggingface/hub


In [10]:
# !rm -rf /content/Llama-3.2-3B-Instruct-Medical-Finetune-v4


In [11]:
import os
import subprocess
import json


def _build_model_args(model_name, dtype, quantization, adapter_path):
    """Assemble the comma-separated ``--model_args`` value passed to lm_eval."""
    parts = [f"pretrained={model_name}"]

    if quantization == "8bit":
        parts.append("load_in_8bit=True")
    elif quantization == "4bit":
        parts.append("load_in_4bit=True")
    elif quantization:
        # Previously an unrecognised value silently fell through to an
        # unquantized load; keep that behavior but make it visible.
        print(
            f"Warning: unknown quantization {quantization!r} "
            "(expected '8bit' or '4bit'); loading without quantization."
        )

    parts.append(f"dtype={dtype}")

    if quantization == "4bit":
        # NOTE(review): the 4-bit compute dtype is fixed at float16 even when
        # ``dtype`` is something else -- confirm this is intended.
        parts.extend(
            [
                "bnb_4bit_compute_dtype=float16",
                "bnb_4bit_use_double_quant=True",
                "bnb_4bit_quant_type=nf4",
            ]
        )

    # Optional LoRA/PEFT adapter applied on top of the base model.
    if adapter_path:
        parts.append(f"peft={adapter_path}")

    return ",".join(parts)


def _write_run_log(output_dir, cmd, stdout, stderr, returncode):
    """Save the command and its captured output to ``run_log.json``; return the path."""
    log_path = os.path.join(output_dir, "run_log.json")
    with open(log_path, "w") as f:
        json.dump(
            {
                "command": " ".join(cmd),
                "returncode": returncode,
                "stdout": stdout,
                "stderr": stderr,
            },
            f,
            indent=2,
        )
    return log_path


def run_mmlu_evaluation(
    model_name,
    output_dir="./mmlu_results",
    batch_size=8,
    device="cuda",
    dtype="float16",
    log_samples=True,
    specific_tasks=None,
    quantization=None,  # "8bit" or "4bit"
    num_fewshot=5,      # few-shot parameter
    adapter_path=None   # NEW: support PEFT adapters
):
    """
    Run MMLU evaluation using EleutherAI's lm-eval-harness.

    The ``lm_eval`` command-line tool is run as a subprocess with its output
    captured. The command, return code, stdout and stderr are written to
    ``<output_dir>/run_log.json`` for both successful and failed runs.

    Args:
        model_name (str): Hugging Face model ID (base model).
        output_dir (str): Where to save results (created if missing).
        batch_size (int): Evaluation batch size.
        device (str): Device (cuda / cpu).
        dtype (str): Data type (e.g., "float16").
        log_samples (bool): Save individual predictions.
        specific_tasks (list or str): Subset of tasks (default: "mmlu").
        quantization (str): "8bit" or "4bit" quantization. Any other non-empty
            value prints a warning and loads the model without quantization.
        num_fewshot (int): Number of few-shot examples. ``None`` omits the
            ``--num_fewshot`` flag so lm_eval falls back to its own default.
        adapter_path (str): Optional LoRA/PEFT adapter path.

    Returns:
        bool: True if ``lm_eval`` exited with status 0; False if it exited
        with a non-zero status or the executable could not be found.
    """

    os.makedirs(output_dir, exist_ok=True)

    model_args = _build_model_args(model_name, dtype, quantization, adapter_path)

    # Build command
    cmd = [
        "lm_eval",
        "--model", "hf",
        "--model_args", model_args,
        "--batch_size", str(batch_size),
        "--device", device,
        "--output_path", output_dir,
    ]

    # str(None) would put the literal "None" on the command line, so the flag
    # is skipped entirely when no few-shot count is requested.
    if num_fewshot is not None:
        cmd.extend(["--num_fewshot", str(num_fewshot)])

    # Add tasks
    if specific_tasks:
        if isinstance(specific_tasks, str):
            cmd.extend(["--tasks", specific_tasks])
        else:
            cmd.extend(["--tasks", ",".join(specific_tasks)])
    else:
        cmd.extend(["--tasks", "mmlu"])

    # Log samples if requested
    if log_samples:
        cmd.append("--log_samples")

    # Debug info
    print(f"Running command: {' '.join(cmd)}")
    print(f"Evaluating model: {model_name}")
    if adapter_path:
        print(f"Using adapter: {adapter_path}")
    print(f"Output directory: {output_dir}")
    print(f"Quantization: {quantization or 'None'}")
    if num_fewshot is not None:
        print(f"Few-shot setting: {num_fewshot}-shot")

    # Run evaluation
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except FileNotFoundError:
        # Raised when the lm_eval executable is not on PATH.
        print("❌ Error running evaluation")
        print("stderr: 'lm_eval' executable not found; install it with `pip install lm_eval`.")
        return False
    except subprocess.CalledProcessError as e:
        print("❌ Error running evaluation")
        print(f"stderr: {e.stderr}")
        # Keep the captured output of failed runs too, so they can be debugged.
        log_path = _write_run_log(output_dir, cmd, e.stdout, e.stderr, e.returncode)
        print(f"Logs saved to {log_path}")
        return False

    print("✅ Evaluation completed successfully!")
    print(result.stdout)

    # Save logs for reproducibility
    log_path = _write_run_log(output_dir, cmd, result.stdout, result.stderr, result.returncode)
    print(f"Logs saved to {log_path}")
    return True


In [12]:
def _find_results_file(output_dir):
    """Return the path of the results JSON under ``output_dir``, or None if there is none."""
    # Layout this notebook originally expected: a single results.json at the top level.
    legacy_path = os.path.join(output_dir, "results.json")
    if os.path.exists(legacy_path):
        return legacy_path

    # Newer lm-eval releases write results_<timestamp>.json inside a per-model
    # subdirectory, so search recursively and take the most recently modified.
    candidates = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(output_dir)
        for name in files
        if name.startswith("results") and name.endswith(".json")
    ]
    if not candidates:
        return None
    return max(candidates, key=os.path.getmtime)


def _task_accuracy(metrics):
    """Return the accuracy from one task's metrics dict, or None if it has none."""
    if "acc" in metrics:
        return metrics["acc"]
    # Newer lm-eval releases suffix the metric with its filter name, e.g.
    # "acc,none". Match on the part before the comma so that "acc_stderr,none"
    # and "acc_norm,none" are not mistaken for accuracy.
    for key, value in metrics.items():
        if key.split(",", 1)[0] == "acc":
            return value
    return None


def load_and_display_results(output_dir):
    """Load and display evaluation results.

    Looks for ``results.json`` directly in ``output_dir`` and, failing that,
    for the newest ``results*.json`` anywhere below it. Prints the accuracy of
    every task that reports one and, when the file holds more than one task,
    the unweighted mean of those accuracies.

    Args:
        output_dir (str): Directory that was passed to lm_eval as output path.

    Returns:
        The parsed JSON content of the results file, or None if no results
        file was found.
    """
    results_file = _find_results_file(output_dir)

    if results_file is None:
        print(f"Results file not found in: {output_dir}")
        return None

    with open(results_file, 'r', encoding="utf-8") as f:
        results = json.load(f)

    print("\n" + "="*50)
    print("MMLU EVALUATION RESULTS")
    print("="*50)

    task_metrics = results['results'] if 'results' in results else {}

    # Display per-task results
    accuracies = []
    for task, metrics in task_metrics.items():
        task_acc = _task_accuracy(metrics)
        if task_acc is not None:
            accuracies.append(task_acc)
            print(f"{task}: {task_acc * 100:.2f}%")

    # Calculate average if multiple tasks
    if len(task_metrics) > 1 and accuracies:
        avg_accuracy = sum(accuracies) / len(accuracies) * 100
        print(f"\nAverage Accuracy: {avg_accuracy:.2f}%")

    print("="*50)
    return results

In [13]:
# # Run evaluation with corrected task specification
# success = run_mmlu_evaluation(
#     output_dir="./mmlu_results",
#     model_name=model_name,
#     adapter_path=adapter_path,
#     batch_size=1,
#     device="cuda",
#     dtype="float16",
#     log_samples=True,
#     specific_tasks="mmlu_clinical_knowledge",  # This will now be handled correctly
#     quantization="8bit",
#     num_fewshot=0
# )

# if success:
#     # Load and display results
#     load_and_display_results("./mmlu_results")
# else:
#     print("Evaluation failed!")


In [14]:
def evaluate_specific_subjects(model_name, adapter_path):
    """Evaluate a model on a fixed list of medicine-related MMLU subjects.

    Delegates to ``run_mmlu_evaluation`` with batch size 1 and the output
    directory ``./mmlu_specific_subjects``; every other option keeps that
    function's default.

    Args:
        model_name: Base model identifier, forwarded unchanged.
        adapter_path: Adapter path, forwarded unchanged.

    Returns:
        Whatever ``run_mmlu_evaluation`` returns.
    """
    # lm-eval task names for the subjects covered by this run.
    medical_tasks = [
        "mmlu_anatomy",
        "mmlu_clinical_knowledge",
        "mmlu_college_biology",
        "mmlu_college_medicine",
        "mmlu_medical_genetics",
        "mmlu_nutrition",
        "mmlu_professional_medicine",
        "mmlu_virology",
    ]

    return run_mmlu_evaluation(
        model_name=model_name,
        adapter_path=adapter_path,
        output_dir="./mmlu_specific_subjects",
        batch_size=1,
        specific_tasks=medical_tasks,
    )

In [15]:
# Run the medical-subject evaluation for the model configured above; the
# returned success flag is shown as the cell output.
evaluate_specific_subjects(model_name=model_name, adapter_path=adapter_path)

Running command: lm_eval --model hf --model_args pretrained=m42-health/Llama3-Med42-8B,dtype=float16 --batch_size 1 --device cuda --output_path ./mmlu_specific_subjects --num_fewshot 5 --tasks mmlu_anatomy,mmlu_clinical_knowledge,mmlu_college_biology,mmlu_college_medicine,mmlu_medical_genetics,mmlu_nutrition,mmlu_professional_medicine,mmlu_virology --log_samples
Evaluating model: m42-health/Llama3-Med42-8B
Output directory: ./mmlu_specific_subjects
Quantization: None
Few-shot setting: 5-shot
✅ Evaluation completed successfully!
hf (pretrained=m42-health/Llama3-Med42-8B,dtype=float16), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1
|        Tasks        |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
|---------------------|------:|------|-----:|------|---|-----:|---|-----:|
|anatomy              |      1|none  |     5|acc   |↑  |0.6963|±  |0.0397|
|clinical_knowledge   |      1|none  |     5|acc   |↑  |0.7660|±  |0.0261|
|college_biology      |      1|none  |     5|

True