In [None]:
import pandas as pd

df_norm = pd.read_csv("data/processed/hf_normalized_scores.csv")

In [3]:
from datasets import load_dataset

# Load the Open LLM Leaderboard dataset from HuggingFace
dataset = load_dataset("open-llm-leaderboard")

# Inspect dataset structure
dataset

DatasetNotFoundError: Dataset 'open-llm-leaderboard' doesn't exist on the Hub or cannot be accessed.

In [4]:
import requests
import pandas as pd

URL = "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"

response = requests.get(URL)
response.raise_for_status()

data = response.json()

# Convert to DataFrame (one row per model entry)
df_raw = pd.json_normalize(data)

df_raw.head()

Unnamed: 0,id,model.name,model.sha,model.precision,model.type,model.weight_type,model.architecture,model.average_score,model.has_chat_template,evaluations.ifeval.name,...,features.is_flagged,features.is_official_provider,metadata.upload_date,metadata.submission_date,metadata.generation,metadata.base_model,metadata.hub_license,metadata.hub_hearts,metadata.params_billions,metadata.co2_cost
0,0-hero/Matter-0.2-7B-DPO_bfloat16_26a66f0d862e...,0-hero/Matter-0.2-7B-DPO,26a66f0d862e2024ce4ad0a09c37052ac36e8af6,bfloat16,chatmodels,Original,MistralForCausalLM,8.906361,True,IFEval,...,False,False,2024-04-13,2024-08-05,0,0-hero/Matter-0.2-7B-DPO,apache-2.0,3,7.242,1.219174
1,01-ai/Yi-1.5-34B_bfloat16_4b486f81c935a2dadde8...,01-ai/Yi-1.5-34B,4b486f81c935a2dadde84c6baa1e1370d40a098f,bfloat16,pretrained,Original,LlamaForCausalLM,25.646494,False,IFEval,...,False,True,2024-05-11,2024-06-12,0,01-ai/Yi-1.5-34B,apache-2.0,46,34.389,22.703398
2,01-ai/Yi-1.5-34B-32K_bfloat16_2c03a29761e4174f...,01-ai/Yi-1.5-34B-32K,2c03a29761e4174f20347a60fbe229be4383d48b,bfloat16,pretrained,Original,LlamaForCausalLM,26.727913,False,IFEval,...,False,True,2024-05-15,2024-06-12,0,01-ai/Yi-1.5-34B-32K,apache-2.0,36,34.389,23.154629
3,01-ai/Yi-1.5-34B-Chat_bfloat16_f3128b2d02d8298...,01-ai/Yi-1.5-34B-Chat,f3128b2d02d82989daae566c0a7eadc621ca3254,bfloat16,chatmodels,Original,LlamaForCausalLM,33.357994,True,IFEval,...,False,True,2024-05-10,2024-06-12,0,01-ai/Yi-1.5-34B-Chat,apache-2.0,268,34.389,22.423844
4,01-ai/Yi-1.5-34B-Chat-16K_bfloat16_ff74452e11f...,01-ai/Yi-1.5-34B-Chat-16K,ff74452e11f0f749ab872dc19b1dd3813c25c4d8,bfloat16,chatmodels,Original,LlamaForCausalLM,29.403555,True,IFEval,...,False,True,2024-05-15,2024-07-15,0,01-ai/Yi-1.5-34B-Chat-16K,apache-2.0,26,34.389,6.774022


In [5]:
# Show all columns related to evaluations
[col for col in df_raw.columns if col.startswith("evaluations.")]


['evaluations.ifeval.name',
 'evaluations.ifeval.value',
 'evaluations.ifeval.normalized_score',
 'evaluations.bbh.name',
 'evaluations.bbh.value',
 'evaluations.bbh.normalized_score',
 'evaluations.math.name',
 'evaluations.math.value',
 'evaluations.math.normalized_score',
 'evaluations.gpqa.name',
 'evaluations.gpqa.value',
 'evaluations.gpqa.normalized_score',
 'evaluations.musr.name',
 'evaluations.musr.value',
 'evaluations.musr.normalized_score',
 'evaluations.mmlu_pro.name',
 'evaluations.mmlu_pro.value',
 'evaluations.mmlu_pro.normalized_score']

In [6]:
# Select relevant columns for HF-native capability mapping

df_hf = df_raw[[
    "model.name",
    "evaluations.bbh.normalized_score",
    "evaluations.math.normalized_score",
    "evaluations.mmlu_pro.normalized_score",
    "evaluations.gpqa.normalized_score",
    "evaluations.ifeval.normalized_score",
]].copy()

# Rename columns to paper-friendly function names
df_hf.columns = [
    "model",
    "Reasoning",           # BBH
    "Problem-Solving",     # MATH
    "Learning",            # MMLU-PRO
    "Judgment",            # GPQA
    "Instruction-Following"  # IFEval
]

df_hf.head()

Unnamed: 0,model,Reasoning,Problem-Solving,Learning,Judgment,Instruction-Following
0,0-hero/Matter-0.2-7B-DPO,10.055525,1.435045,1.817376,1.230425,33.027921
1,01-ai/Yi-1.5-34B,42.749363,15.332326,40.732122,15.436242,28.411725
2,01-ai/Yi-1.5-34B-32K,43.381847,15.407855,41.212323,15.100671,31.186917
3,01-ai/Yi-1.5-34B-Chat,44.262826,27.719033,39.116061,15.324385,60.667584
4,01-ai/Yi-1.5-34B-Chat-16K,44.536157,21.374622,39.383865,11.744966,45.645


In [7]:
# Linear rescaling from HF normalized scores (~0–100) to 0–5 scale
df_norm = df_hf.copy()

for col in ["Reasoning", "Problem-Solving", "Learning", "Judgment", "Instruction-Following"]:
    df_norm[col] = df_norm[col] / 20.0

df_norm.head()

Unnamed: 0,model,Reasoning,Problem-Solving,Learning,Judgment,Instruction-Following
0,0-hero/Matter-0.2-7B-DPO,0.502776,0.071752,0.090869,0.061521,1.651396
1,01-ai/Yi-1.5-34B,2.137468,0.766616,2.036606,0.771812,1.420586
2,01-ai/Yi-1.5-34B-32K,2.169092,0.770393,2.060616,0.755034,1.559346
3,01-ai/Yi-1.5-34B-Chat,2.213141,1.385952,1.955803,0.766219,3.033379
4,01-ai/Yi-1.5-34B-Chat-16K,2.226808,1.068731,1.969193,0.587248,2.28225


In [8]:
from pathlib import Path

Path("data/processed").mkdir(parents=True, exist_ok=True)

In [9]:
df_norm.to_csv("data/processed/hf_normalized_scores.csv", index=False)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Load normalized scores
df = pd.read_csv("data/processed/hf_normalized_scores.csv")

functions = [
    "Reasoning",
    "Problem-Solving",
    "Learning",
    "Judgment",
    "Instruction-Following"
]

def radar_plot(model_name, scores):
    angles = np.linspace(0, 2 * np.pi, len(functions), endpoint=False)
    scores = scores.tolist()
    scores += scores[:1]
    angles = np.concatenate((angles, [angles[0]]))

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    ax.plot(angles, scores, linewidth=2)
    ax.fill(angles, scores, alpha=0.25)

    ax.set_thetagrids(np.degrees(angles[:-1]), functions)
    ax.set_ylim(0, 5)
    ax.set_title(model_name, size=14, pad=20)

    return fig

# Ensure output directory exists
Path("figures/capability_profiles").mkdir(parents=True, exist_ok=True)

for _, row in df.iterrows():
    scores = row[functions]
    fig = radar_plot(row["model"], scores)
    safe_model_name = row["model"].replace("/", "_")
    fig.savefig(f"figures/capability_profiles/{safe_model_name}.png")
    plt.close(fig)

In [4]:
# Define capability columns
capabilities = [
    "Reasoning",
    "Problem-Solving",
    "Learning",
    "Judgment",
    "Instruction-Following"
]

top_n = 3

top_models_per_capability = {}

for cap in capabilities:
    top_models = (
        df_norm[["model", cap]]
        .sort_values(by=cap, ascending=False)
        .head(top_n)
        .reset_index(drop=True)
    )
    top_models_per_capability[cap] = top_models

# Display results
for cap, table in top_models_per_capability.items():
    print(f"\nTop {top_n} models for {cap}:")
    display(table)


Top 3 models for Reasoning:


Unnamed: 0,model,Reasoning
0,qingy2024/Benchmaxx-Llama-3.2-1B-Instruct,3.834998
1,JungZoona/T3Q-qwen2.5-14b-v1.0-e3,3.27333
2,JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3,3.27333



Top 3 models for Problem-Solving:


Unnamed: 0,model,Problem-Solving
0,nvidia/AceMath-72B-Instruct,3.572508
1,nvidia/AceMath-7B-Instruct,3.168429
2,nvidia/AceInstruct-72B,3.130665



Top 3 models for Learning:


Unnamed: 0,model,Learning
0,MaziyarPanahi/calme-3.2-instruct-78b,3.501681
1,MaziyarPanahi/calme-3.1-instruct-78b,3.436115
2,dfurman/CalmeRys-78B-Orpo-v0.1,3.340075



Top 3 models for Judgment:


Unnamed: 0,model,Judgment
0,Steelskull/L3.3-MS-Nevoria-70b,1.470917
1,Steelskull/L3.3-Nevoria-R1-70b,1.459732
2,Tarek07/Progenitor-V1.1-LLaMa-70B,1.387025



Top 3 models for Instruction-Following:


Unnamed: 0,model,Instruction-Following
0,meta-llama/Llama-3.3-70B-Instruct,4.498791
1,meta-llama/Llama-3.1-70B-Instruct,4.334427
2,MaziyarPanahi/calme-2.1-qwen2.5-72b,4.33118


In [3]:
import pandas as pd

df_norm = pd.read_csv("data/processed/hf_normalized_scores.csv")
df_norm.head()

Unnamed: 0,model,Reasoning,Problem-Solving,Learning,Judgment,Instruction-Following
0,0-hero/Matter-0.2-7B-DPO,0.502776,0.071752,0.090869,0.061521,1.651396
1,01-ai/Yi-1.5-34B,2.137468,0.766616,2.036606,0.771812,1.420586
2,01-ai/Yi-1.5-34B-32K,2.169092,0.770393,2.060616,0.755034,1.559346
3,01-ai/Yi-1.5-34B-Chat,2.213141,1.385952,1.955803,0.766219,3.033379
4,01-ai/Yi-1.5-34B-Chat-16K,2.226808,1.068731,1.969193,0.587248,2.28225


In [7]:
# Define capability columns
capabilities = [
    "Reasoning",
    "Problem-Solving",
    "Learning",
    "Judgment",
    "Instruction-Following"
]

top_n = 3

top_models_per_capability = {}

for cap in capabilities:
    top_models = (
        df_norm[["model", cap]]
        .sort_values(
            by=[cap, "model"],           # primary + tie-break
            ascending=[False, True]
        )
        .head(top_n)
        .reset_index(drop=True)
    )

    # Add explicit ranking (1, 2, 3)
    top_models.insert(0, "Rank", range(1, len(top_models) + 1))

    top_models_per_capability[cap] = top_models

# Display results
for cap, table in top_models_per_capability.items():
    print(f"\nTop {top_n} models for {cap}:")
    display(table)


Top 3 models for Reasoning:


Unnamed: 0,Rank,model,Reasoning
0,1,qingy2024/Benchmaxx-Llama-3.2-1B-Instruct,3.834998
1,2,JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3,3.27333
2,3,JungZoona/T3Q-qwen2.5-14b-v1.0-e3,3.27333



Top 3 models for Problem-Solving:


Unnamed: 0,Rank,model,Problem-Solving
0,1,nvidia/AceMath-72B-Instruct,3.572508
1,2,nvidia/AceMath-7B-Instruct,3.168429
2,3,nvidia/AceInstruct-72B,3.130665



Top 3 models for Learning:


Unnamed: 0,Rank,model,Learning
0,1,MaziyarPanahi/calme-3.2-instruct-78b,3.501681
1,2,MaziyarPanahi/calme-3.1-instruct-78b,3.436115
2,3,dfurman/CalmeRys-78B-Orpo-v0.1,3.340075



Top 3 models for Judgment:


Unnamed: 0,Rank,model,Judgment
0,1,Steelskull/L3.3-MS-Nevoria-70b,1.470917
1,2,Steelskull/L3.3-Nevoria-R1-70b,1.459732
2,3,Tarek07/Progenitor-V1.1-LLaMa-70B,1.387025



Top 3 models for Instruction-Following:


Unnamed: 0,Rank,model,Instruction-Following
0,1,meta-llama/Llama-3.3-70B-Instruct,4.498791
1,2,meta-llama/Llama-3.1-70B-Instruct,4.334427
2,3,MaziyarPanahi/calme-2.1-qwen2.5-72b,4.33118


In [10]:
rows = []

for cap, table in top_models_per_capability.items():
    for _, row in table.iterrows():
        rows.append({
            "Capability": cap,
            "Rank": row["Rank"],
            "Model": row["model"],
            "Score (0–5)": row[cap]
        })

df_top3_ranked = pd.DataFrame(rows)
df_top3_ranked

Unnamed: 0,Capability,Rank,Model,Score (0–5)
0,Reasoning,1,qingy2024/Benchmaxx-Llama-3.2-1B-Instruct,3.834998
1,Reasoning,2,JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3,3.27333
2,Reasoning,3,JungZoona/T3Q-qwen2.5-14b-v1.0-e3,3.27333
3,Problem-Solving,1,nvidia/AceMath-72B-Instruct,3.572508
4,Problem-Solving,2,nvidia/AceMath-7B-Instruct,3.168429
5,Problem-Solving,3,nvidia/AceInstruct-72B,3.130665
6,Learning,1,MaziyarPanahi/calme-3.2-instruct-78b,3.501681
7,Learning,2,MaziyarPanahi/calme-3.1-instruct-78b,3.436115
8,Learning,3,dfurman/CalmeRys-78B-Orpo-v0.1,3.340075
9,Judgment,1,Steelskull/L3.3-MS-Nevoria-70b,1.470917


In [11]:
df_top3_ranked.to_csv(
    "data/processed/top3_models_ranked_per_capability.csv",
    index=False
)

In [12]:
# Get Rank-1 model for each capability
rank1_models = {
    cap: table.loc[table["Rank"] == 1, "model"].iloc[0]
    for cap, table in top_models_per_capability.items()
}

# Deduplicate (same model may win multiple capabilities)
rank1_unique_models = sorted(set(rank1_models.values()))

rank1_unique_models

['MaziyarPanahi/calme-3.2-instruct-78b',
 'Steelskull/L3.3-MS-Nevoria-70b',
 'meta-llama/Llama-3.3-70B-Instruct',
 'nvidia/AceMath-72B-Instruct',
 'qingy2024/Benchmaxx-Llama-3.2-1B-Instruct']

In [13]:
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

functions = [
    "Reasoning",
    "Problem-Solving",
    "Learning",
    "Judgment",
    "Instruction-Following"
]

def radar_plot(model_name, scores):
    angles = np.linspace(0, 2 * np.pi, len(functions), endpoint=False)
    scores = scores.tolist()
    scores += scores[:1]
    angles = np.concatenate((angles, [angles[0]]))

    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    ax.plot(angles, scores, linewidth=2)
    ax.fill(angles, scores, alpha=0.25)

    ax.set_thetagrids(np.degrees(angles[:-1]), functions)
    ax.set_ylim(0, 5)
    ax.set_title(model_name, size=14, pad=20)

    return fig

# Output directory for paper-ready figures
output_dir = Path("figures/capability_profiles_rank1")
output_dir.mkdir(parents=True, exist_ok=True)

for model in rank1_unique_models:
    row = df_norm[df_norm["model"] == model].iloc[0]
    scores = row[functions]

    fig = radar_plot(model, scores)
    safe_name = model.replace("/", "_")
    fig.savefig(output_dir / f"{safe_name}.png")
    plt.close(fig)

In [14]:
for cap, model in rank1_models.items():
    print(f"{cap}: {model}")

Reasoning: qingy2024/Benchmaxx-Llama-3.2-1B-Instruct
Problem-Solving: nvidia/AceMath-72B-Instruct
Learning: MaziyarPanahi/calme-3.2-instruct-78b
Judgment: Steelskull/L3.3-MS-Nevoria-70b
Instruction-Following: meta-llama/Llama-3.3-70B-Instruct


In [15]:
from pathlib import Path

# Where the Rank-1 plots live
fig_dir = Path("figures/capability_profiles_rank1")

# Canonical figure labels
figure_labels = {
    "Reasoning": "Figure1_Reasoning_BBH",
    "Problem-Solving": "Figure2_Problem-Solving_MATH",
    "Learning": "Figure3_Learning_MMLU-PRO",
    "Judgment": "Figure4_Judgment_GPQA",
    "Instruction-Following": "Figure5_Instruction-Following_IFEval"
}

for cap, model in rank1_models.items():
    safe_model_name = model.replace("/", "_")
    old_path = fig_dir / f"{safe_model_name}.png"
    new_path = fig_dir / f"{figure_labels[cap]}.png"

    if old_path.exists():
        old_path.rename(new_path)
        print(f"Renamed: {old_path.name} → {new_path.name}")
    else:
        print(f"WARNING: file not found for {model}")

Renamed: qingy2024_Benchmaxx-Llama-3.2-1B-Instruct.png → Figure1_Reasoning_BBH.png
Renamed: nvidia_AceMath-72B-Instruct.png → Figure2_Problem-Solving_MATH.png
Renamed: MaziyarPanahi_calme-3.2-instruct-78b.png → Figure3_Learning_MMLU-PRO.png
Renamed: Steelskull_L3.3-MS-Nevoria-70b.png → Figure4_Judgment_GPQA.png
Renamed: meta-llama_Llama-3.3-70B-Instruct.png → Figure5_Instruction-Following_IFEval.png
