In [7]:
! pip install 'git+https://github.com/EleutherAI/lm-evaluation-harness.git'

! pip install torch --index-url https://download.pytorch.org/whl/cu121   # or cpu / cu118 as you wish
! pip install transformers accelerate sentencepiece

# LiteLLM proxy (for Gemini) and Google SDK
! pip install litellm google-generativeai --upgrade
! pip install openai transformers accelerate

Collecting git+https://github.com/EleutherAI/lm-evaluation-harness.git
  Cloning https://github.com/EleutherAI/lm-evaluation-harness.git to /tmp/pip-req-build-vhur44lz
  Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git /tmp/pip-req-build-vhur44lz
  Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 29ea6832cd913b055ec1d6962180c773e8a7ac88
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting accelerate>=0.26.0 (from lm_eval==0.4.8)
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate (from lm_eval==0.4.8)
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.16.0 (from lm_eval==0.4.8)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting jsonlines (from lm_eval==0.4.8)
  Downloading jsonlines-4

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "1,2,3"
import json, pathlib, sys
import pandas as pd

In [3]:
import json
import glob
import pandas as pd

In [5]:
# -- CONFIGURE THIS: top‐level folder containing one subfolder per model
RESULTS_ROOT = "./my_tasks/results_qwen_2_7b/"
# e.g. RESULTS_ROOT/
#           ├─ gemma__google-gemma-7b/
#           ├─ mistralai__Mistral-7B-Instruct-v0.1/
#           ├─ ... etc

def load_model_df(model_dir):
    # find the .jsonl sample file
    files = glob.glob(os.path.join(model_dir, "*.jsonl"))
    if not files:
        raise FileNotFoundError(f"No .jsonl samples found in {model_dir}")
    # pick the first or use a naming heuristic
    samples_file = sorted(files)[0]
    
    records = []
    with open(samples_file, "r", encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            subset = obj.get("metadata", {}).get("subset", "UNKNOWN")
            acc    = obj.get("result", {}).get("acc", 0.0)
            records.append({"subset": subset, "acc": acc})
    
    df = pd.DataFrame(records)
    agg = (
        df
        .groupby("subset")
        .agg(total=("acc", "size"), correct=("acc", "sum"))
        .reset_index()
    )
    agg["accuracy"] = agg["correct"] / agg["total"]
    return agg

In [6]:
def main():
    all_results = {}
    # iterate through each model's directory
    for entry in os.listdir(RESULTS_ROOT):
        model_dir = os.path.join(RESULTS_ROOT, entry)
        if not os.path.isdir(model_dir):
            continue
        try:
            df = load_model_df(model_dir)
            model_name = entry.split("__")[1] if "__" in entry else entry
            all_results[model_name] = df
        except Exception as e:
            print(f"⚠️ Skipping {entry}: {e}")

    # Now you have a dict: { model_name: DataFrame }
    # For example, to print each:
    for model, df in all_results.items():
        print(f"\n=== {model} ===")
        print(df.to_string(index=False))

    # Optionally, concatenate all into one big DataFrame:
    combined = (
        pd.concat(
            [df.assign(model=model) for model, df in all_results.items()],
            ignore_index=True
        )
        .pivot(index="subset", columns="model", values="accuracy")
        .fillna(0.0)
    )
    print("\n=== Combined accuracy matrix ===")
    print(combined)

In [7]:
main()


=== Qwen2-7B-Instruct ===
 subset  total  correct  accuracy
UNKNOWN   4590      0.0       0.0

=== Combined accuracy matrix ===
model    Qwen2-7B-Instruct
subset                    
UNKNOWN                0.0


In [8]:
df = pd.read_json(, lines=True)

In [86]:
jsonl_path = "./my_tasks/agro/results_agro_llama/VityaVitalich__Llama3.1-8b-instruct/samples_agro_mc_2025-05-27T19-58-45.920956.jsonl"

In [87]:
records = []
with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        subset = obj.get("doc", {}).get("subset", "UNKNOWN")
        acc = obj.get("acc", 0.0)
        records.append({"subset": subset, "acc": acc})

# Create DataFrame and compute metrics
df = pd.DataFrame(records)
grouped = df.groupby("subset").agg(
    total_examples=("acc", "size"),
    correct_predictions=("acc", "sum")
).reset_index()
grouped["accuracy"] = grouped["correct_predictions"] / grouped["total_examples"]

# Compute overall accuracy
total_examples = grouped["total_examples"].sum()
total_correct = grouped["correct_predictions"].sum()
overall_accuracy = total_correct / total_examples if total_examples > 0 else 0.0

# Append overall row using concat
overall_row = pd.DataFrame([{
    "subset": "overall_accuracy",
    "total_examples": total_examples,
    "correct_predictions": total_correct,
    "accuracy": overall_accuracy
}])
result = pd.concat([grouped, overall_row], ignore_index=True)
result['correct_predictions'] = result['correct_predictions'].astype(int)

In [88]:
result

Unnamed: 0,subset,total_examples,correct_predictions,accuracy
0,Ботаника,227,52,0.229075
1,Индустриальная аквакультура,251,89,0.354582
2,"Ихтиопатология: ветеринария, профилактика и оп...",246,53,0.215447
3,Кормление рыбы и других гидробионтов,259,68,0.262548
4,Кормопроизводство и луговодство,270,47,0.174074
5,"Марикультура. Разведение раков, креветок. Иску...",236,40,0.169492
6,Мелиоративное земледелие,270,99,0.366667
7,Общая генетика,260,76,0.292308
8,Общее земледелие,274,70,0.255474
9,Основы селекции,259,78,0.301158


In [75]:
sheet_name="Llama3.1-8b-instruct"
existing_excel = "agro_mc.xlsx"
with pd.ExcelWriter(existing_excel, engine="openpyxl") as writer:
    grouped.to_excel(writer,
                     sheet_name=sheet_name,
                     index=False,
                     float_format="%.4f")

print(f"Saved subset accuracies to '{existing_excel}' on sheet {sheet_name}")

Saved subset accuracies to 'agro_mc.xlsx' on sheet Llama3.1-8b-instruct


In [89]:
from openpyxl import load_workbook

sheet_name="Llama3.1-8b-instruct"
existing_excel = "agro_mc.xlsx"
book = load_workbook(existing_excel)

with pd.ExcelWriter(existing_excel, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:

    result.to_excel(
        writer,
        sheet_name=sheet_name,
        index=False,
        float_format="%.4f"
    )

print(f"Saved subset accuracies to '{output_path}' on sheet {sheet_name}")

Saved subset accuracies to 'med_mc.xlsx' on sheet Llama3.1-8b-instruct
