In [1]:
import pandas as pd
from pathlib import Path
import re
import yaml
from loguru import logger
import json
import logging


In [2]:
# Print the kernel's working directory via the shell. The relative output paths
# used later in this notebook (results.json, results.tex) resolve against it.
!pwd

/Users/juliawunderle/SuperGLEBer/src/evaluation


In [3]:
# This notebook logs through the standard library. Note that this rebinds
# `logger` and therefore shadows the loguru logger imported in the first cell.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
base_path = Path("/Users/juliawunderle/Desktop/2025-06-18")

# Column order of the collected results frame.
RESULT_COLUMNS = [
    "model", "task", "seed", "source", "micro-f1", "macro-f1", "acc", "Pearson", "qa-f1", "train_log_path"
]

# Tasks that are deliberately left out of the collection.
EXCLUDED_TASKS = {
    "sustaineval_regression_classification",
    "factclaiming_comments",
    "ner_europarl",
}

# Captures the micro-averaged F1 reported on a "DEV :" line of the training log.
MICRO_F1_PATTERN = re.compile(r"DEV\s+:.*?f1-score \(micro avg\)\s+([0-9.]+)")


def _config_section(cfg, key):
    """Return ``cfg[key]`` when it is a mapping, otherwise an empty dict."""
    section = cfg.get(key)
    return section if isinstance(section, dict) else {}


def _load_run_record(day, run):
    """Build one result record for a run directory.

    Args:
        day: Directory directly below ``base_path`` that contains ``run``.
        run: A sub-directory of ``day``. If it is the ``hydra_conf`` folder
            itself, its parent is treated as the run directory.

    Returns:
        A dict keyed by ``RESULT_COLUMNS``, or ``None`` when the run is skipped
        (missing/unreadable config, excluded task, missing log, or no
        parsable micro-F1 value). Every skip is logged.
    """
    # Compare the directory name instead of doing a substring replacement on the
    # whole path, so names such as "hydra_conf_old" are not mangled.
    if run.name == "hydra_conf":
        run = run.parent
    logger.info(f"Processing: {run}")

    config_path = run / "hydra_conf/config.yaml"
    if not config_path.exists():
        logger.warning(f"Missing config for {run}")
        return None

    try:
        with open(config_path) as f:
            # NOTE(review): yaml.safe_load is sufficient if the configs only
            # contain plain YAML types -- consider switching.
            cfg = yaml.load(f, Loader=yaml.FullLoader)
    except Exception as e:
        logger.error(f"Failed to load config from {config_path}: {e}")
        return None

    # An empty YAML file loads as None; anything but a mapping is unusable here.
    if not isinstance(cfg, dict):
        logger.warning(f"Skipping {run}, config is empty or not a mapping")
        return None

    task_name = _config_section(cfg, "task").get("task_name", None)
    if task_name is None:
        logger.warning(f"Skipping {run} as it has no task name")
        return None
    if task_name in EXCLUDED_TASKS:
        logger.warning(f"Skipping {run}, task {task_name} is excluded")
        return None

    model_cfg = _config_section(cfg, "model")
    model_name = model_cfg.get("model_name") or "unknown_model"
    if model_cfg.get("is_bidirectional", False):
        model_name += "_bidirectional"

    logger.info(f"Task name: {task_name}, Model name: {model_name}")

    log_path = run / task_name / "training_logs" / "training.log"
    if not log_path.exists():
        logger.warning(f"Skipping {run}, no training log")
        return None

    log = log_path.read_text(encoding="utf-8", errors="replace")

    if "f1-score (micro avg)" not in log:
        logger.warning(f"Skipping {run}, no micro-f1 content")
        return None

    matches = MICRO_F1_PATTERN.findall(log)
    if not matches:
        logger.warning(f"Skipping {run}, no micro-f1 found")
        return None

    # Keep the last reported value and store it as a number, not as the raw
    # regex string, so downstream numeric formatting applies.
    try:
        micro_f1 = float(matches[-1])
    except ValueError:
        logger.warning(f"Skipping {run}, unparsable micro-f1 value {matches[-1]!r}")
        return None

    return {
        "model": model_name,
        "task": task_name,
        "seed": cfg.get("seed", "N/A"),
        "source": f"{day.name}/{run.name}",
        "micro-f1": micro_f1,
        # Only micro-F1 is extracted from the log; the other metrics stay empty.
        "macro-f1": None,
        "acc": None,
        "Pearson": None,
        "qa-f1": None,
        "train_log_path": str(log_path),
    }


# Collect all records first and build the frame once, instead of growing it
# with pd.concat inside the loop.
records = []
for day in base_path.iterdir():
    if day.name == "slurm_logs" or not day.is_dir():
        continue

    for run in day.iterdir():
        if not run.is_dir():
            continue
        record = _load_run_record(day, run)
        if record is not None:
            records.append(record)

df = pd.DataFrame(records, columns=RESULT_COLUMNS).sort_values(by=["task", "model", "seed"])

INFO:__main__:Processing: /Users/juliawunderle/Desktop/2025-06-18/16-54-55.860960
INFO:__main__:Processing: /Users/juliawunderle/Desktop/2025-06-18/16-54-55.860960/flausch_classification
INFO:__main__:Processing: /Users/juliawunderle/Desktop/2025-06-18/15-07-34.117388
INFO:__main__:Processing: /Users/juliawunderle/Desktop/2025-06-18/15-07-34.117388/sustaineval_regression
INFO:__main__:Processing: /Users/juliawunderle/Desktop/2025-06-18/15-09-08.354510
INFO:__main__:Processing: /Users/juliawunderle/Desktop/2025-06-18/15-09-08.354510/sustaineval_regression
INFO:__main__:Processing: /Users/juliawunderle/Desktop/2025-06-18/17-53-55.607290
INFO:__main__:Processing: /Users/juliawunderle/Desktop/2025-06-18/17-53-55.607290/flausch_classification
INFO:__main__:Processing: /Users/juliawunderle/Desktop/2025-06-18/16-12-25.372796
INFO:__main__:Processing: /Users/juliawunderle/Desktop/2025-06-18/16-12-25.372796/flausch_tagging
INFO:__main__:Processing: /Users/juliawunderle/Desktop/2025-06-18/16-15-

In [4]:
# Output files, resolved relative to the kernel's working directory.
JSON_PATH = Path("results.json")
TEX_PATH = Path("results.tex")
# Column of `df` that is exported as the per-task score.
METRIC_TYPE = "micro-f1"

# Start from previously saved results (if any) so that new runs are merged in.
if JSON_PATH.exists() and JSON_PATH.stat().st_size != 0:
    with open(JSON_PATH, "r", encoding="utf-8") as f:
        data = json.load(f)
else:
    data = []

# Index existing entries by model name (first occurrence wins, matching a
# front-to-back search) so each new record is merged without rescanning `data`.
entries_by_model = {}
for item in data:
    entries_by_model.setdefault(item["model"], item)

# NOTE: if `df` holds several rows for the same model and task (e.g. different
# seeds), later rows overwrite earlier ones.
for new_record in df.to_dict(orient="records"):
    metric = new_record[METRIC_TYPE]
    # Skip missing values (None or NaN) so no NaN ends up in the JSON file.
    if pd.isna(metric):
        continue

    # Keep only the last path component ("org/name" -> "name"). Names without a
    # slash, such as the "unknown_model" fallback, are used unchanged.
    model_name = str(new_record["model"]).rstrip("/").rsplit("/", 1)[-1]
    task_name = new_record["task"]
    value = {
        "metric": metric,
        "train_log_path": new_record.get("train_log_path", None),
    }

    existing = entries_by_model.get(model_name)
    if existing is not None:
        existing[task_name] = value
    else:
        entry = {"model": model_name, task_name: value}
        data.append(entry)
        entries_by_model[model_name] = entry

with open(JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)

logger.info(f"Saved results to {JSON_PATH}")

# --- LaTeX Table Output ---
latex_data = []
for entry in data:
    row = {"model": entry["model"]}
    for key, val in entry.items():
        if key == "model":
            continue
        # Entries are normally {"metric": ..., "train_log_path": ...}; only the
        # metric goes into the table. Any other value is passed through as-is.
        row[key] = val.get("metric", None) if isinstance(val, dict) else val
    latex_data.append(row)

if latex_data:
    latex_df = pd.DataFrame(latex_data).set_index("model").sort_index(axis=1).sort_index(axis=0)
    # Metrics may have been stored as strings; coerce them to numbers so that
    # `float_format` applies. Non-numeric values become NaN and render as "--".
    latex_df = latex_df.apply(pd.to_numeric, errors="coerce")

    latex_table = latex_df.to_latex(
        na_rep="--", float_format="%.3f",
        caption="Model performance across tasks", label="tab:results"
    )

    with open(TEX_PATH, "w", encoding="utf-8") as f:
        f.write(latex_table)

    logger.info(f"Saved LaTeX table to {TEX_PATH}")
else:
    # Without any entry there is no "model" column to index on.
    logger.warning(f"No results available, skipping LaTeX table {TEX_PATH}")

INFO:__main__:Saved results to results.json
INFO:__main__:Saved LaTeX table to h200_26.tex
