In [39]:
import pandas as pd
from pathlib import Path
import re
import yaml
from loguru import logger
import json

In [None]:
# Debugging aid: print the current working directory via the shell.
!pwd

In [None]:
# Column layout of the aggregated results table; the last five are metric columns.
RESULT_COLUMNS = ["model", "task", "seed", "source", "micro-f1", "macro-f1", "acc", "Pearson", "qa-f1"]
METRIC_COLUMNS = RESULT_COLUMNS[4:]

# Root of the run outputs; the loop below walks it as <day>/<run>/...
OUTPUTS_DIR = Path("/Users/juliawunderle/translator/src/outputs/")

# Per-task log layout:
#   (log file name,
#    marker whose presence shows the run finished,
#    {metric column: regex whose group 1 holds the value},
#    reason logged when one of the values cannot be found)
TASK_LOG_FORMATS = {
    "similarity_pawsx": (
        "logfile.log",
        "Cosine-Similarity Pearson",
        {"Pearson": r"Cosine-Similarity Pearson: (\d+\.\d+)"},
        "it has no Pearson value",
    ),
    "mlqa": (
        "train.log",
        "eval_f1_score",
        {"qa-f1": r"eval_f1_score':\s*([\d.]+)"},
        "it has no QA-F1 value",
    ),
}

# Flair format: used for every task that has no dedicated entry above.
# NOTE(review): the metric mapping cell assigns "qa-f1" to "germanquad", but only
# "mlqa" is parsed in the QA layout here, so germanquad runs fall through to the
# Flair parser and never yield a qa-f1 value - confirm which layout they use.
FLAIR_LOG_FORMAT = (
    "training.log",
    "Results:",
    {
        "micro-f1": r"F-score \(micro\) (\d+\.\d+)",
        "macro-f1": r"F-score \(macro\) (\d+\.\d+)",
        "acc": r"Accuracy (\d+\.\d+)",
    },
    "it has incomplete results",
)


def _load_run_config(run):
    """Return the parsed ``hydra_conf/config.yaml`` of ``run``, or None if the file is absent."""
    config_path = run / "hydra_conf/config.yaml"
    if not config_path.exists():
        return None
    with open(config_path) as f:
        # NOTE(review): FullLoader is kept to match the existing behaviour;
        # yaml.safe_load is the safer choice if these configs hold only plain data.
        return yaml.load(f, Loader=yaml.FullLoader)


def _parse_run_metrics(run, task_name):
    """Extract the metric values of one run from its training log.

    Returns a dict keyed by METRIC_COLUMNS (metrics the task does not report
    stay None, found values are kept as the matched strings), or None after
    logging a warning when the log is missing, unfinished, or lacks a value.
    """
    log_name, finished_marker, patterns, missing_reason = TASK_LOG_FORMATS.get(task_name, FLAIR_LOG_FORMAT)

    training_log_path = run / task_name / "training_logs" / log_name
    if not training_log_path.exists():
        logger.warning(f"Skipping {run} as it has no training log")
        return None

    with open(training_log_path, "r") as log_file:
        log_content = log_file.read()

    if finished_marker not in log_content:
        logger.warning(f"Skipping {run} as it did not finish")
        return None

    results = dict.fromkeys(METRIC_COLUMNS)
    for column, pattern in patterns.items():
        match = re.search(pattern, log_content)
        if match is None:
            logger.warning(f"Skipping {run} as {missing_reason}")
            return None
        results[column] = match.group(1)
    return results


# Collect one record per finished run, then build the DataFrame once
# (growing it with pd.concat inside the loop is quadratic).
records = []

# Sorted traversal keeps log output and tie order reproducible between executions.
for day in sorted(OUTPUTS_DIR.iterdir()):
    # Skip stray files (e.g. .DS_Store) and the slurm log folder before descending.
    if not day.is_dir() or day.name == "slurm_logs":
        continue

    for run in sorted(day.iterdir()):
        if not run.is_dir():
            continue
        logger.info(f"Processing: {run}")

        # Resolve config and task name afresh for every run so a run without a
        # config can never inherit the values of the previously processed run.
        cfg = _load_run_config(run)
        if cfg is None:
            logger.warning(f"Skipping {run} as it has no hydra config")
            continue

        task_cfg = cfg.get("task") if isinstance(cfg, dict) else None
        task_name = task_cfg.get("task_name") if isinstance(task_cfg, dict) else None
        if task_name is None:
            logger.warning(f"Skipping {run} as it has no task name")
            continue

        logger.info(f"Task name: {task_name}, Model name: {cfg['model']['model_name']}")

        results = _parse_run_metrics(run, task_name)
        if results is None:
            continue

        records.append(
            {
                "model": cfg["model"]["model_name"],
                "task": task_name,
                "seed": cfg["seed"],
                "source": f"{day.name}/{run.name}",
                **results,
            }
        )

df = pd.DataFrame(records, columns=RESULT_COLUMNS).sort_values(by=["task", "model", "seed"])
df

In [42]:
# Metric mapping: task name -> results column that holds the task's reported metric.
# Declared grouped by metric and then flattened into a task-keyed dict; sorting
# the pairs keeps the keys in alphabetical order.
metrics = dict(
    sorted(
        (task, metric)
        for metric, tasks in {
            "macro-f1": (
                "argument_mining",
                "engaging_comments",
                "factclaiming_comments",
                "offensive_lang",
                "toxic_comments",
            ),
            "micro-f1": (
                "db_aspect",
                "germeval_opinion",
                "hotel_aspect",
                "massive_intents",
                "massive_seq",
                "ner_biofid",
                "ner_europarl",
                "ner_legal",
                "ner_news",
                "ner_wiki_news",
                "polarity",
                "topic_relevance",
                "up_dep",
                "up_pos",
                "verbal_idioms",
                "webcage",
            ),
            "qa-f1": ("germanquad", "mlqa"),
            "acc": ("nli", "pawsx", "query_ad", "quest_ans"),
            "Pearson": ("similarity_pawsx",),
        }.items()
        for task in tasks
    )
)

In [43]:
# Merge the parsed results into results_new.json, creating the list from scratch
# when the file is missing or empty. Each JSON entry is one dict per model:
# {"model": <name>, <task>: <metric value>, ...}.
new_records = df.to_dict(orient="records")

json_path = Path("/Users/juliawunderle/translator/src/evaluation/results_new.json")
if json_path.exists() and json_path.stat().st_size != 0:
    with open(json_path, "r") as f:
        data = json.load(f)
else:
    data = []

# Index the existing entries by model once instead of scanning `data` for every
# record; setdefault keeps the first entry when a model appears more than once.
entries_by_model = {}
for item in data:
    if "model" in item:
        entries_by_model.setdefault(item["model"], item)

# NOTE(review): rows sharing model and task overwrite one another, so only the
# last such row in `df` order is kept - confirm this is intended rather than
# aggregating over seeds.
for new_record in new_records:
    # "org/name" -> "name"; an id without "/" is used as-is instead of raising IndexError.
    model_parts = new_record["model"].split("/")
    model_name = model_parts[1] if len(model_parts) > 1 else model_parts[0]
    if "gottbert" in model_name:
        model_name = "gottbert"
    task_name = new_record["task"]

    # Tasks without a configured metric are not exported.
    metric_type = metrics.get(task_name)
    if not metric_type:
        continue

    # pd.isna treats None and NaN alike, so a missing metric can never reach the
    # JSON file as an invalid NaN literal.
    value = new_record[metric_type]
    if pd.isna(value):
        continue

    entry = entries_by_model.get(model_name)
    if entry is None:
        entry = {"model": model_name}
        data.append(entry)
        entries_by_model[model_name] = entry
    entry[task_name] = value

# Serialize before opening the file for writing: if serialization fails, the
# previous contents of the file are left intact instead of being truncated.
serialized = json.dumps(data, indent=4)
with open(json_path, "w") as f:
    f.write(serialized)