In [11]:
import os
import pandas as pd

# --- Global configuration ---
# Maps each source label to the directory that is scanned for its results.
# The labels also define the column order of the final pivot table.
BASE_PATHS = {
    "fam-sim": "models/fam-sim/2000_steps/",
    "rnadist-100": "models/rnadist/dist_100/2000_steps/",
    "rnadist-200": "models/rnadist/dist_200/2000_steps/",
    "rnadist-400": "models/rnadist/dist_400/2000_steps/",
}


# Directory and file name used when the pivot table is written to CSV.
SAVE_PATH = "results/tables/2000s/"
SAVE_FILE = "rnadist_2000s_f1_post_test.csv"

# TODO: verify the name of test.csv (5/10e)
# TODO: check whether epochs need to be added in (2)

In [12]:
# --- 1. Data loading utilities ---
def load_csv_if_exists(csv_path: str, warn_msg: str) -> pd.DataFrame | None:
    """Carga un CSV si existe; muestra un warning si no."""
    if os.path.isfile(csv_path):
        return pd.read_csv(csv_path)
    else:
        print(f"[WARN] {warn_msg}")
        return None


def collect_results_from_path(
    path: str, source: str
) -> tuple[list[pd.DataFrame], list[pd.DataFrame]]:
    """Load the training log and test results of every family directory in ``path``.

    Each sub-directory of ``path`` is treated as one family. For every family
    the files ``train_log.csv`` and ``test_2000steps.csv`` are loaded when
    present, and each loaded frame is tagged with two extra columns:
    ``fam`` (the sub-directory name) and ``source`` (the given label).

    Args:
        path: Directory whose sub-directories are scanned.
        source: Label written into the ``source`` column of every frame.

    Returns:
        ``(train_logs, test_results)``: two lists of DataFrames. Both are
        empty when ``path`` is not an existing directory (a warning is
        printed in that case).
    """
    train_logs, test_results = [], []

    # isdir (rather than exists) also rejects a path that points at a file,
    # which would otherwise make os.listdir raise.
    if not os.path.isdir(path):
        print(f"[WARN] Ruta no encontrada: {path}")
        return train_logs, test_results

    # Sorted so the row order of the collected frames does not depend on the
    # filesystem's listing order.
    for fam in sorted(os.listdir(path)):
        fam_path = os.path.join(path, fam)

        # Stray files next to the family directories are not families;
        # skipping them avoids two misleading "not found" warnings per file.
        if not os.path.isdir(fam_path):
            continue

        train_csv = os.path.join(fam_path, "train_log.csv")
        test_csv = os.path.join(fam_path, "test_2000steps.csv")

        df_train = load_csv_if_exists(
            train_csv, f"No se encontró train_log.csv en {fam_path}"
        )
        if df_train is not None:
            df_train["fam"] = fam
            df_train["source"] = source
            train_logs.append(df_train)

        df_test = load_csv_if_exists(
            test_csv, f"No se encontró test_2000steps.csv en {fam_path}"
        )
        if df_test is not None:
            df_test["fam"] = fam
            df_test["source"] = source
            test_results.append(df_test)

    return train_logs, test_results


def _concat_or_empty(frames: list[pd.DataFrame], label: str) -> pd.DataFrame:
    """Concatenate ``frames``; warn and return an empty frame when there are none."""
    if not frames:
        # pd.concat raises ValueError on an empty list, so handle it explicitly.
        print(f"[WARN] No se cargó ningún archivo de {label}")
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def load_all_results(base_paths: dict) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load and concatenate the training logs and test results of every source.

    Args:
        base_paths: Mapping of source label -> directory to scan. Each pair
            is forwarded to ``collect_results_from_path``.

    Returns:
        ``(train_df, test_df)``: one frame holding every training log found
        and one holding every test result found, each with a fresh integer
        index. A frame is empty (no rows, no columns) when nothing of that
        kind was loaded; a warning is printed in that case.
    """
    all_train_logs, all_test_results = [], []

    for source, path in base_paths.items():
        train_logs, test_results = collect_results_from_path(path, source)
        all_train_logs.extend(train_logs)
        all_test_results.extend(test_results)

    train_df = _concat_or_empty(all_train_logs, "entrenamiento")
    test_df = _concat_or_empty(all_test_results, "test")
    return train_df, test_df


# --- 2. Preprocessing ---
def adjust_epochs(df: pd.DataFrame, epoch_offset: int = 10) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``epoch`` column is shifted by ``epoch_offset``.

    Args:
        df: Frame containing an ``epoch`` column. It is not modified.
        epoch_offset: Value added to every entry of ``epoch``.

    Returns:
        A new DataFrame with the same columns and the shifted ``epoch`` values.
    """
    # assign() works on a copy, so the caller's frame is left untouched.
    return df.assign(epoch=df["epoch"] + epoch_offset)


# --- 3. Aggregation and summary ---
def summarize_metric(df: pd.DataFrame, metric: str) -> pd.DataFrame:
    """Average ``metric`` for every (``fam``, ``source``) pair.

    Args:
        df: Frame with ``fam`` and ``source`` columns plus the ``metric`` column.
        metric: Name of the column to average.

    Returns:
        A frame with the columns ``fam``, ``source`` and ``metric``, one row
        per group, holding the group mean.
    """
    per_group = df.groupby(["fam", "source"])
    mean_values = per_group[metric].mean()
    # Turn the (fam, source) index back into ordinary columns.
    return mean_values.reset_index()


def build_pivot_table(
    summary_df: pd.DataFrame, column_order: list[str]
) -> pd.DataFrame:
    """Reshape a summary frame into a ``fam`` x ``source`` table.

    Args:
        summary_df: Frame with ``fam`` and ``source`` columns; its last
            column supplies the cell values.
        column_order: Source labels to keep, in the order they should appear.

    Returns:
        A frame indexed by ``fam`` with one column per entry of
        ``column_order``.
    """
    # The value column is taken by position: it is whatever comes last.
    value_column = summary_df.columns[-1]
    wide = summary_df.pivot(index="fam", columns="source", values=value_column)
    return wide[column_order]


# --- 4. Main pipeline ---
def main(metric: str, column_order: list[str]):
    """Run the pipeline: load results, average ``metric``, print the pivot table.

    Args:
        metric: Column of the test results to average per family and source.
        column_order: Source labels, in the order the table columns should
            appear.

    Returns:
        The pivot table (families as rows, sources as columns), which is
        also printed.
    """
    # Load every training log and test result reachable from BASE_PATHS.
    train_frame, test_frame = load_all_results(BASE_PATHS)

    # Preprocessing step with a zero offset, so epoch values stay as loaded.
    # The adjusted training frame is not used further down in this function.
    train_frame = adjust_epochs(train_frame, 0)

    # Aggregate the test results and reshape them into the final table.
    table = build_pivot_table(summarize_metric(test_frame, metric), column_order)
    print(table)
    return table


# --- Execution ---
if __name__ == "__main__":
    # Metric column of the test results that the table reports.
    metric_to_use = "f1_post"
    # Columns follow the declaration order of the sources in BASE_PATHS.
    desired_column_order = [*BASE_PATHS]
    pivot_table = main(metric=metric_to_use, column_order=desired_column_order)

source      fam-sim  rnadist-100  rnadist-200  rnadist-400
fam                                                       
16s           0.322        0.340        0.360        0.381
23s           0.423        0.432        0.425        0.449
5s            0.417        0.355        0.487        0.423
RNaseP        0.345        0.419        0.370        0.454
grp1          0.365        0.303        0.276        0.388
srp           0.236        0.209        0.286        0.250
tRNA          0.434        0.589        0.669        0.646
telomerase    0.211        0.177        0.115        0.218
tmRNA         0.320        0.356        0.333        0.359


In [13]:
# Create the output directory first: to_csv does not create missing parent
# directories, so a fresh checkout would otherwise fail here.
os.makedirs(SAVE_PATH, exist_ok=True)
pivot_table.to_csv(os.path.join(SAVE_PATH, SAVE_FILE))