In [1]:
import json, os
import pandas as pd
from scipy.stats import wilcoxon

In [2]:
# Grid-search result files: the multi-task model and the two single-task models.
MULTI_JSON_PATH = "lstm-genres-and-keywords-grid_search_results.json"
GENRES_JSON_PATH = "lstm-genres-grid_search_results.json"
KEYWORDS_JSON_PATH = "lstm-keyword-grid_search_results.json"

# Hyper-parameter combinations whose fold results are compared.
SINGLE_CFG = dict(dropout_rate=0.5, lstm_units=64)
MULTI_CFG = dict(dropout_rate=0.5, lstm_units=128)

# Number of trailing fold records read from each single-task entry.
N_FOLDS = 5

In [3]:
def load_json(path_or_str: str):
    """Load JSON from a file if the argument is an existing path, else parse it as JSON text.

    Args:
        path_or_str: Path to a UTF-8 encoded JSON file, or a string holding JSON.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the argument is not an existing path and is
            not valid JSON text. The message quotes the (truncated) argument
            so a mistyped file name is not reported as a bare parse error.
    """
    if os.path.exists(path_or_str):
        with open(path_or_str, "r", encoding="utf-8") as f:
            return json.load(f)
    try:
        return json.loads(path_or_str)
    except json.JSONDecodeError as exc:
        # Keep the exception type, but say what was actually passed in.
        shown = path_or_str if len(path_or_str) <= 80 else path_or_str[:77] + "..."
        raise json.JSONDecodeError(
            f"{shown!r} nie jest ani istniejącym plikiem, ani poprawnym tekstem JSON ({exc.msg})",
            exc.doc,
            exc.pos,
        ) from exc

def cfg_equal(a: dict, b: dict) -> bool:
    """Return True when both dicts agree on ``dropout_rate`` and ``lstm_units``.

    Only these two keys are compared; a key that is missing from both dicts
    compares equal (``None == None``).
    """
    compared_keys = ("dropout_rate", "lstm_units")
    return all(a.get(key) == b.get(key) for key in compared_keys)

def pick_entry_for_cfg(grid_json: list, cfg: dict) -> dict:
    """Return the matching grid-search entry with the greatest ``timestamp``.

    Args:
        grid_json: List of entry dicts; each entry's ``params`` is compared
            with ``cfg`` via ``cfg_equal`` (a missing ``params`` is treated as ``{}``).
        cfg: Configuration to look for.

    Returns:
        The matching entry whose ``timestamp`` sorts last. A missing
        timestamp sorts as the empty string.

    Raises:
        ValueError: If no entry matches ``cfg``.
    """
    candidates = []
    for entry in grid_json:
        if cfg_equal(entry.get("params", {}), cfg):
            candidates.append(entry)
    if len(candidates) == 0:
        available = [e.get('params') for e in grid_json]
        raise ValueError(f"Nie znaleziono wpisu dla cfg={cfg}. Dostępne: {available}")
    # Stable sort: among equal timestamps the entry listed last stays last.
    candidates.sort(key=lambda entry: entry.get("timestamp", ""))
    return candidates[-1]

def extract_single_last_folds(entry: dict, n_folds: int = 5) -> pd.DataFrame:
    """Return the last ``n_folds`` fold records of a single-task entry, sorted by fold.

    In the single-task files ``fold_results`` often also contains records from
    earlier configurations, and the records carry no ``params``, so the
    trailing ``n_folds`` records are taken.

    Args:
        entry: Grid-search entry with a ``fold_results`` list of dicts, each
            having a ``fold`` key.
        n_folds: Number of trailing records to keep; must be at least 1.

    Returns:
        DataFrame of the selected records with ``fold`` cast to int, sorted by
        ``fold`` and re-indexed from 0.

    Raises:
        ValueError: If ``n_folds`` is below 1, if there are fewer than
            ``n_folds`` records, or if the trailing window repeats a fold
            number (it then cannot be a single cross-validation run).
    """
    if n_folds < 1:
        # ``fr[-0:]`` would return *all* records and a negative value would
        # drop leading ones, so reject these values explicitly.
        raise ValueError(f"n_folds musi być >= 1, otrzymano {n_folds}")
    fr = entry["fold_results"]
    if len(fr) < n_folds:
        raise ValueError(f"Za mało fold_results: {len(fr)} < {n_folds}")
    df = pd.DataFrame(fr[-n_folds:])
    df["fold"] = df["fold"].astype(int)
    if df["fold"].duplicated().any():
        raise ValueError(
            f"Powtórzone numery foldów w ostatnich {n_folds} rekordach fold_results: "
            f"{sorted(df['fold'].tolist())}"
        )
    return df.sort_values("fold").reset_index(drop=True)

def extract_multi_for_cfg(entry: dict, cfg: dict) -> pd.DataFrame:
    """Return the multi-task fold records whose per-fold ``params`` match ``cfg``.

    Args:
        entry: Grid-search entry with a ``fold_results`` list; each record's
            ``params`` is compared with ``cfg`` via ``cfg_equal``.
        cfg: Configuration to filter by.

    Returns:
        DataFrame of the matching records with ``fold`` cast to int, sorted by
        ``fold`` and re-indexed from 0.

    Raises:
        ValueError: If no record matches ``cfg``.
    """
    selected = []
    for record in entry["fold_results"]:
        if cfg_equal(record.get("params", {}), cfg):
            selected.append(record)
    if len(selected) == 0:
        raise ValueError("Nie znalazłem fold_results dla MULTI_CFG w multi JSON.")
    frame = pd.DataFrame(selected)
    frame["fold"] = frame["fold"].astype(int)
    frame = frame.sort_values("fold")
    return frame.reset_index(drop=True)

def rank_biserial(diffs):
    """Matched-pairs rank-biserial correlation of paired differences.

    Zero differences are discarded. The remaining differences are ranked by
    absolute value, and tied absolute values share the average of the ranks
    they span, so the result does not depend on the order of the input.

    Args:
        diffs: Iterable of numeric paired differences.

    Returns:
        ``(W+ - W-) / (W+ + W-)`` as a float, where ``W+`` / ``W-`` are the
        rank sums of the positive / negative differences; ``0.0`` when there
        are no non-zero differences.
    """
    nonzero = [d for d in diffs if d != 0]
    if not nonzero:
        return 0.0
    ordered = sorted(nonzero, key=abs)
    n = len(ordered)
    w_plus = 0.0
    w_minus = 0.0
    start = 0
    while start < n:
        # Extend [start, stop) over the run of equal absolute values.
        stop = start + 1
        while stop < n and abs(ordered[stop]) == abs(ordered[start]):
            stop += 1
        # The run occupies ranks start+1 .. stop; every member gets their mean.
        avg_rank = (start + 1 + stop) / 2.0
        for d in ordered[start:stop]:
            if d > 0:
                w_plus += avg_rank
            elif d < 0:
                w_minus += avg_rank
        start = stop
    return (w_plus - w_minus) / (w_plus + w_minus)

def wilcoxon_pack(multi_vals: pd.Series, single_vals: pd.Series, label: str):
    """Summarise a paired multi-vs-single comparison with Wilcoxon signed-rank tests.

    Both series are re-indexed from 0 before use, so values are paired by
    position.

    Args:
        multi_vals: Metric values of the multi-task model.
        single_vals: Metric values of the single-task baseline.
        label: Metric name stored under ``"metric"``.

    Returns:
        Dict with ``metric``, ``n``, ``mean_multi``, ``mean_single``,
        ``mean_diff``, ``rbc`` (from ``rank_biserial``), and the statistic and
        p-value reported by ``scipy.stats.wilcoxon`` for the two-sided test
        (``W_two_sided``, ``p_two_sided``) and for the one-sided test with
        alternative ``"greater"`` (``W_greater``, ``p_greater``).
    """
    multi = multi_vals.reset_index(drop=True)
    single = single_vals.reset_index(drop=True)
    delta = multi - single
    delta_list = delta.tolist()

    result = dict(
        metric=label,
        n=len(delta_list),
        mean_multi=float(multi.mean()),
        mean_single=float(single.mean()),
        mean_diff=float(delta.mean()),
        rbc=float(rank_biserial(delta_list)),
    )

    # Two-sided test first, then the one-sided test for multi > single.
    w_two, p_two = wilcoxon(multi, single, alternative="two-sided", zero_method="wilcox")
    w_gt, p_gt = wilcoxon(multi, single, alternative="greater", zero_method="wilcox")

    result["W_two_sided"] = float(w_two)
    result["p_two_sided"] = float(p_two)
    result["W_greater"] = float(w_gt)
    result["p_greater"] = float(p_gt)
    return result

In [4]:


def load_json(path_or_str: str):
    """Load JSON from a file if the argument is an existing path, else parse it as JSON text.

    NOTE(review): a helper with this name is also defined in the previous
    notebook cell; this later definition is the one in effect after Run All.

    Args:
        path_or_str: Path to a UTF-8 encoded JSON file, or a string holding JSON.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the argument is not an existing path and is
            not valid JSON text. The message quotes the (truncated) argument
            so a mistyped file name is not reported as a bare parse error.
    """
    if os.path.exists(path_or_str):
        with open(path_or_str, "r", encoding="utf-8") as f:
            return json.load(f)
    try:
        return json.loads(path_or_str)
    except json.JSONDecodeError as exc:
        # Keep the exception type, but say what was actually passed in.
        shown = path_or_str if len(path_or_str) <= 80 else path_or_str[:77] + "..."
        raise json.JSONDecodeError(
            f"{shown!r} nie jest ani istniejącym plikiem, ani poprawnym tekstem JSON ({exc.msg})",
            exc.doc,
            exc.pos,
        ) from exc

def cfg_equal(a: dict, b: dict) -> bool:
    """Return True when both dicts agree on ``dropout_rate`` and ``lstm_units``.

    Only these two keys are compared; a key that is missing from both dicts
    compares equal (``None == None``).

    NOTE(review): a helper with this name is also defined in the previous
    notebook cell; this later definition is the one in effect after Run All.
    """
    compared_keys = ("dropout_rate", "lstm_units")
    return all(a.get(key) == b.get(key) for key in compared_keys)

def pick_entry_for_cfg(grid_json: list, cfg: dict) -> dict:
    """Return the matching grid-search entry with the greatest ``timestamp``.

    NOTE(review): a helper with this name is also defined in the previous
    notebook cell; this later definition is the one in effect after Run All.

    Args:
        grid_json: List of entry dicts; each entry's ``params`` is compared
            with ``cfg`` via ``cfg_equal`` (a missing ``params`` is treated as ``{}``).
        cfg: Configuration to look for.

    Returns:
        The matching entry whose ``timestamp`` sorts last. A missing
        timestamp sorts as the empty string.

    Raises:
        ValueError: If no entry matches ``cfg``.
    """
    candidates = []
    for entry in grid_json:
        if cfg_equal(entry.get("params", {}), cfg):
            candidates.append(entry)
    if len(candidates) == 0:
        available = [e.get('params') for e in grid_json]
        raise ValueError(f"Nie znaleziono wpisu dla cfg={cfg}. Dostępne: {available}")
    # Stable sort: among equal timestamps the entry listed last stays last.
    candidates.sort(key=lambda entry: entry.get("timestamp", ""))
    return candidates[-1]

def extract_single_last_folds(entry: dict, n_folds: int = 5) -> pd.DataFrame:
    """Return the last ``n_folds`` fold records of a single-task entry, sorted by fold.

    In the single-task files ``fold_results`` often also contains records from
    earlier configurations, and the records carry no ``params``, so the
    trailing ``n_folds`` records are taken.

    NOTE(review): a helper with this name is also defined in the previous
    notebook cell; this later definition is the one in effect after Run All.

    Args:
        entry: Grid-search entry with a ``fold_results`` list of dicts, each
            having a ``fold`` key.
        n_folds: Number of trailing records to keep; must be at least 1.

    Returns:
        DataFrame of the selected records with ``fold`` cast to int, sorted by
        ``fold`` and re-indexed from 0.

    Raises:
        ValueError: If ``n_folds`` is below 1, if there are fewer than
            ``n_folds`` records, or if the trailing window repeats a fold
            number (it then cannot be a single cross-validation run).
    """
    if n_folds < 1:
        # ``fr[-0:]`` would return *all* records and a negative value would
        # drop leading ones, so reject these values explicitly.
        raise ValueError(f"n_folds musi być >= 1, otrzymano {n_folds}")
    fr = entry["fold_results"]
    if len(fr) < n_folds:
        raise ValueError(f"Za mało fold_results: {len(fr)} < {n_folds}")
    df = pd.DataFrame(fr[-n_folds:])
    df["fold"] = df["fold"].astype(int)
    if df["fold"].duplicated().any():
        raise ValueError(
            f"Powtórzone numery foldów w ostatnich {n_folds} rekordach fold_results: "
            f"{sorted(df['fold'].tolist())}"
        )
    return df.sort_values("fold").reset_index(drop=True)

def extract_multi_for_cfg(entry: dict, cfg: dict) -> pd.DataFrame:
    """Return the multi-task fold records whose per-fold ``params`` match ``cfg``.

    NOTE(review): a helper with this name is also defined in the previous
    notebook cell; this later definition is the one in effect after Run All.

    Args:
        entry: Grid-search entry with a ``fold_results`` list; each record's
            ``params`` is compared with ``cfg`` via ``cfg_equal``.
        cfg: Configuration to filter by.

    Returns:
        DataFrame of the matching records with ``fold`` cast to int, sorted by
        ``fold`` and re-indexed from 0.

    Raises:
        ValueError: If no record matches ``cfg``.
    """
    selected = []
    for record in entry["fold_results"]:
        if cfg_equal(record.get("params", {}), cfg):
            selected.append(record)
    if len(selected) == 0:
        raise ValueError("Nie znalazłem fold_results dla MULTI_CFG w multi JSON.")
    frame = pd.DataFrame(selected)
    frame["fold"] = frame["fold"].astype(int)
    frame = frame.sort_values("fold")
    return frame.reset_index(drop=True)

def rank_biserial(diffs):
    """Matched-pairs rank-biserial correlation of paired differences.

    Zero differences are discarded. The remaining differences are ranked by
    absolute value, and tied absolute values share the average of the ranks
    they span, so the result does not depend on the order of the input.

    NOTE(review): a helper with this name is also defined in the previous
    notebook cell; this later definition is the one in effect after Run All.

    Args:
        diffs: Iterable of numeric paired differences.

    Returns:
        ``(W+ - W-) / (W+ + W-)`` as a float, where ``W+`` / ``W-`` are the
        rank sums of the positive / negative differences; ``0.0`` when there
        are no non-zero differences.
    """
    nonzero = [d for d in diffs if d != 0]
    if not nonzero:
        return 0.0
    ordered = sorted(nonzero, key=abs)
    n = len(ordered)
    w_plus = 0.0
    w_minus = 0.0
    start = 0
    while start < n:
        # Extend [start, stop) over the run of equal absolute values.
        stop = start + 1
        while stop < n and abs(ordered[stop]) == abs(ordered[start]):
            stop += 1
        # The run occupies ranks start+1 .. stop; every member gets their mean.
        avg_rank = (start + 1 + stop) / 2.0
        for d in ordered[start:stop]:
            if d > 0:
                w_plus += avg_rank
            elif d < 0:
                w_minus += avg_rank
        start = stop
    return (w_plus - w_minus) / (w_plus + w_minus)

def wilcoxon_pack(multi_vals: pd.Series, single_vals: pd.Series, label: str):
    """Summarise a paired multi-vs-single comparison with Wilcoxon signed-rank tests.

    Both series are re-indexed from 0 before use, so values are paired by
    position.

    NOTE(review): a helper with this name is also defined in the previous
    notebook cell; this later definition is the one in effect after Run All.

    Args:
        multi_vals: Metric values of the multi-task model.
        single_vals: Metric values of the single-task baseline.
        label: Metric name stored under ``"metric"``.

    Returns:
        Dict with ``metric``, ``n``, ``mean_multi``, ``mean_single``,
        ``mean_diff``, ``rbc`` (from ``rank_biserial``), and the statistic and
        p-value reported by ``scipy.stats.wilcoxon`` for the two-sided test
        (``W_two_sided``, ``p_two_sided``) and for the one-sided test with
        alternative ``"greater"`` (``W_greater``, ``p_greater``).
    """
    multi = multi_vals.reset_index(drop=True)
    single = single_vals.reset_index(drop=True)
    delta = multi - single
    delta_list = delta.tolist()

    result = dict(
        metric=label,
        n=len(delta_list),
        mean_multi=float(multi.mean()),
        mean_single=float(single.mean()),
        mean_diff=float(delta.mean()),
        rbc=float(rank_biserial(delta_list)),
    )

    # Two-sided test first, then the one-sided test for multi > single.
    w_two, p_two = wilcoxon(multi, single, alternative="two-sided", zero_method="wilcox")
    w_gt, p_gt = wilcoxon(multi, single, alternative="greater", zero_method="wilcox")

    result["W_two_sided"] = float(w_two)
    result["p_two_sided"] = float(p_two)
    result["W_greater"] = float(w_gt)
    result["p_greater"] = float(p_gt)
    return result



In [5]:

# 1) Read the three grid-search result files.
multi_json = load_json(path_or_str=MULTI_JSON_PATH)
genres_json = load_json(path_or_str=GENRES_JSON_PATH)
keywords_json = load_json(path_or_str=KEYWORDS_JSON_PATH)

# 2) In each file, select the entry recorded for the configuration under test.
multi_entry = pick_entry_for_cfg(grid_json=multi_json, cfg=MULTI_CFG)
genres_entry = pick_entry_for_cfg(grid_json=genres_json, cfg=SINGLE_CFG)
keywords_entry = pick_entry_for_cfg(grid_json=keywords_json, cfg=SINGLE_CFG)

# 3) Per-fold tables: multi-task records are filtered by their own params,
#    single-task records are taken as the trailing N_FOLDS of the entry.
df_multi = extract_multi_for_cfg(entry=multi_entry, cfg=MULTI_CFG)
df_genres = extract_single_last_folds(entry=genres_entry, n_folds=N_FOLDS)
df_keywords = extract_single_last_folds(entry=keywords_entry, n_folds=N_FOLDS)


In [6]:
# Pair the two single-task runs by fold number instead of by row position:
# positional pairing silently mixes folds whenever the two tables do not list
# exactly the same folds in the same order.
single_metrics = ["f1_macro", "jaccard_samples", "f1"]
genres_folds = sorted(df_genres["fold"].tolist())
keywords_folds = sorted(df_keywords["fold"].tolist())
if genres_folds != keywords_folds:
    raise ValueError(
        f"Foldy genres i keywords nie pokrywają się: {genres_folds} vs {keywords_folds}"
    )

# validate="one_to_one" makes pandas raise if a fold number is repeated on
# either side, which would otherwise multiply rows in the merge.
df_single_paired = df_genres[["fold"] + single_metrics].merge(
    df_keywords[["fold"] + single_metrics],
    on="fold",
    how="inner",
    suffixes=("_genres", "_keywords"),
    validate="one_to_one",
)

# Single-task baseline: mean of the genres and keywords score for each fold.
df_single = pd.DataFrame({
    "fold": df_single_paired["fold"],
    "single_f1_macro_combined": (df_single_paired["f1_macro_genres"] + df_single_paired["f1_macro_keywords"]) / 2.0,
    "single_jaccard_combined": (df_single_paired["jaccard_samples_genres"] + df_single_paired["jaccard_samples_keywords"]) / 2.0,
    "single_f1_combined": (df_single_paired["f1_genres"] + df_single_paired["f1_keywords"]) / 2.0,
}).sort_values("fold").reset_index(drop=True)

# Put the multi-task metrics next to the baseline, one row per fold.
df_cmp = df_single.merge(
    df_multi[["fold", "f1_macro_combined", "jaccard_combined", "f1_combined"]],
    on="fold",
    how="inner",
    validate="one_to_one",
).sort_values("fold").reset_index(drop=True)

In [7]:

# Per-fold differences: multi-task minus single-task, one column per metric.
df_cmp["diff_f1_macro_combined"] = df_cmp["f1_macro_combined"].sub(df_cmp["single_f1_macro_combined"])
df_cmp["diff_jaccard_combined"] = df_cmp["jaccard_combined"].sub(df_cmp["single_jaccard_combined"])
df_cmp["diff_f1_combined"] = df_cmp["f1_combined"].sub(df_cmp["single_f1_combined"])

print("CFG (single):", SINGLE_CFG)
print("CFG (multi) :", MULTI_CFG)
print("\nFold-level comparison:")
print(df_cmp)

# NOTE(review): N_FOLDS = 5 gives at most five pairs; an exact two-sided
# signed-rank p-value then cannot fall below 2 / 2**5 = 0.0625, so keep the
# small sample in mind when reading the p-values.
reports = [
    wilcoxon_pack(df_cmp[multi_col], df_cmp[single_col], metric_label)
    for multi_col, single_col, metric_label in (
        ("f1_macro_combined", "single_f1_macro_combined", "F1 macro (combined)"),
        ("jaccard_combined", "single_jaccard_combined", "Jaccard (combined)"),
        ("f1_combined", "single_f1_combined", "F1 samples (combined)"),
    )
]
df_report = pd.DataFrame(reports)

print("\nWilcoxon signed-rank results:")
print(df_report)

CFG (single): {'dropout_rate': 0.5, 'lstm_units': 64}
CFG (multi) : {'dropout_rate': 0.5, 'lstm_units': 128}

Fold-level comparison:
   fold  single_f1_macro_combined  single_jaccard_combined  \
0     1                  0.708001                 0.621763   
1     2                  0.709834                 0.616487   
2     3                  0.710012                 0.618957   
3     4                  0.689864                 0.604017   
4     5                  0.705019                 0.620998   

   single_f1_combined  f1_macro_combined  jaccard_combined  f1_combined  \
0            0.731575           0.712210          0.619884     0.730235   
1            0.726769           0.696739          0.603559     0.718755   
2            0.729791           0.678652          0.607120     0.721678   
3            0.722798           0.708980          0.619837     0.730221   
4            0.732416           0.702377          0.610403     0.723088   

   diff_f1_macro_combined  diff_jaccard_com