In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def analyze_results(solver, J, K, N, p, data_dir="../data", verbose=True):
    """
    Summarise the simulation results stored in one per-configuration CSV.

    The file ``{data_dir}/solver{solver}_J{J}_K{K}_p{p}.csv`` is read with
    every column as text, malformed lines are skipped, and the columns are
    then coerced to numbers. Rows whose ``runtime``, ``branch`` or
    ``identifiable`` value cannot be interpreted are dropped before any
    statistic is computed.

    Parameters
    ----------
    solver, J, K, p :
        Values interpolated into the CSV file name.
    N :
        Accepted so existing call sites keep working. It is not part of the
        file name and rows are not filtered by it.
        NOTE(review): confirm whether results should be restricted to this N.
    data_dir : str, default "../data"
        Directory that contains the CSV files.
    verbose : bool, default True
        When True, print the summary table. Pass False to silence it, e.g.
        when calling this function in a loop over many configurations.

    Returns
    -------
    tuple
        ``(avg_runtime, avg_runtime_SAT, avg_runtime_branch5,
        avg_runtime_branch6, prop_identifiable)`` where "SAT" means
        ``branch`` in {5, 6}. A subset with no rows yields ``nan``.

    Raises
    ------
    ValueError
        If any required column is absent from the CSV.

    Notes
    -----
    The branch-distribution plot is currently disabled (commented out at the
    end of the function body).
    """
    filename = f"{data_dir}/solver{solver}_J{J}_K{K}_p{p}.csv"

    # Read everything as text first so that one bad cell cannot change the
    # dtype of a whole column; coercion happens explicitly below.
    df = pd.read_csv(
        filename,
        on_bad_lines="skip",   # skip rows with the wrong number of fields
        engine="python",       # needed for on_bad_lines with some files
        dtype=str,             # read all as str to sanitize ourselves
        skip_blank_lines=True
    )

    # Fail early, with a clear message, if the schema is not what we expect.
    required = ["J", "K", "N", "p", "seed", "sim", "runtime", "identifiable", "branch"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    def parse_identifiable(x):
        """Map a raw cell to 1, 0, or NaN when it is not a recognisable flag."""
        if x is None:
            return np.nan
        if isinstance(x, bool):
            return int(x)
        s = str(x).strip().lower()
        if s in {"1", "true", "t", "yes", "y"}:
            return 1
        if s in {"0", "false", "f", "no", "n"}:
            return 0
        # Numeric spellings such as "1.0" / "0.0"; anything else is invalid.
        try:
            v = float(s)
        except ValueError:
            # Not a number at all -> unusable value.
            return np.nan
        if np.isnan(v):
            return np.nan
        # Accept only (near) exact 0/1; other numbers are treated as bad.
        if abs(v - 1.0) < 1e-12:
            return 1
        if abs(v - 0.0) < 1e-12:
            return 0
        return np.nan

    # Coerce numeric columns; unparsable cells become NaN.
    num_cols = ["J", "K", "N", "p", "seed", "sim", "runtime", "branch"]
    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    df["identifiable"] = df["identifiable"].apply(parse_identifiable)

    # Drop rows that are unusable for the statistics below.
    df = df.dropna(subset=["runtime", "branch", "identifiable"]).copy()

    # Cast to int where appropriate, but only for columns without NaNs
    # (astype(int) would raise on missing values).
    for c in ["J", "K", "N", "seed", "sim", "branch", "identifiable"]:
        if c in df.columns and not df[c].isna().any():
            df[c] = df[c].astype(int)

    # 1. Average runtime over all retained simulations
    avg_runtime = df["runtime"].mean()

    # 2. Average runtime for branch==5, branch==6 and SAT (= {5, 6})
    branch5 = df[df["branch"] == 5]
    avg_runtime_branch5 = branch5["runtime"].mean() if not branch5.empty else float("nan")

    branch6 = df[df["branch"] == 6]
    avg_runtime_branch6 = branch6["runtime"].mean() if not branch6.empty else float("nan")

    SAT = df[df["branch"].isin([5, 6])]
    avg_runtime_SAT = SAT["runtime"].mean() if not SAT.empty else float("nan")

    # 3. Proportion identifiable (column holds only 0/1 at this point)
    prop_identifiable = df["identifiable"].mean()

    # Print summary (optional so that batch callers can stay quiet)
    if verbose:
        summary = pd.DataFrame({
            "Metric": [
                "Average runtime (all sims)",
                "avg_runtime_SAT",
                "avg_runtime_branch5",
                "avg_runtime_branch6",
                "Proportion identifiable",
            ],
            "Value": [
                avg_runtime,
                avg_runtime_SAT,
                avg_runtime_branch5,
                avg_runtime_branch6,
                prop_identifiable,
            ],
        })
        print("\nSummary Statistics:")
        print(summary.to_string(index=False))

    # # 4. Plot distribution of branch (uncomment to show)
    # counts = df["branch"].value_counts().sort_index()
    # plt.figure(figsize=(8, 4))
    # counts.plot(kind="bar")
    # plt.xlabel("Branch value")
    # plt.ylabel("Count")
    # plt.title(f"Distribution of branch for solver={solver}, J={J}, K={K}, p={p}")
    # plt.tight_layout()
    # plt.show()

    return avg_runtime, avg_runtime_SAT, avg_runtime_branch5, avg_runtime_branch6, prop_identifiable


# 

In [5]:
# Single-configuration check: solver 1, J=50, K=5, p=0.5.
analyze_results(solver=1, J=50, K=5, N=10, p=0.5)


Summary Statistics:
                    Metric    Value
Average runtime (all sims) 0.002991
           avg_runtime_SAT 0.004050
       avg_runtime_branch5      NaN
       avg_runtime_branch6 0.004050
   Proportion identifiable 1.000000


(0.002990950537844526, 0.004049911238897741, nan, 0.004049911238897741, 1.0)

In [6]:
def multiple_results(solver, J, K, p_list, N=None):
    """
    Collect the headline statistics of analyze_results() for several p values.

    Parameters
    ----------
    solver, J, K :
        Forwarded unchanged to analyze_results() for every p.
    p_list : iterable
        The p values to evaluate, in the order the rows should appear.
    N : optional
        Forwarded to analyze_results(). It used to be read from a notebook
        global of the same name, which raised NameError whenever that global
        had not been assigned yet; it is now an explicit argument.

    Returns
    -------
    pd.DataFrame
        Columns: p, avg_runtime_all, prop_identifiable. One row per p.
    """
    results = {
        'p': [],
        'avg_runtime_all': [],
        'prop_identifiable': []
    }

    for p in p_list:
        # Only the overall mean runtime and the identifiable share are kept;
        # the three branch-specific runtimes are discarded.
        avg_all, _, _, _, prop_id = analyze_results(solver, J, K, N, p)
        results['p'].append(p)
        results['avg_runtime_all'].append(avg_all)
        results['prop_identifiable'].append(prop_id)

    return pd.DataFrame(results)

In [8]:
import itertools
import pandas as pd

def multiple_results_grid(solver, J_list, K_list, p_list, N):
    """
    Run analyze_results() once per (J, K, p) combination and tabulate the output.

    Parameters
    ----------
    solver   : any
        Solver identifier, forwarded unchanged to analyze_results().
    J_list   : iterable
        J values to evaluate (outermost loop of the grid).
    K_list   : iterable
        K values to evaluate.
    p_list   : iterable
        p values to evaluate (innermost loop of the grid).
    N        : any
        Forwarded unchanged to analyze_results() for every combination.

    Returns
    -------
    pd.DataFrame
        Columns: J, K, p, avg_runtime_all, prop_identifiable.
        One row per (J, K, p) triple, in itertools.product order.
    """
    def _summarise(J, K, p):
        # Keep the overall mean runtime and the identifiable share; the
        # three branch-specific runtimes are not part of the grid table.
        runtime_mean, _sat, _b5, _b6, identifiable_share = analyze_results(solver, J, K, N, p)
        return {
            "J": J,
            "K": K,
            "p": p,
            "avg_runtime_all": runtime_mean,
            "prop_identifiable": identifiable_share,
        }

    grid = itertools.product(J_list, K_list, p_list)
    records = [_summarise(J, K, p) for J, K, p in grid]
    return pd.DataFrame(records)

In [9]:
# Grid of configurations to summarise; each (solver, J, K, p) maps to one CSV.
solver, N = 1, 10
J_list = [25, 50, 100]
K_list = [5, 10]
p_list = [0.1, 0.3, 0.5, 0.7, 0.9]
multiple_results_grid(solver=solver, J_list=J_list, K_list=K_list, p_list=p_list, N=N)


Summary Statistics:
                    Metric    Value
Average runtime (all sims) 0.000715
           avg_runtime_SAT 0.003314
       avg_runtime_branch5      NaN
       avg_runtime_branch6 0.003314
   Proportion identifiable 0.443102

Summary Statistics:
                    Metric    Value
Average runtime (all sims) 0.002088
           avg_runtime_SAT 0.003392
       avg_runtime_branch5      NaN
       avg_runtime_branch6 0.003392
   Proportion identifiable 0.952133

Summary Statistics:
                    Metric    Value
Average runtime (all sims) 0.003552
           avg_runtime_SAT 0.003707
       avg_runtime_branch5      NaN
       avg_runtime_branch6 0.003707
   Proportion identifiable 0.982456

Summary Statistics:
                    Metric    Value
Average runtime (all sims) 0.003071
           avg_runtime_SAT 0.003329
       avg_runtime_branch5 0.003146
       avg_runtime_branch6 0.003352
   Proportion identifiable 0.803279

Summary Statistics:
                    Metric   Va

Unnamed: 0,J,K,p,avg_runtime_all,prop_identifiable
0,25,5,0.1,0.000715,0.443102
1,25,5,0.3,0.002088,0.952133
2,25,5,0.5,0.003552,0.982456
3,25,5,0.7,0.003071,0.803279
4,25,5,0.9,0.00057,0.0
5,25,10,0.1,0.010802,0.075681
6,25,10,0.3,0.125884,0.822289
7,25,10,0.5,0.143212,0.932866
8,25,10,0.7,0.435777,0.051153
9,25,10,0.9,0.000851,0.0


In [None]:
# N is a required argument of multiple_results_grid; omitting it raises TypeError.
multiple_results_grid(solver, J_list, K_list, p_list, N)