In [1]:
import re
import numpy as np
import pandas as pd

# Raise pandas' display limits (maximum columns shown and console width)
# for the wide tables this script prints.
pd.set_option("display.max_columns", 250)
pd.set_option("display.width", 220)

# Path of the CSV loaded into `df` by this script.
# NOTE(review): absolute, machine-specific path — adjust before running on
# another machine.
CSV = "/Users/abubakarialidu/Desktop/Data Result/all_sessions_cleaned.csv"

# Method labels retained for the analysis. The list is used both as a filter
# (rows whose classified method is not listed are dropped) and as the row
# order when per-method counts are reindexed.
METHOD_ORDER = [
    "Direct (Non-Reasoning)",
    "Prompt2DAG (Template)",
    "Prompt2DAG (LLM)",
    "Prompt2DAG (Hybrid)",
    "Direct (Reasoning)",
]

# Orchestrator names in lower case.
# NOTE(review): not referenced anywhere in the visible part of this file.
ORCH_ORDER = ["airflow", "dagster", "prefect"]

def classify_method(row):
    """Map a results row to its canonical method label.

    Args:
        row: Mapping-like object exposing ``.get`` (a pandas Series or a
            dict); its "Workflow" and "Strategy" entries are read.

    Returns:
        "Direct (Non-Reasoning)" for the "Direct" workflow,
        "Direct (Reasoning)" for the "Reasoning" workflow, and a
        "Prompt2DAG (...)" label for the "Prompt2DAG" workflow. The
        Prompt2DAG variant is picked by the first of "template", "llm",
        "hybrid" contained in the lower-cased strategy; if none matches, the
        raw strategy value (or "Unknown" when the key is absent) is embedded
        in the label. Any other workflow value is returned unchanged.
    """
    workflow = row.get("Workflow", "")
    strategy_key = str(row.get("Strategy") or "").lower()

    if workflow == "Direct":
        return "Direct (Non-Reasoning)"
    if workflow == "Reasoning":
        return "Direct (Reasoning)"
    if workflow != "Prompt2DAG":
        return workflow

    # Order matters: the first keyword found in the strategy wins.
    keyword_labels = (
        ("template", "Prompt2DAG (Template)"),
        ("llm", "Prompt2DAG (LLM)"),
        ("hybrid", "Prompt2DAG (Hybrid)"),
    )
    for keyword, label in keyword_labels:
        if keyword in strategy_key:
            return label
    return f"Prompt2DAG ({row.get('Strategy','Unknown')})"

def normalize_llm_name(x):
    """Return a normalised LLM identifier.

    Missing values (as detected by ``pd.isna``) become "unknown". Anything
    else is converted to ``str``, stripped of surrounding whitespace, and has
    every ":" replaced by "-" (so a "deepinfra:" prefix becomes "deepinfra-").
    """
    return "unknown" if pd.isna(x) else str(x).strip().replace(":", "-")

def infer_repeat_id(row):
    """Best-effort guess of a repetition index from the row's "Run_Name".

    The lower-cased run name is searched for "rep", "repeat", "run" or
    "trial", optionally followed by one of "-", "_" or a space, and then a
    run of digits ending at a word boundary.

    Args:
        row: Mapping-like object exposing ``.get`` (a pandas Series or a
            dict).

    Returns:
        The matched digits as an ``int``, or ``np.nan`` when the run name
        carries no such marker. Prefer an explicit repeat/seed column if the
        data has one.
    """
    run_name = str(row.get("Run_Name", "") or "").lower()
    match = re.search(r"(?:rep|repeat|run|trial)[\-_ ]?(\d+)\b", run_name)
    return int(match.group(1)) if match else np.nan

# Load the session table from disk.
df = pd.read_csv(CSV)

# Label every row with its method, then keep only the methods under study.
df["Method"] = df.apply(classify_method, axis=1)
df = df.loc[df["Method"].isin(METHOD_ORDER)].copy()

# Canonical string keys for grouping.
df["Pipeline_ID"] = df["Pipeline_ID"].astype(str)
df["Orchestrator"] = df["Orchestrator"].astype(str).str.lower().str.strip()

# Make sure both LLM columns exist; an absent column is filled with "unknown".
for llm_col in ("Std_LLM", "Reasoning_LLM"):
    df[llm_col] = df.get(llm_col, "unknown")

# The LLM that drove the workflow: Std_LLM for the non-reasoning and
# Prompt2DAG methods, Reasoning_LLM for Direct (Reasoning).
df["LLM_for_Workflow"] = np.where(
    df["Method"] != "Direct (Reasoning)",
    df["Std_LLM"].map(normalize_llm_name),
    df["Reasoning_LLM"].map(normalize_llm_name),
)

# Optional: repetition index guessed from the run name (NaN when absent).
df["Repeat_ID_guess"] = df.apply(infer_repeat_id, axis=1)

# Report header.
print("=" * 120)
print("RUN COUNT RECONCILIATION")
print("=" * 120)

# 1) Basic totals of the filtered table.
print("\n[1] Total rows:", len(df))
print("Pipelines:", df["Pipeline_ID"].nunique())
print("Orchestrators:", df["Orchestrator"].nunique(), sorted(df["Orchestrator"].unique()))
print("Methods:", df["Method"].nunique(), METHOD_ORDER)

# 2) Counts by method, in METHOD_ORDER (should match the paper tables).
#    fill_value=0 keeps the counts integral and shows an explicit 0 for a
#    method with no rows, instead of NaN with every count rendered as a float.
method_counts = df["Method"].value_counts().reindex(METHOD_ORDER, fill_value=0)
print("\n[2] Rows per Method")
print(method_counts.to_string())

# 3) Size of the fully populated factorial grid under the stated design:
#    - standard LLMs are those observed on any method other than Direct (Reasoning)
#    - reasoning LLMs are those observed on Direct (Reasoning)
is_reasoning_row = df["Method"] == "Direct (Reasoning)"
std_llms = sorted(df.loc[~is_reasoning_row, "LLM_for_Workflow"].unique())
rsn_llms = sorted(df.loc[is_reasoning_row, "LLM_for_Workflow"].unique())

n_pipes = df["Pipeline_ID"].nunique()
n_orch = df["Orchestrator"].nunique()
n_reps_assumed = 3  # stated repetition count

n_methods_std = 4  # Direct(NR) + 3 Prompt2DAG variants
n_methods_rsn = 1  # Direct(Reasoning)

# Every (LLM, method) combination is expected once per orchestrator,
# pipeline and repetition.
runs_per_llm_method = n_orch * n_pipes * n_reps_assumed
ideal_total = (
    len(std_llms) * n_methods_std + len(rsn_llms) * n_methods_rsn
) * runs_per_llm_method

print("\n[3] Factorial grid (under stated assumptions)")
print(f"Standard LLMs observed: {len(std_llms)}")
print(f"Reasoning LLMs observed: {len(rsn_llms)}")
print(f"Ideal total runs if fully populated = {ideal_total:,}")
print(f"Observed total runs                 = {len(df):,}")
print(f"Missing vs ideal                    = {ideal_total - len(df):,}")

# 4) Coverage completeness by (Method × LLM).
#    The expected row count per LLM is derived from the pipeline×orchestrator
#    pairs actually observed, multiplied by the assumed repetition count, so
#    no fixed pipeline/orchestrator totals are baked in.
pipe_orch = df[["Pipeline_ID", "Orchestrator"]].drop_duplicates()
expected_pipe_orch = pipe_orch.shape[0]
expected_per_llm = expected_pipe_orch * n_reps_assumed

print("\n[4] Coverage by Method × LLM (expected rows per LLM if complete = "
      f"{expected_pipe_orch} pipe×orch × {n_reps_assumed} reps = {expected_per_llm})")

mx = df.groupby(["Method", "LLM_for_Workflow"]).size().rename("N").reset_index()
# The same expectation applies to every method, reasoning or not.
mx["Expected_N"] = expected_per_llm
mx["Missing_rows"] = mx["Expected_N"] - mx["N"]

# Within each method, list the largest deficits first.
mx_sorted = mx.sort_values(by=["Method", "Missing_rows"], ascending=[True, False])
print(mx_sorted.to_string(index=False))

# 5) Row counts by Method × Orchestrator, to reveal whether missing runs
#    cluster on a particular orchestrator.
print("\n[5] Coverage by Method × Orchestrator")
mo = df.groupby(["Method", "Orchestrator"]).size().rename("N").reset_index()
mo = mo.sort_values(by=["Method", "Orchestrator"])
print(mo.to_string(index=False))

# 6) Repeat count distribution per configuration cell
#    (Pipeline × Orchestrator × Method × LLM).
cell = ["Pipeline_ID", "Orchestrator", "Method", "LLM_for_Workflow"]
cell_counts = df.groupby(cell).size().reset_index(name="n_repeats_observed")

print("\n[6] Repeat count distribution per (pipeline, orch, method, llm) cell")
rep_dist = cell_counts["n_repeats_observed"].value_counts().sort_index()
print(rep_dist.to_string())

# 7) Cells observed fewer times than the assumed repetition count.
#    The threshold in the header is taken from n_reps_assumed so the message
#    cannot drift from the filter if the repetition count is changed.
incomplete = cell_counts[cell_counts["n_repeats_observed"] < n_reps_assumed].copy()
print(f"\n[7] Incomplete cells (<{n_reps_assumed} repeats):", len(incomplete))
if not incomplete.empty:
    # Show a small sample, then aggregate by method / LLM to aid diagnosis.
    print("\nSample incomplete cells:")
    print(incomplete.head(20).to_string(index=False))

    print("\nIncomplete-cell counts by Method:")
    # fill_value=0 reports methods without incomplete cells as an integer 0.
    print(incomplete["Method"].value_counts().reindex(METHOD_ORDER, fill_value=0).to_string())

    print("\nIncomplete-cell counts by Method × LLM (top 20):")
    top_incomp = (
        incomplete.groupby(["Method", "LLM_for_Workflow"]).size().reset_index(name="n_cells")
        .sort_values("n_cells", ascending=False)
        .head(20)
    )
    print(top_incomp.to_string(index=False))

# 8) For each Method×LLM, count the pipeline×orchestrator pairs that have no
#    run at all (cells with 0 rows rather than merely too few repeats).
print("\n[8] Missing pipeline×orchestrator pairs per Method×LLM (cells with 0 runs)")
full_pairs = set(map(tuple, pipe_orch.values.tolist()))

missing_rows = []
for (m, llm), sub in df.groupby(["Method", "LLM_for_Workflow"]):
    present_pairs = set(zip(sub["Pipeline_ID"], sub["Orchestrator"]))
    missing_pairs = full_pairs - present_pairs
    if missing_pairs:
        missing_rows.append({
            "Method": m,
            "LLM_for_Workflow": llm,
            "Missing_pipe_orch_pairs": len(missing_pairs),
        })

# Explicit columns keep the frame sortable when nothing is missing; a frame
# built from an empty list has no columns and sort_values would raise KeyError.
missing_pairs_df = pd.DataFrame(
    missing_rows,
    columns=["Method", "LLM_for_Workflow", "Missing_pipe_orch_pairs"],
).sort_values(["Missing_pipe_orch_pairs", "Method"], ascending=[False, True])

if missing_pairs_df.empty:
    print("None: every Method×LLM combination covers all pipeline×orchestrator pairs.")
else:
    print(missing_pairs_df.to_string(index=False))

print("\nDone.")

RUN COUNT RECONCILIATION

[1] Total rows: 8742
Pipelines: 38
Orchestrators: 3 ['airflow', 'dagster', 'prefect']
Methods: 5 ['Direct (Non-Reasoning)', 'Prompt2DAG (Template)', 'Prompt2DAG (LLM)', 'Prompt2DAG (Hybrid)', 'Direct (Reasoning)']

[2] Rows per Method
Method
Direct (Non-Reasoning)    2394
Prompt2DAG (Template)     1578
Prompt2DAG (LLM)          2043
Prompt2DAG (Hybrid)       2043
Direct (Reasoning)         684

[3] Factorial grid (under stated assumptions)
Standard LLMs observed: 7
Reasoning LLMs observed: 2
Ideal total runs if fully populated = 10,260
Observed total runs                 = 8,742
Missing vs ideal                    = 1,518

[4] Coverage by Method × LLM (expected rows per LLM if complete = 114 pipe×orch × 3 reps = 342)
                Method                        LLM_for_Workflow   N  Expected_N  Missing_rows
Direct (Non-Reasoning)               deepinfra-claude-4-sonnet 342         342             0
Direct (Non-Reasoning)                   deepinfra-deepseek_a