In [None]:
# Cell 1 - Installing required libraries for statistical analysis
!pip install scipy # For performing statistical operations
!pip install pandas # Stores data from analysis into tables
!pip install numpy # For numerical operations and array manipulation
!pip install statsmodels # For False Discovery rate (FDR) correction

In [None]:
# Cell 2 - Importing packages required for statistical analysis
import os # For interacting with the operating system
import numpy as np # For numerical operations and array manipulation
import pandas as pd # Stores data from analysis into tables
from scipy.stats import mannwhitneyu # To apply the Mann-Whitney U test
from statsmodels.stats.multitest import multipletests # For False Discovery rate (FDR) correction

In [None]:
# Cell 3 - Statistical analysis code
# -------- ROUT-like outlier removal --------
def rout_outlier_removal(data, Q=1):
    """
    Approximate ROUT outlier filtering using robust Z (median/MAD).

    Non-finite entries (NaN/Inf) are always dropped. The remaining values are
    scored with the robust Z-score ``0.6745 * (x - median) / MAD`` and values
    whose absolute score exceeds the cutoff selected by ``Q`` are removed.

    Parameters
    ----------
    data : array-like
        Values convertible to a float NumPy array.
    Q : float, optional
        Stringency preset: 0.1 -> cutoff 4.5 (lenient), 1 -> cutoff 3.5
        (default), 5 -> cutoff 2.8 (stricter). Any other value is truncated
        to an integer and looked up again; if that also misses, 3.5 is used.

    Returns
    -------
    numpy.ndarray
        The finite values that pass the cutoff. The finite values are
        returned unfiltered when fewer than 3 remain, when the MAD is 0,
        or when filtering would remove everything.
    """
    arr = np.asarray(data, dtype=float)
    arr = arr[np.isfinite(arr)]  # drop NaN/Inf
    if arr.size < 3:
        return arr  # too few points to estimate spread robustly
    median = np.median(arr)
    mad = np.median(np.abs(arr - median))
    if mad == 0:
        return arr  # robust Z is undefined when MAD is zero
    robust_z = 0.6745 * (arr - median) / mad

    cutoffs = {0.1: 4.5, 1.0: 3.5, 5.0: 2.8}
    # Look up the preset by its float value first: truncating with int()
    # would turn Q=0.1 into 0 and make the lenient preset unreachable.
    cutoff = cutoffs.get(float(Q))
    if cutoff is None:
        # Previous behaviour for non-preset values: use the integer part.
        cutoff = cutoffs.get(int(Q), 3.5)

    kept = arr[np.abs(robust_z) <= cutoff]
    return kept if kept.size > 0 else arr  # safeguard

def _star_code(p):
    if p < 1e-4: return "****"
    if p < 1e-3: return "***"
    if p < 1e-2: return "**"
    if p < 5e-2: return "*"
    return "ns"

# -------- Main statistical analysis --------
def statistical_analysis(input_csv, output_dir, Q=1):
    """
    Run a two-group Mann-Whitney U comparison on every numeric feature column.

    The CSV at ``input_csv`` must contain a 'Group' or 'group' column holding
    exactly two distinct non-null labels. The first label encountered becomes
    group 0 and the second becomes group 1. For each numeric column (excluding
    the grouping column itself), both groups are filtered with
    ``rout_outlier_removal`` and compared with a two-sided Mann-Whitney U test.

    Two files are written to ``output_dir``:

    * ``mann_whitney_raw.csv`` - per-feature U statistic, raw p-value, group
      means and medians, direction and star code, sorted by raw p-value.
    * ``mann_whitney_FDR.csv`` - the same columns plus the Benjamini-Hochberg
      adjusted p-value and its star code, sorted by adjusted p-value.

    Parameters
    ----------
    input_csv : str
        Path of the CSV file to analyse.
    output_dir : str
        Directory for the result files; created if it does not exist.
    Q : float, optional
        Stringency passed through to ``rout_outlier_removal``.

    Returns
    -------
    None
        Results are written to disk and a summary is printed.

    Raises
    ------
    ValueError
        If the group column is missing, does not contain exactly two labels,
        or no numeric feature columns are available.
    """
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_csv(input_csv)

    # Ensure group column
    if "Group" in df.columns:
        group_col = "Group"
    elif "group" in df.columns:
        group_col = "group"
    else:
        raise ValueError("The input CSV must contain a 'Group' or 'group' column.")

    # Validate exactly 2 groups and map them in order of first appearance
    labels = df[group_col].dropna().unique()
    if len(labels) != 2:
        raise ValueError(f"Expected exactly 2 groups in '{group_col}', found: {labels}")

    g0_label, g1_label = labels[0], labels[1]
    df["_group_code"] = df[group_col].map({g0_label: 0, g1_label: 1})

    # Numeric feature columns. The grouping column is excluded as well as the
    # helper code column: a numerically coded group (e.g. 0/1) would otherwise
    # be tested against itself and add a spurious row to the FDR family.
    excluded = {"_group_code", group_col}
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    feature_cols = [c for c in num_cols if c not in excluded]
    if not feature_cols:
        raise ValueError("No numeric feature columns found to analyze.")

    y = df["_group_code"].values
    rows = []

    # Mann - Whitney per feature (with ROUT filtering)
    for feat in feature_cols:
        x0_f = rout_outlier_removal(df.loc[y == 0, feat].values, Q=Q)
        x1_f = rout_outlier_removal(df.loc[y == 1, feat].values, Q=Q)
        if x0_f.size == 0 or x1_f.size == 0:
            continue  # no finite data left in at least one group

        stat, p = mannwhitneyu(x0_f, x1_f, alternative="two-sided")
        mean0, mean1 = float(np.mean(x0_f)), float(np.mean(x1_f))
        # Direction by medians (more in line with MWU); change to means if you prefer
        med0, med1 = float(np.median(x0_f)), float(np.median(x1_f))
        if med1 > med0:
            direction = f"↑ {g1_label}"
        elif med1 < med0:
            direction = f"↓ {g1_label}"
        else:
            # Equal medians are neither an increase nor a decrease.
            direction = f"↔ {g1_label}"
        sig_raw = _star_code(p)

        rows.append([
            feat, stat, p,
            mean0, mean1,
            med0, med1,
            direction, sig_raw,
        ])

    if not rows:
        print("No valid features survived ROUT filtering or insufficient data for tests.")
        return

    # Single definition of the MWU result columns, shared by both outputs
    result_cols = [
        "Feature", "U_stat", "raw_p_value",
        f"Mean_{g0_label}", f"Mean_{g1_label}",
        f"Median_{g0_label}", f"Median_{g1_label}",
        "Direction", "Significance_raw",
    ]
    df_results = pd.DataFrame(rows, columns=result_cols)

    # ---------------- RAW CSV: ONLY MWU columns ----------------
    df_raw = df_results.sort_values("raw_p_value", na_position="last").reset_index(drop=True)
    raw_path = os.path.join(output_dir, "mann_whitney_raw.csv")
    df_raw[result_cols].to_csv(raw_path, index=False)

    # ---------------- FDR CSV: MWU + FDR columns ----------------
    df_fdr = df_raw.copy()
    df_fdr["adj_p_value"] = np.nan
    df_fdr["Significance_FDR"] = "ns"

    # Only non-missing p-values take part in the correction
    valid_mask = df_fdr["raw_p_value"].notna().values
    if valid_mask.any():
        _, pvals_corrected, _, _ = multipletests(
            df_fdr.loc[valid_mask, "raw_p_value"].values,
            method="fdr_bh"
        )
        df_fdr.loc[valid_mask, "adj_p_value"] = pvals_corrected
        df_fdr.loc[valid_mask, "Significance_FDR"] = [
            _star_code(pv) for pv in df_fdr.loc[valid_mask, "adj_p_value"].values
        ]

    df_fdr = df_fdr.sort_values("adj_p_value", na_position="last").reset_index(drop=True)
    fdr_path = os.path.join(output_dir, "mann_whitney_FDR.csv")
    df_fdr.to_csv(fdr_path, index=False)

    print("Saved:")
    print("  • RAW (MWU only):", raw_path)
    print("  • FDR (MWU + FDR):", fdr_path)
    print("\nTop (FDR-sorted):")
    print(df_fdr.head(15).to_string(index=False))

# -------- RUN --------
if __name__ == "__main__":
    # Choose the dataset to analyse by keeping exactly one `input_csv`
    # assignment active (uncommented) below.
    input_csv = "/Users/MFTA_results.csv"  # MFTA results (currently active)
    # input_csv = "/Users/Radiomics_results.csv"  # Radiomics results

    output_dir = "/Users/Output"  # Change to your output pathname
    statistical_analysis(input_csv=input_csv, output_dir=output_dir, Q=1)