In [None]:
# ===== Majority-vote model with "Displacement => normal if HIGH" =====
# HOW TO USE:
#  1) Run the cell. Upload the Cancer CSV first, then the Normal CSV.
#  2) If auto-detect picks wrong columns, set SPEED_COL / DISP_COL / ANGLE_COL below.

import io, json, numpy as np, pandas as pd
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

# --------- (OPTIONAL) Hard-code your column names here ---------
SPEED_COL = None         # e.g. "mean_speed(px/s)"
DISP_COL  = None         # e.g. "total_displacement(px)"
ANGLE_COL = None         # e.g. "mean_turn_angle(deg)"
K_FOLDS   = 5

# --------- Utilities ---------
def autodetect(df, keywords, fallback=None):
    """
    Return the first numeric column of `df` whose name contains any keyword.

    Matching is a case-insensitive substring test. Columns are scanned in
    DataFrame order, so an earlier column wins over a later one regardless of
    the order of `keywords`.

    Parameters:
      df        DataFrame to inspect.
      keywords  iterable of substrings to look for in the column names.
      fallback  value returned when nothing matches (ignored if falsy).

    Returns: the matching column label; otherwise `fallback` if truthy;
    otherwise the first numeric column, or None when `df` has no numeric
    columns.
    """
    numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    # Normalize both sides: column labels may be non-strings (e.g. integer
    # headers) and keywords may be passed with capital letters.
    wanted = [str(kw).lower() for kw in keywords]
    for c in numeric_cols:
        name = str(c).lower()
        if any(kw in name for kw in wanted):
            return c
    return fallback or (numeric_cols[0] if numeric_cols else None)

def youden_threshold_poslabel(x, y, pos_label):
    """
    Find the cut-off on `x` that maximizes Youden's J (TPR - FPR) when the
    class `pos_label` is treated as the positive class and larger scores are
    taken to mean "more likely pos_label".

    Parameters:
      x          scores; coerced to numeric, non-finite entries are dropped.
      y          integer class labels aligned with `x`.
      pos_label  the label value to treat as positive.

    Returns: (threshold, AUC_for_pos_label, Jmax). When fewer than two
    distinct labels or fewer than two distinct scores remain, or no finite
    threshold exists, returns (None, nan, nan).
    """
    no_result = (None, np.nan, np.nan)

    scores = pd.to_numeric(pd.Series(x), errors="coerce").to_numpy()
    labels = pd.Series(y).astype(int).to_numpy()
    keep = np.isfinite(scores) & np.isfinite(labels)
    scores = scores[keep]
    labels = labels[keep]

    # A ROC curve needs both classes and some variation in the scores.
    if np.unique(labels).size < 2 or np.unique(scores).size < 2:
        return no_result

    fpr, tpr, cutoffs = roc_curve(labels, scores, pos_label=pos_label)
    youden = tpr - fpr
    best = int(np.nanargmax(youden))
    if not np.isfinite(cutoffs[best]):
        # The best point carries a non-finite threshold: fall back to the
        # best point among the finite thresholds.
        usable = np.flatnonzero(np.isfinite(cutoffs))
        if usable.size == 0:
            return no_result
        best = int(usable[np.argmax(youden[usable])])

    # AUC with pos_label as the positive class (labels recoded to 0/1).
    is_pos = (labels == pos_label).astype(int)
    auc = float(roc_auc_score(is_pos, scores))
    return float(cutoffs[best]), auc, float(youden[best])

def predict_majority(df, cols, thr):
    """
    Classify each row by a 2-of-3 majority vote over three threshold rules.

    A feature casts a cancer vote (1) when:
      - Speed:        value >= thr['Speed']
      - Displacement: value <  thr['Displacement']   (>= counts as NORMAL)
      - TurningAngle: value >= thr['TurningAngle']

    Parameters:
      df    DataFrame holding the feature columns.
      cols  mapping 'Speed' / 'Displacement' / 'TurningAngle' -> column name.
      thr   mapping with the same keys -> numeric threshold.

    Returns: (yhat, V) where V is an (n_rows, 3) 0/1 array of the votes in
    the order Speed, Displacement, TurningAngle, and yhat is 1 wherever at
    least two votes are 1.
    """
    cancer_votes = [
        df[cols['Speed']] >= thr['Speed'],
        df[cols['Displacement']] < thr['Displacement'],  # high displacement => normal
        df[cols['TurningAngle']] >= thr['TurningAngle'],
    ]
    V = np.column_stack([vote.astype(int) for vote in cancer_votes])
    n_cancer_votes = V.sum(axis=1)
    yhat = (n_cancer_votes >= 2).astype(int)
    return yhat, V

def cross_validated_accuracy(df, y, cols, k=5, seed=42):
    """
    Estimate majority-vote accuracy with stratified K-fold cross-validation.

    In every fold the three thresholds are re-learned on the training part
    (Speed and TurningAngle with cancer=1 as the positive class, Displacement
    with normal=0 as the positive class) and scored on the held-out part.
    Folds in which any threshold cannot be learned are skipped.

    Parameters:
      df    DataFrame holding the feature columns.
      y     pandas Series of 0/1 labels aligned with `df`.
      cols  mapping 'Speed' / 'Displacement' / 'TurningAngle' -> column name.
      k     requested number of folds; capped by the smaller class size.
      seed  random_state for the shuffled split.

    Returns: (mean_accuracy, std_accuracy, n_splits), or None when a class
    has fewer than two samples or no fold produced a score.
    """
    labels = y.astype(int).values
    n_pos = int((labels == 1).sum())
    n_neg = int((labels == 0).sum())
    if n_pos < 2 or n_neg < 2:
        return None

    # Never ask for more folds than the smaller class can fill.
    n_splits = max(2, min(k, n_pos, n_neg))
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # (feature, class treated as positive when the value is high)
    orientations = (('Speed', 1), ('Displacement', 0), ('TurningAngle', 1))

    fold_scores = []
    for train_idx, valid_idx in splitter.split(df, labels):
        train_df = df.iloc[train_idx]
        train_y = labels[train_idx]

        fold_thr = {}
        for feature, positive in orientations:
            fold_thr[feature] = youden_threshold_poslabel(
                train_df[cols[feature]], train_y, pos_label=positive
            )[0]

        if any(value is None for value in fold_thr.values()):
            continue  # threshold not learnable on this training split

        predicted, _ = predict_majority(df.iloc[valid_idx], cols, fold_thr)
        fold_scores.append((predicted == labels[valid_idx]).mean())

    if len(fold_scores) == 0:
        return None
    return float(np.mean(fold_scores)), float(np.std(fold_scores)), int(n_splits)

# --------- Load data (Colab upload) ---------
def read_first(upload_dict):
    """
    Parse the first uploaded file as CSV and return it as a DataFrame.

    `upload_dict` is the mapping returned by the upload call (file name ->
    content, either bytes or a file-like object). Raises ValueError when
    nothing was uploaded; extra files beyond the first are ignored with a
    printed note.
    """
    if not upload_dict:
        raise ValueError("no file was uploaded")
    fname, content = next(iter(upload_dict.items()))
    if len(upload_dict) > 1:
        print(f"Note: several files were uploaded; using only '{fname}'.")
    data = content if isinstance(content, bytes) else content.read()
    return pd.read_csv(io.BytesIO(data))

# Step 1: obtain the raw uploads (only possible inside Colab).
try:
    from google.colab import files
    print("Upload the *Cancer* CSV now …")
    up_c = files.upload()
    print("Upload the *Normal* CSV now …")
    up_n = files.upload()
except Exception as e:
    raise SystemExit(f"Colab file upload failed or not running in Colab: {e}") from e

# Step 2: parse them. Kept separate so that a malformed CSV or a cancelled
# upload is not reported as a Colab problem.
try:
    df_cancer = read_first(up_c)
    df_normal = read_first(up_n)
except Exception as e:
    raise SystemExit(f"Could not read the uploaded CSV files: {e}") from e

# Tag each source with its class (cancer=1, normal=0) and stack them.
df_cancer["label"] = 1
df_normal["label"] = 0
df = pd.concat([df_cancer, df_normal], ignore_index=True)

# --------- Pick columns (auto or manual) ---------
# Auto-detect on the feature columns only, so the synthetic "label" column
# can never be chosen as a feature.
_feature_frame = df.drop(columns=["label"])
if SPEED_COL is None:
    # "velocity" replaces the former single-letter keyword "v", which matched
    # any column name containing the letter v.
    SPEED_COL = autodetect(_feature_frame, ["speed", "mean_speed", "velocity"])
if DISP_COL is None:
    DISP_COL  = autodetect(_feature_frame, ["disp", "displacement", "net"])
if ANGLE_COL is None:
    ANGLE_COL = autodetect(_feature_frame, ["angle", "theta", "turn"])

cols = {"Speed": SPEED_COL, "Displacement": DISP_COL, "TurningAngle": ANGLE_COL}
if any(v is None for v in cols.values()):
    raise SystemExit("Could not auto-detect all three feature columns. Set SPEED_COL / DISP_COL / ANGLE_COL at the top.")

# A manually configured name may not exist in the uploaded files.
_missing = [c for c in cols.values() if c not in df.columns]
if _missing:
    raise SystemExit(f"Column(s) not found in the uploaded data: {_missing}. Set SPEED_COL / DISP_COL / ANGLE_COL at the top.")

# autodetect falls back to the first numeric column when no keyword matches,
# so several features can silently resolve to the same column.
if len(set(cols.values())) < len(cols):
    raise SystemExit(f"Auto-detect assigned the same column to more than one feature: {cols}. Set SPEED_COL / DISP_COL / ANGLE_COL at the top.")

# keep only numeric versions of the selected columns
for c in cols.values():
    df[c] = pd.to_numeric(df[c], errors="coerce")
# rows with a missing/unparseable feature value cannot vote; drop them
df = df.dropna(subset=list(cols.values()) + ["label"]).reset_index(drop=True)

# --------- Learn final thresholds on ALL rows with the requested orientations ---------
thr_final = {}
aucJ = {}  # {name: (AUC, J)}
thr_final['Speed'],        auc_s, J_s = youden_threshold_poslabel(df[cols['Speed']],        df["label"], pos_label=1)  # cancer if high
thr_final['Displacement'], auc_d, J_d = youden_threshold_poslabel(df[cols['Displacement']], df["label"], pos_label=0)  # NORMAL if high
thr_final['TurningAngle'], auc_a, J_a = youden_threshold_poslabel(df[cols['TurningAngle']], df["label"], pos_label=1)  # cancer if high
aucJ['Speed'] = (auc_s, J_s); aucJ['Displacement'] = (auc_d, J_d); aucJ['TurningAngle'] = (auc_a, J_a)

# youden_threshold_poslabel returns None as the threshold when it cannot learn
# one; stop here with a clear message instead of failing later inside a
# comparison or a number format.
_unlearned = [name for name, value in thr_final.items() if value is None]
if _unlearned:
    raise SystemExit(
        f"Could not learn a threshold for: {', '.join(_unlearned)}. "
        "Both classes must be present and each feature needs at least two distinct finite values."
    )

# --------- Cross-validated accuracy (fair estimate) ---------
cv = cross_validated_accuracy(df[list(cols.values())], df["label"], cols, k=K_FOLDS, seed=42)

# --------- Apparent accuracy with final thresholds (on all rows) ---------
yhat_all, _ = predict_majority(df, cols, thr_final)
acc_apparent = float((yhat_all == df["label"].values).mean())
# labels=[0, 1] pins the matrix to 2x2 in Normal, Cancer order even when a
# class never appears among the true or predicted labels.
cm = confusion_matrix(df["label"], yhat_all, labels=[0, 1])

# --------- Print results ---------
# One line per feature: decision rule, AUC / Youden J, and the source column.
print("\n=== Learned thresholds (use these in your app) ===")
print(f"Speed:        cancer if value >= {thr_final['Speed']:.4f}      (AUC={aucJ['Speed'][0]:.3f}, Youden J={aucJ['Speed'][1]:.3f})   column='{cols['Speed']}'")
print(f"Displacement: NORMAL if value >= {thr_final['Displacement']:.4f}  (AUC={aucJ['Displacement'][0]:.3f}, Youden J={aucJ['Displacement'][1]:.3f})   column='{cols['Displacement']}'")
print(f"TurningAngle: cancer if value >= {thr_final['TurningAngle']:.4f}  (AUC={aucJ['TurningAngle'][0]:.3f}, Youden J={aucJ['TurningAngle'][1]:.3f})   column='{cols['TurningAngle']}'")

# cv is None when cross-validation could not be run, otherwise it is the
# (mean, std, n_splits) tuple from cross_validated_accuracy.
if cv is None:
    print("\nCross‑validated accuracy: not enough samples per class to run K‑fold CV.")
else:
    mean_acc, std_acc, n_splits = cv
    print(f"\nCross‑validated majority‑vote accuracy (Stratified {n_splits}-Fold): {100*mean_acc:.2f}%  ± {100*std_acc:.2f}%")
print(f"Apparent majority‑vote accuracy on all rows (fit+test on same data): {100*acc_apparent:.2f}%")

# Labelled view of the confusion matrix.
print("\nConfusion matrix (rows=true, cols=pred):")
print(
    pd.DataFrame(
        cm,
        index=["Normal(0)", "Cancer(1)"],
        columns=["Normal(0)", "Cancer(1)"],
    )
)

# --------- Save thresholds for your app ---------
payload = {
    "columns": cols,
    # explicit rule text, so you can plug into your app:
    "rules": {
        "Speed":        {"cancer_if": ">=", "threshold": float(thr_final['Speed'])},
        "Displacement": {"normal_if": ">=", "threshold": float(thr_final['Displacement'])},  # flipped as requested
        "TurningAngle": {"cancer_if": ">=", "threshold": float(thr_final['TurningAngle'])},
    },
    # cv is None when cross-validation could not be run
    "cv_accuracy_mean": float(cv[0]) if cv is not None else None,
    "cv_accuracy_std": float(cv[1]) if cv is not None else None,
    "apparent_accuracy": float(acc_apparent)
}
with open("thresholds.json", "w", encoding="utf-8") as f:
    json.dump(payload, f, indent=2)
print("\nSaved thresholds.json with everything you need.")

# Browser download is best-effort: the file is already on disk, so a failure
# here is reported rather than raised (and no longer silently ignored).
try:
    from google.colab import files
    files.download("thresholds.json")
except Exception as e:
    print(f"Automatic download skipped ({e}); thresholds.json is in the current working directory.")


Upload the *Cancer* CSV now …


Saving cancer1_features.csv to cancer1_features (6).csv
Upload the *Normal* CSV now …


Saving normal_features.csv to normal_features (5).csv

=== Learned thresholds (use these in your app) ===
Speed:        cancer if value >= 46.5397      (AUC=0.820, Youden J=0.545)   column='mean_speed(px/s)'
Displacement: NORMAL if value >= 29.2460  (AUC=0.579, Youden J=0.175)   column='total_displacement(px)'
TurningAngle: cancer if value >= 74.8213  (AUC=0.711, Youden J=0.407)   column='mean_turn_angle(deg)'

Cross‑validated majority‑vote accuracy (Stratified 5-Fold): 85.10%  ± 3.49%
Apparent majority‑vote accuracy on all rows (fit+test on same data): 85.19%

Confusion matrix (rows=true, cols=pred):
           Normal(0)  Cancer(1)
Normal(0)         83         55
Cancer(1)        122        935

Saved thresholds.json with everything you need.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>