In [1]:
#Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import recall_score, precision_score, fbeta_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import ComplementNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


In [None]:

RANDOM_STATE = 1945

# --- 1) Read the convoy table and build the binary target (1 = some ship was sunk)
# Every missing cell in the file is replaced with 0.0 right after loading.
df = pd.read_csv("/mnt/data/Complete_Convoy_Data.csv").fillna(0.0)
if "Overall Sink Percentage" not in df.columns:
    raise ValueError("Expected 'Overall Sink Percentage' to derive a binary target.")
y = df["Overall Sink Percentage"].astype(float).gt(0.0).astype(int).values

# Feature matrix: numeric columns only, minus identifiers, dates and the
# column the target was derived from.
drop_cols = {
    "Unnamed: 0",
    "Convoy Number",
    "Depart_Date",
    "Arrival/Dispersal Date",
    "Overall Sink Percentage",
}
feature_cols = [
    col
    for col in df.columns
    if col not in drop_cols and np.issubdtype(df[col].dtype, np.number)
]
X = df[feature_cols].values

# --- 2) One stratified 75/25 train/holdout split, seeded for repeatability
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_STATE,
)

# --- 3) Define two tiny, model-appropriate pipelines + grids

# ComplementNB needs non-negative features, and so does the chi2 scorer.
pipe_cnb = Pipeline([
    # clip=True keeps held-out rows inside [0, 1]; without it a value below the
    # training minimum would be scaled to a negative number at predict time.
    ("scale", MinMaxScaler(clip=True)),
    ("select", SelectKBest(chi2, k="all")),    # default; 'k' is searched below
    ("clf", ComplementNB())
])

# Reduced feature count for the grid: half of the columns but at least 10,
# capped at the number of columns actually available so SelectKBest is never
# asked for more features than exist on a narrow table.
_n_features = X.shape[1]
_k_reduced = min(_n_features, max(10, _n_features // 2))
grid_cnb = {
    # If the cap makes the reduced option identical to "all", search "all" once.
    "select__k": ["all", _k_reduced] if _k_reduced < _n_features else ["all"],
    "clf__alpha": [0.05, 0.1, 0.5, 1.0],
    "clf__norm": [True, False],
}

# QDA benefits from standardization; PCA(whiten=True) stabilizes covariance a bit.
pipe_qda = Pipeline([
    ("scale", StandardScaler()),
    ("pca", PCA(whiten=True, random_state=RANDOM_STATE)),
    ("qda", QuadraticDiscriminantAnalysis())
])
grid_qda = {
    # None keeps every component (the data is still rotated and whitened);
    # 0.9 keeps just enough components to explain 90% of the variance.
    "pca__n_components": [None, 0.9],
    "qda__reg_param": [0.0, 0.001, 0.01, 0.1],
}

# --- 4) One helper to run a very standard GridSearchCV on recall
def tune_by_recall(pipeline, param_grid, X_train, y_train):
    """Grid-search ``pipeline`` over ``param_grid``, ranking candidates by recall.

    Uses a shuffled, stratified 5-fold split seeded with the module-level
    RANDOM_STATE, and refits the winning configuration on all of
    ``X_train`` / ``y_train``.

    Returns
    -------
    tuple
        ``(best_estimator, best_params, best_mean_cv_recall)``.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring="recall",
        cv=folds,
        refit=True,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_, search.best_score_

# --- 5) Tiny threshold tuner to squeeze more recall
def best_threshold_for_recall(est, X_val, y_val, min_precision=None):
    """Pick the decision threshold that maximises recall on ``(X_val, y_val)``.

    Scores come from ``predict_proba`` (positive-class column) when available,
    otherwise from ``decision_function`` min-max rescaled to 0..1 using the
    min and max observed on ``X_val``. A grid of 201 evenly spaced thresholds
    in [0, 1] is scanned.

    Recall alone is trivially maximised by the lowest threshold (predict
    everything positive), so ties on recall are broken by higher precision and
    then by the higher threshold. The result is the most selective cut-off
    that still reaches the best achievable recall.

    Parameters
    ----------
    est : fitted estimator
        Must expose ``predict_proba``, ``decision_function`` or ``predict``.
    X_val, y_val : array-like
        Data the threshold is chosen on; ``y_val`` holds 0/1 labels.
    min_precision : float or None
        If given, thresholds whose precision is below this floor are skipped.

    Returns
    -------
    tuple
        ``(threshold, recall_at_threshold)``. If the estimator offers no
        scores, or no threshold satisfies ``min_precision``, the threshold is
        0.5 and the recall is the one actually obtained at that fallback.
    """
    # get a score per sample
    if hasattr(est, "predict_proba"):
        scores = est.predict_proba(X_val)[:, 1]
    elif hasattr(est, "decision_function"):
        f = est.decision_function(X_val)
        # squish to 0..1 range for thresholding (epsilon guards a constant score)
        scores = (f - f.min()) / (f.max() - f.min() + 1e-12)
    else:
        # fallback (no scores available): report recall of the hard labels
        preds = est.predict(X_val)
        return 0.5, recall_score(y_val, preds, zero_division=0)

    best_t, best_key = None, None
    for t in np.linspace(0.0, 1.0, 201):
        preds = (scores >= t).astype(int)
        rec = recall_score(y_val, preds, zero_division=0)
        prec = precision_score(y_val, preds, zero_division=0)
        if min_precision is not None and prec < min_precision:
            continue
        key = (rec, prec)
        # ">=" lets a later (higher) threshold win an exact tie.
        if best_key is None or key >= best_key:
            best_key, best_t = key, t

    if best_t is None:
        # No threshold met the precision floor: fall back to 0.5 and return the
        # recall really achieved there instead of a sentinel value.
        preds = (scores >= 0.5).astype(int)
        return 0.5, recall_score(y_val, preds, zero_division=0)

    return float(best_t), best_key[0]

# --- 6) Run CNB, then QDA — same simple flow

def fit_eval(model_name, pipeline, grid, val_size=0.25):
    """Tune a pipeline for recall, choose a decision threshold, report hold-out metrics.

    The training split is divided once more into a fitting part and a
    validation part. Hyper-parameters are searched on the fitting part, the
    decision threshold is chosen on the validation part, and the test split is
    used only for the final report. Choosing the threshold on the test split
    itself would make the printed metrics optimistic.

    Reads the module-level ``X_tr``, ``y_tr``, ``X_te``, ``y_te`` and
    ``RANDOM_STATE``. The evaluated estimator is the one fitted on the fitting
    part; it is not refitted on the full training split.

    Parameters
    ----------
    model_name : str
        Label printed in the section header.
    pipeline : estimator
        Unfitted pipeline passed to ``tune_by_recall``.
    grid : dict
        Parameter grid passed to ``tune_by_recall``.
    val_size : float, default 0.25
        Share of the training split reserved for threshold selection.

    Returns
    -------
    dict
        Keys ``best_params``, ``threshold``, ``recall``, ``precision``, ``f2``
        and ``fn`` (false negatives on the test split).
    """
    print(f"\n=== {model_name} ===")

    # Reserve part of the training data for threshold selection so the test
    # split never influences any tuned quantity.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_tr, y_tr, test_size=val_size, stratify=y_tr, random_state=RANDOM_STATE
    )

    best_est, best_params, cv_recall = tune_by_recall(pipeline, grid, X_fit, y_fit)
    print("Best params:", best_params)
    print(f"CV mean recall: {cv_recall:.3f}")

    # Choose the threshold on the validation slice only.
    t_star, _ = best_threshold_for_recall(best_est, X_val, y_val, min_precision=None)

    # Score the untouched test split.
    if hasattr(best_est, "predict_proba"):
        scores = best_est.predict_proba(X_te)[:, 1]
    elif hasattr(best_est, "decision_function"):
        # Rescale with the validation min/max: that is the scale on which
        # best_threshold_for_recall expressed t_star.
        f_val = best_est.decision_function(X_val)
        lo, hi = f_val.min(), f_val.max()
        scores = (best_est.decision_function(X_te) - lo) / (hi - lo + 1e-12)
    else:
        scores = best_est.predict(X_te)  # hard labels

    y_hat = (scores >= t_star).astype(int)
    rec = recall_score(y_te, y_hat, zero_division=0)
    prec = precision_score(y_te, y_hat, zero_division=0)
    f2  = fbeta_score(y_te, y_hat, beta=2.0, zero_division=0)
    cm  = confusion_matrix(y_te, y_hat, labels=[0,1])
    fn  = int(cm[1,0])  # row 1 = actual positives, column 0 = predicted negative

    print(f"Threshold*: {t_star:.3f}")
    print(f"Recall:     {rec:.3f}")
    print(f"Precision:  {prec:.3f}")
    print(f"F2:         {f2:.3f}")
    print("Confusion matrix [[TN FP]\n                    [FN TP]]:\n", cm)
    print("\nClassification report:\n", classification_report(y_te, y_hat, zero_division=0))
    return {"best_params": best_params, "threshold": t_star, "recall": rec, "precision": prec, "f2": f2, "fn": fn}

# Run the identical tune -> threshold -> report flow for each model family.
res_cnb = fit_eval(model_name="ComplementNB", pipeline=pipe_cnb, grid=grid_cnb)
res_qda = fit_eval(model_name="QDA", pipeline=pipe_qda, grid=grid_qda)