In [1]:
# === Cell 0: Imports, paths, load features ===
import os
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    roc_auc_score, average_precision_score, confusion_matrix
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# ---- Paths ----
# The project root can be overridden through the DS203_PROJECT_ROOT environment
# variable so the notebook is not tied to a single machine; the original
# location stays as the default when the variable is unset.
BASE = Path(os.environ.get(
    "DS203_PROJECT_ROOT",
    r"c:\Users\aryan\OneDrive\Desktop\SEM 3\DS 203\DS203-Wildlife-Detection-Project",
))
DATA = BASE / "data"
FEAT_DIR = DATA / "features"
MODELS_DIR = BASE / "models"
RESULTS_DIR = BASE / "results" / "models"
# Output folders are created up front so later cells can write without checks.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# ---- Load features ----
# Pre-computed feature matrices (X_*) and label vectors (y_*) for each split.
X_train = np.load(FEAT_DIR / "X_train.npy")
y_train = np.load(FEAT_DIR / "y_train.npy")
X_val   = np.load(FEAT_DIR / "X_val.npy")
y_val   = np.load(FEAT_DIR / "y_val.npy")
X_test  = np.load(FEAT_DIR / "X_test.npy")
y_test  = np.load(FEAT_DIR / "y_test.npy")

# Fail early on misaligned files instead of deep inside a model's fit/predict.
assert len(X_train) == len(y_train), "train features/labels have different lengths"
assert len(X_val) == len(y_val), "val features/labels have different lengths"
assert len(X_test) == len(y_test), "test features/labels have different lengths"
assert X_train.shape[1:] == X_val.shape[1:] == X_test.shape[1:], "splits disagree on feature dimensions"

# Cell output: split shapes and the mean of each label vector.
X_train.shape, X_val.shape, X_test.shape, y_train.mean(), y_val.mean(), y_test.mean()


((24512, 3264),
 (1344, 3264),
 (1408, 3264),
 np.float64(0.26807278067885115),
 np.float64(0.2507440476190476),
 np.float64(0.2784090909090909))

In [2]:
# === Cell 1: Metrics helpers ===
def evaluate(y_true, y_pred, y_proba=None):
    """Compute classification metrics for one set of binary predictions.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : hard predicted labels.
    y_proba : optional scores for the positive class. When given, the
        ranking metrics are added to the result.

    Returns
    -------
    dict with keys ``acc``, ``precision``, ``recall`` and ``f1``; plus
    ``roc_auc`` and ``pr_auc`` when ``y_proba`` is not None. A ranking metric
    whose scorer raises ``ValueError`` is reported as NaN instead of
    propagating the error.
    """
    results = {"acc": accuracy_score(y_true, y_pred)}
    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )
    results["precision"] = precision
    results["recall"] = recall
    results["f1"] = f_score

    if y_proba is None:
        return results

    # Both ranking metrics share the same "NaN on ValueError" policy.
    ranking = {}
    for key, scorer in (("roc_auc", roc_auc_score), ("pr_auc", average_precision_score)):
        try:
            ranking[key] = scorer(y_true, y_proba)
        except ValueError:
            ranking[key] = np.nan
    results.update(ranking)
    return results

def print_report(name, metrics):
    """Print a single pipe-separated line summarising a metrics dict.

    Parameters
    ----------
    name : str label for the row, right-aligned in an 8-character column.
    metrics : dict holding ``acc``, ``precision``, ``recall`` and ``f1``;
        ``roc_auc`` and ``pr_auc`` are optional and shown as ``nan`` when
        absent.
    """
    fields = [
        f"{name:>8s}",
        f"Acc {metrics['acc']:.4f}",
        f"P {metrics['precision']:.4f}",
        f"R {metrics['recall']:.4f}",
        f"F1 {metrics['f1']:.4f}",
        f"ROC-AUC {metrics.get('roc_auc', np.nan):.4f}",
        f"PR-AUC {metrics.get('pr_auc', np.nan):.4f}",
    ]
    print(" | ".join(fields))

def plot_curves(y_true, y_proba, title_prefix):
    """Draw the ROC curve and the precision-recall curve, one figure each.

    Parameters
    ----------
    y_true : ground-truth binary labels.
    y_proba : scores for the positive class, passed to ``roc_curve`` and
        ``precision_recall_curve``.
    title_prefix : str prepended to each figure title
        (``"<prefix> - ROC"`` and ``"<prefix> - PR"``).

    Returns
    -------
    None. Both figures are displayed with ``plt.show()``.
    """
    # Imported locally: these two functions are not part of the notebook's
    # top-level sklearn.metrics import.
    from sklearn.metrics import roc_curve, precision_recall_curve

    # --- ROC ---
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    fig_roc, ax_roc = plt.subplots()
    ax_roc.plot(fpr, tpr, label="ROC")
    ax_roc.plot([0, 1], [0, 1], "--", label="Chance")  # diagonal reference line
    ax_roc.set(xlabel="FPR", ylabel="TPR", title=f"{title_prefix} - ROC")
    ax_roc.legend()  # without this the label= arguments are never rendered
    fig_roc.tight_layout()
    plt.show()

    # --- Precision-recall ---
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    fig_pr, ax_pr = plt.subplots()
    ax_pr.plot(recall, precision, label="PR")
    ax_pr.set(xlabel="Recall", ylabel="Precision", title=f"{title_prefix} - PR")
    ax_pr.legend()
    fig_pr.tight_layout()
    plt.show()


In [3]:
# === Cell 2: Logistic Regression (balanced) ===
# Standardise the features, then fit a class-weighted logistic regression.
# n_jobs is left at its default (None), as before.
logreg = Pipeline([
    ("scaler", StandardScaler(with_mean=True)),
    ("clf", LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        solver="lbfgs",
    )),
])

logreg.fit(X_train, y_train)

# Hard labels on VAL
val_pred_lr = logreg.predict(X_val)
# LogisticRegression always provides predict_proba, so no decision_function
# fallback is needed. Column 1 is the second entry of classes_
# (assumed to be the positive label 1 - confirm if labels are not 0/1).
val_proba_lr = logreg.predict_proba(X_val)[:, 1]

m_lr = evaluate(y_val, val_pred_lr, val_proba_lr)
print_report("LogReg", m_lr)


  LogReg | Acc 0.6868 | P 0.4129 | R 0.5905 | F1 0.4860 | ROC-AUC 0.6988 | PR-AUC 0.4677


In [4]:
# === Cell 3: RandomForest (balanced_subsample) ===
# 400 trees with no depth limit, trained on all cores; class weights are
# recomputed per bootstrap sample and the seed is fixed.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=400,
    max_depth=None,
    class_weight="balanced_subsample",
    n_jobs=-1,
).fit(X_train, y_train)

# Scores (column 1 of predict_proba) and hard labels on VAL
val_proba_rf = rf.predict_proba(X_val)[:, 1]
val_pred_rf = rf.predict(X_val)

m_rf = evaluate(y_true=y_val, y_pred=val_pred_rf, y_proba=val_proba_rf)
print_report("RandFor", m_rf)


 RandFor | Acc 0.7708 | P 0.8537 | R 0.1039 | F1 0.1852 | ROC-AUC 0.8677 | PR-AUC 0.6631


In [5]:
# === Cell 4: Select best on VAL, evaluate on TEST, save ===
# Each candidate: (name, fitted model, VAL metrics dict, VAL positive-class scores)
candidates = [
    ("logreg", logreg, m_lr, val_proba_lr),
    ("rf",     rf,     m_rf, val_proba_rf),
]

# Choose by F1 of the hard VAL predictions (swap the key for "pr_auc" to rank
# by PR-AUC instead). On ties, max() keeps the earlier candidate.
best_name, best_model, best_m, best_val_proba = max(candidates, key=lambda c: c[2]["f1"])
print(f"\nBest on VAL: {best_name} -> {best_m}\n")

# Test evaluation
test_pred = best_model.predict(X_test)
if hasattr(best_model, "predict_proba"):
    test_proba = best_model.predict_proba(X_test)[:, 1]
else:
    # No probabilities available: min-max scale the decision scores to [0, 1];
    # the epsilon guards against division by zero when all scores are equal.
    dec = best_model.decision_function(X_test)
    test_proba = (dec - dec.min()) / (dec.max() - dec.min() + 1e-9)

m_test = evaluate(y_test, test_pred, test_proba)
print_report("TEST", m_test)

# Save the selected model
model_path = MODELS_DIR / "final_model.pkl"
with open(model_path, "wb") as f:
    pickle.dump(best_model, f)
print("Saved model to:", model_path)

# Save metrics as CSV: one VAL row per candidate plus one TEST row for the winner.
# A comprehension keeps the loop variables (including `_`) out of the notebook
# namespace.
metrics_path = RESULTS_DIR / "metrics.csv"
rows = [
    {"model": cand_name, **cand_metrics, "split": "val"}
    for cand_name, _cand_model, cand_metrics, _cand_proba in candidates
]
rows.append({"model": best_name, **m_test, "split": "test"})
pd.DataFrame(rows).to_csv(metrics_path, index=False)
print("Saved metrics to:", metrics_path)



Best on VAL: logreg -> {'acc': 0.6867559523809523, 'precision': 0.41286307053941906, 'recall': 0.5905044510385756, 'f1': 0.48595848595848595, 'roc_auc': 0.6988056895500045, 'pr_auc': 0.46769943574354994}

    TEST | Acc 0.6761 | P 0.4389 | R 0.5867 | F1 0.5022 | ROC-AUC 0.7104 | PR-AUC 0.5065
Saved model to: c:\Users\aryan\OneDrive\Desktop\SEM 3\DS 203\DS203-Wildlife-Detection-Project\models\final_model.pkl
Saved metrics to: c:\Users\aryan\OneDrive\Desktop\SEM 3\DS 203\DS203-Wildlife-Detection-Project\results\models\metrics.csv


In [6]:
# === Cell 6 (optional): Threshold sweep to tune F1 on VAL ===
def threshold_sweep(y_true, y_score, thr_list=None):
    """Evaluate hard predictions obtained by thresholding scores at several cut-offs.

    Parameters
    ----------
    y_true : ground-truth binary labels.
    y_score : positive-class scores; any array-like (lists are accepted).
    thr_list : iterable of thresholds. Defaults to 17 evenly spaced values
        from 0.1 to 0.9 (step 0.05).

    Returns
    -------
    pandas.DataFrame with one row per threshold: column ``thr`` followed by
    the metrics returned by ``evaluate``. A sample is predicted positive when
    its score is ``>=`` the threshold. The frame is empty when ``thr_list``
    is empty.
    """
    if thr_list is None:
        thr_list = np.linspace(0.1, 0.9, 17)
    # Allow plain Python sequences: the comparison below needs an ndarray.
    y_score = np.asarray(y_score)

    # ROC-AUC / PR-AUC depend only on (y_true, y_score), not on the threshold,
    # so they are computed on the first iteration and reused afterwards.
    ranking = None
    rows = []
    for t in thr_list:
        yp = (y_score >= t).astype(int)
        if ranking is None:
            m = evaluate(y_true, yp, y_score)
            ranking = {key: m[key] for key in ("roc_auc", "pr_auc")}
        else:
            m = {**evaluate(y_true, yp), **ranking}
        rows.append({"thr": t, **m})
    return pd.DataFrame(rows)

# Sweep thresholds over the VAL scores of the selected model and persist the table.
thr_df = threshold_sweep(y_true=y_val, y_score=best_val_proba)
thr_df.to_csv(RESULTS_DIR / "threshold_sweep_val.csv", index=False)
# Cell output: the five thresholds with the highest VAL F1.
thr_df.sort_values(by="f1", ascending=False).head(n=5)


Unnamed: 0,thr,acc,precision,recall,f1,roc_auc,pr_auc
7,0.45,0.68006,0.410405,0.632047,0.497664,0.698806,0.467699
6,0.4,0.655506,0.38986,0.661721,0.490649,0.698806,0.467699
9,0.55,0.706845,0.43418,0.557864,0.488312,0.698806,0.467699
8,0.5,0.686756,0.412863,0.590504,0.485958,0.698806,0.467699
5,0.35,0.625,0.36725,0.68546,0.478261,0.698806,0.467699
