In [None]:
# ✅ STEP 1: Download and run notebook from GitHub (no Google Drive required)

import requests
import nbformat
from IPython import get_ipython

def run_notebook_from_github(url):
    """
    Download a Jupyter notebook from a raw GitHub URL and run its code cells.

    Code cells are executed in order inside the *current* IPython kernel, so
    every name the remote notebook defines ends up in this session's namespace.
    Markdown and raw cells are skipped. A failing cell is reported and the
    remaining cells are still attempted.

    WARNING: this runs whatever code the URL serves, with the privileges of
    this kernel. Only point it at notebooks you trust.

    Parameters:
    url (str): Raw GitHub URL to a .ipynb notebook file

    Returns:
    None. Progress and failures are reported via print().
    """
    print("Downloading notebook from GitHub...")
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to download notebook: {e}")
        return

    print("📖 Parsing notebook content...")
    try:
        notebook = nbformat.reads(response.text, as_version=4)
    except Exception as e:
        print(f"Failed to parse notebook: {e}")
        return

    ipython = get_ipython()
    if ipython is None:
        # get_ipython() returns None when not running under IPython/Jupyter.
        print("Failed to run notebook: no active IPython shell was found.")
        return

    print("Running notebook cells...\n")

    failed_cells = []
    for i, cell in enumerate(notebook.cells):
        if cell.cell_type != 'code':
            continue

        print(f"▶  Executing cell [{i + 1}]...")
        # run_cell() does not raise when the cell fails: it shows the traceback
        # and returns an ExecutionResult, so the outcome must be inspected.
        result = ipython.run_cell(cell.source)
        if not result.success:
            error = result.error_before_exec or result.error_in_exec
            print(f" Error in cell [{i + 1}]: {type(error).__name__}: {error}")
            failed_cells.append(i + 1)

    print("\n All executable cells have been processed.")
    if failed_cells:
        print(f" {len(failed_cells)} cell(s) failed: {failed_cells}")

# Raw GitHub URL of the CleanedData notebook to fetch (replace with your own)
notebook_url = "https://raw.githubusercontent.com/Hushpuppyzac/DLI-Assignment/main/CleanedData.ipynb"

# Download that notebook and execute its code cells in this kernel
run_notebook_from_github(url=notebook_url)


In [None]:
# ==========================================
# ✅ STEP 2 — Single Algorithm WITH Epochs
# Algorithm: Logistic Regression via SGD (epochs with partial_fit)
# Assumes Step 1 created: X_train_scaled, X_test_scaled, y_train_bal (or y_train_balanced), y_test
# ==========================================
import random

import matplotlib.pyplot as plt
import numpy as np
from scipy.special import expit  # sigmoid fallback if predict_proba is unavailable
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve,
                             classification_report, log_loss)

# Seed both global RNGs so the per-epoch shuffling below is repeatable.
np.random.seed(42)
random.seed(42)

# ---- verify/alias preprocessed arrays ----
needed = ["X_train_scaled", "X_test_scaled", "y_test"]
missing = [name for name in needed if name not in globals()]
# Accept either spelling of the balanced training labels.
if "y_train_bal" not in globals() and "y_train_balanced" in globals():
    y_train_bal = globals()["y_train_balanced"]
if "y_train_bal" not in globals():
    missing.append("y_train_bal (or y_train_balanced)")
if missing:
    raise NameError(f"❌ Preprocessed arrays not found: {missing}. Run CleanedData.ipynb first.")


# ---- ensure numpy arrays & int labels ----
def to_np(a):
    """Return `a` as a NumPy array, unwrapping a `.values` attribute when present."""
    return np.asarray(a.values if hasattr(a, "values") else a)


X_tr = to_np(X_train_scaled)
X_te = to_np(X_test_scaled)
# ravel(): labels supplied as an (n, 1) column would otherwise break np.bincount.
y_tr = to_np(y_train_bal).astype(int).ravel()
y_te = to_np(y_test).astype(int).ravel()

# Fail early on misaligned inputs instead of silently training on wrong pairs.
if X_tr.shape[0] != y_tr.shape[0] or X_te.shape[0] != y_te.shape[0]:
    raise ValueError(
        f"Row count mismatch: X_train={X_tr.shape[0]} vs y_train={y_tr.shape[0]}, "
        f"X_test={X_te.shape[0]} vs y_test={y_te.shape[0]}"
    )

print("✅ Data ready for SGD Logistic:")
print("  X_train:", X_tr.shape, " X_test:", X_te.shape)
print("  y_train:", y_tr.shape, " (counts:", np.bincount(y_tr).tolist(), ")")
print("  y_test :", y_te.shape,  " (counts:", np.bincount(y_te).tolist(), ")")

# ==========================================
# 🔁 Train Logistic Regression (SGD) with EPOCHS
# ==========================================
EPOCHS     = 20        # adjust as needed
BATCH_SIZE = 2048      # mini-batch size
CLASSES    = np.array([0, 1])

sgd = SGDClassifier(
    loss="log_loss",     # logistic regression with probabilities
    penalty="l2",
    alpha=1e-4,
    learning_rate="optimal",
    random_state=42
)

n = X_tr.shape[0]
idx = np.arange(n)
train_acc_hist, val_acc_hist = [], []
train_loss_hist, val_loss_hist = [], []

# NOTE: the "val" metrics are computed on the test split (X_te / y_te), so these
# curves are for monitoring only and should not drive model selection.
for epoch in range(1, EPOCHS + 1):
    # New sample order every epoch; idx is shuffled in place.
    np.random.shuffle(idx)
    for start in range(0, n, BATCH_SIZE):
        b = idx[start:start + BATCH_SIZE]  # slicing clips the final short batch
        # classes is mandatory on the very first call and may be repeated
        # afterwards, so it is passed every time instead of tracking a flag.
        sgd.partial_fit(X_tr[b], y_tr[b], classes=CLASSES)

    # end-of-epoch metrics
    if hasattr(sgd, "predict_proba"):
        tr_prob = sgd.predict_proba(X_tr)[:, 1]
        va_prob = sgd.predict_proba(X_te)[:, 1]
    else:
        tr_prob = expit(sgd.decision_function(X_tr))
        va_prob = expit(sgd.decision_function(X_te))

    tr_pred = (tr_prob >= 0.5).astype(int)
    va_pred = (va_prob >= 0.5).astype(int)

    tr_acc = accuracy_score(y_tr, tr_pred)
    va_acc = accuracy_score(y_te, va_pred)
    tr_loss = log_loss(y_tr, tr_prob, labels=[0, 1])
    va_loss = log_loss(y_te, va_prob, labels=[0, 1])

    train_acc_hist.append(tr_acc)
    val_acc_hist.append(va_acc)
    train_loss_hist.append(tr_loss)
    val_loss_hist.append(va_loss)

    print(f"Epoch {epoch:02d}/{EPOCHS}  train_acc: {tr_acc:.4f}  val_acc: {va_acc:.4f}  train_loss: {tr_loss:.4f}  val_loss: {va_loss:.4f}")

# ==========================================
# ✅ Final Evaluation
# ==========================================
# Probabilities from the last epoch are the final model's test-set scores.
y_prob = va_prob
y_pred = (y_prob >= 0.5).astype(int)
auc = roc_auc_score(y_te, y_prob)

print("\n" + "="*60)
print("     SGD Logistic Regression — FINAL EVALUATION      ")
print("="*60)
print("📊 Confusion Matrix:")
print(confusion_matrix(y_te, y_pred))
print("\n📈 Classification Report:")
print(classification_report(y_te, y_pred, digits=6))
print(f"🎯 Accuracy : {accuracy_score(y_te, y_pred):.6f}")
print(f"🎯 Precision: {precision_score(y_te, y_pred):.6f}")
print(f"🎯 Recall   : {recall_score(y_te, y_pred):.6f}")
print(f"🎯 F1 Score : {f1_score(y_te, y_pred):.6f}")
print(f"🎯 ROC AUC  : {auc:.6f}")
print("="*60 + "\n")

# ==========================================
# 📈 Learning Curves + ROC/PR
# ==========================================
# Plot against 1-based epoch numbers so the x axis matches the training log.
epoch_axis = range(1, len(train_acc_hist) + 1)

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(epoch_axis, train_acc_hist, label="Train Acc")
plt.plot(epoch_axis, val_acc_hist,   label="Val Acc")
plt.title("Accuracy over Epochs — SGD Logistic")
plt.xlabel("Epoch"); plt.ylabel("Accuracy"); plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epoch_axis, train_loss_hist, label="Train Loss")
plt.plot(epoch_axis, val_loss_hist,   label="Val Loss")
plt.title("Log Loss over Epochs — SGD Logistic")
plt.xlabel("Epoch"); plt.ylabel("Log Loss"); plt.legend()
plt.tight_layout(); plt.show()

plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 1)
fpr, tpr, _ = roc_curve(y_te, y_prob)
plt.plot(fpr, tpr, label=f"SGD Logistic (AUC={auc:.6f})")
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.title("ROC Curve — SGD Logistic"); plt.xlabel("FPR"); plt.ylabel("TPR"); plt.legend()

plt.subplot(1, 2, 2)
prec, rec, _ = precision_recall_curve(y_te, y_prob)
plt.plot(rec, prec, label="SGD Logistic")
plt.title("Precision–Recall — SGD Logistic"); plt.xlabel("Recall"); plt.ylabel("Precision"); plt.legend()
plt.tight_layout(); plt.show()