In [None]:
# ✅ STEP 1: Download and run notebook from GitHub (no Google Drive required)

import requests
import nbformat
from IPython import get_ipython

def run_notebook_from_github(url, timeout=30):
    """
    Download a Jupyter notebook from a raw GitHub URL and run its code cells
    in the current IPython session.

    Progress and failures are reported with print(); nothing is raised to the
    caller. A download or parse failure aborts the run. A failing cell is
    reported and execution continues with the next cell.

    NOTE: the notebook's code is executed verbatim in this session. Only use
    URLs of notebooks you trust.

    Parameters:
    url (str): Raw GitHub URL to a .ipynb notebook file
    timeout (float): Seconds to wait for the server before giving up

    Returns:
    None
    """
    print("Downloading notebook from GitHub...")
    try:
        # Without a timeout requests can wait forever on a stalled connection.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to download notebook: {e}")
        return

    print("📖 Parsing notebook content...")
    try:
        notebook = nbformat.reads(response.text, as_version=4)
    except Exception as e:
        # Kept broad on purpose: any parse problem is reported, not raised.
        print(f"Failed to parse notebook: {e}")
        return

    ipython = get_ipython()
    if ipython is None:
        # get_ipython() returns None when no IPython shell is active.
        print("No active IPython session; cannot run notebook cells.")
        return

    print("Running notebook cells...\n")

    failed = 0
    for i, cell in enumerate(notebook.cells):
        if cell.cell_type != 'code':
            continue

        print(f"▶  Executing cell [{i + 1}]...")
        result = ipython.run_cell(cell.source)

        # run_cell does not raise when the cell fails; the exception is stored
        # on the returned result, so it has to be inspected explicitly.
        error = result.error_before_exec
        if error is None:
            error = result.error_in_exec
        if error is not None:
            failed += 1
            print(f" Error in cell [{i + 1}]: {error}")

    print("\n All executable cells have been processed.")
    if failed:
        print(f" {failed} cell(s) raised an error.")

# Raw GitHub URL of the notebook to execute (replace with your own)
notebook_url = "https://raw.githubusercontent.com/Hushpuppyzac/DLI-Assignment/main/CleanedData.ipynb"

# Download the notebook and run its code cells in this session
run_notebook_from_github(url=notebook_url)


In [None]:
# ==========================================
# ✅ STEP 2 — Decision Tree (YOUR PART)
# Assumes CleanedData.ipynb already created:
#   X_train_scaled, X_test_scaled, y_train_bal/y_train_balanced, y_test
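# Wraps the tree in cross-validated probability calibration so predict_proba
# yields more distinct scores than a single shallow tree would; with only a
# few distinct scores the ROC/PR curves degenerate into a few straight segments.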
# ==========================================
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns, random
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve, classification_report)
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV

# Seed NumPy's and the standard library's global random generators.
np.random.seed(42)
random.seed(42)

# ---------- verify the preprocessed variables exist ----------
# Collect every required name that is absent from the session namespace.
needed_names = ["X_train_scaled", "X_test_scaled", "y_test"]
missing = list(filter(lambda name: name not in globals(), needed_names))

# The training labels may be exposed under either name; settle on y_train_bal
# and only report them as missing when neither spelling is present.
if "y_train_bal" not in globals():
    if "y_train_balanced" in globals():
        y_train_bal = globals()["y_train_balanced"]
    else:
        missing.append("y_train_bal (or y_train_balanced)")

# Stop here with one message listing everything that is absent.
if missing:
    raise NameError(f"❌ Preprocessed arrays not found: {missing}.\nRun CleanedData.ipynb in Step 1.")

# ---------- type safety for labels ----------
# Cast both label vectors to plain integers.
y_test = y_test.astype(int)
y_train_bal = y_train_bal.astype(int)

# Summary of what the model will be trained and evaluated on.
print("✅ Data ready:")
print("  X_train_scaled:", X_train_scaled.shape)
print("  X_test_scaled :", X_test_scaled.shape)

# Per-class counts are best-effort: when np.bincount cannot handle the labels,
# only the shape is printed.
try:
    binc = np.bincount(y_train_bal)
except Exception:
    print("  y_train_bal   :", y_train_bal.shape)
else:
    print("  y_train_bal   :", y_train_bal.shape, "(counts:", binc.tolist(), ")")

try:
    binc2 = np.bincount(y_test)
except Exception:
    print("  y_test        :", y_test.shape)
else:
    print("  y_test        :", y_test.shape, "(counts:", binc2.tolist(), ")")

# ==========================================
# ✅ Train Decision Tree with calibrated probabilities
#    No class_weight is set here; the training labels are expected to arrive
#    already balanced from Step 1 (not verifiable from this cell).
# ==========================================
print("\n🧠 Training Decision Tree (calibrated probabilities)...")

# Depth-limited tree wrapped in 3-fold isotonic calibration.
# 'sigmoid' is the faster alternative for `method`.
base_dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_cal = CalibratedClassifierCV(base_dt, cv=3, method='isotonic').fit(X_train_scaled, y_train_bal)
print("✅ Training complete.")

# Scores for the second class column (presumably the positive class, label 1)
# and the corresponding hard predictions.
y_prob_dt = dt_cal.predict_proba(X_test_scaled)[:, 1]
y_pred_dt = dt_cal.predict(X_test_scaled)

# Diagnostic: how many distinct probability values the model produces.
vals = np.unique(y_prob_dt.round(6))
print(f"\n🔎 Probabilities: unique values = {len(vals)}; sample -> {vals[:12]}")

# ==========================================
# ✅ Metrics
# ==========================================
# Label-based scores are computed from the hard predictions; ROC AUC is
# computed from the predicted probabilities.
acc, prec, rec, f1 = (
    metric(y_test, y_pred_dt)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
rocauc = roc_auc_score(y_test, y_prob_dt)
cm = confusion_matrix(y_test, y_pred_dt)

# Report header followed by the raw confusion matrix.
print(
    "\n" + "="*60,
    "             DECISION TREE EVALUATION             ",
    "="*60,
    "📊 Confusion Matrix:",
    cm,
    sep="\n",
)

# Per-class breakdown.
print("\n📈 Classification Report:")
print(classification_report(y_test, y_pred_dt, digits=6))

# Headline numbers, one per line.
print(
    f"🎯 Accuracy : {acc:.6f}",
    f"🎯 Precision: {prec:.6f}",
    f"🎯 Recall   : {rec:.6f}",
    f"🎯 F1 Score : {f1:.6f}",
    f"🎯 ROC AUC  : {rocauc:.6f}",
    sep="\n",
)
print("="*60 + "\n")

# ==========================================
# ✅ Visuals: ROC + PR + Confusion Matrix Heatmap
# ==========================================
# Every plot is drawn on an explicitly created Axes. Relying on the implicit
# "current axes" is fragile: on backends where plt.show() does not close the
# figure, the heatmap would be painted over the last subplot.
fig_curves, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(14, 5))

# ROC
fpr, tpr, _ = roc_curve(y_test, y_prob_dt)
ax_roc.plot(fpr, tpr, label=f"Decision Tree (AUC={rocauc:.6f})")
ax_roc.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax_roc.set_title("ROC Curve")
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.legend()

# PR
prec_curve, rec_curve, _ = precision_recall_curve(y_test, y_prob_dt)
ax_pr.plot(rec_curve, prec_curve, label="Decision Tree")
ax_pr.set_title("Precision-Recall Curve")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.legend()

fig_curves.tight_layout()
plt.show()

# Confusion Matrix heatmap, on its own figure
fig_cm, ax_cm = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title("Confusion Matrix — Decision Tree")
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("Actual")
plt.show()

# ✅ Verdict
# "Target met" is only claimed when a target exists and is actually reached.
# Define F1_TARGET (a float) earlier in the session to enable the comparison;
# without it the score is reported with no verdict.
f1_target = globals().get("F1_TARGET")
if f1_target is None:
    print(f"ℹ️ Achieved F1 = {f1:.6f} using Decision Tree (no F1_TARGET defined; target not evaluated).")
elif f1 >= f1_target:
    print(f"✅ Achieved F1 = {f1:.6f} using Decision Tree — Target met.")
else:
    print(f"❌ Achieved F1 = {f1:.6f} using Decision Tree — below target {f1_target:.6f}.")
