<a href="https://colab.research.google.com/github/Girisha-Malni-builds01/Girisha-AI-Practice-Notebook/blob/Supervised-AI-Algorithms/KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, log_loss, roc_auc_score
from sklearn.datasets import fetch_california_housing, make_classification
from sklearn.inspection import PartialDependenceDisplay
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

np.random.seed(42)

def plot_learning_curve(xs, train_metric, val_metric, xlabel, title):
    plt.figure()
    plt.plot(xs, train_metric, marker="o", label="train")
    plt.plot(xs, val_metric, marker="o", label="valid")
    plt.xlabel(xlabel); plt.ylabel("metric"); plt.title(title); plt.legend(); plt.grid(True)
    plt.show()

def heatmap(Z, xticks, yticks, xlabel, ylabel, title):
    plt.figure()
    im = plt.imshow(Z, aspect="auto", origin="lower")
    plt.colorbar(im)
    plt.xticks(ticks=np.arange(len(xticks)), labels=[str(x) for x in xticks], rotation=0)
    plt.yticks(ticks=np.arange(len(yticks)), labels=[str(y) for y in yticks])
    plt.xlabel(xlabel); plt.ylabel(ylabel); plt.title(title)
    plt.show()

def describe_split(X, y, name=""):
    n = len(y)
    pos = np.mean(y) if set(np.unique(y)) <= {0,1} else None
    print(f"{name} shape={X.shape}, n={n}", (f", positive_rate={pos:.3f}" if pos is not None else ""))


In [2]:
# @title Regression data
data = fetch_california_housing(as_frame=True)
X_reg = data.data
y_reg = data.target  # median house value (in 100k USD)

Xtr, Xte, ytr, yte = train_test_split(X_reg, y_reg, test_size=0.25, random_state=42)
describe_split(Xtr, ytr, "Train (reg)")
describe_split(Xte, yte, "Test (reg)")


Train (reg) shape=(15480, 8), n=15480 
Test (reg) shape=(5160, 8), n=5160 


In [3]:
# @title Good baseline regressor
gbr_good = GradientBoostingRegressor(
    loss="squared_error",
    learning_rate=0.05,   # small ν
    n_estimators=1200,    # more trees
    max_depth=3,          # shallow trees
    subsample=0.7,        # stochastic gradient boosting
    random_state=42
)
gbr_good.fit(Xtr, ytr)

pred_tr = gbr_good.predict(Xtr)
pred_te = gbr_good.predict(Xte)

rmse_tr = mean_squared_error(ytr, pred_tr, squared=False)
rmse_te = mean_squared_error(yte, pred_te, squared=False)
r2_tr   = r2_score(ytr, pred_tr)
r2_te   = r2_score(yte, pred_te)

print(f"Good baseline → RMSE train={rmse_tr:.3f}, test={rmse_te:.3f} | R2 train={r2_tr:.3f}, test={r2_te:.3f}")


TypeError: got an unexpected keyword argument 'squared'

In [4]:
# @title Bad hyperparameters to show overfitting
gbr_bad = GradientBoostingRegressor(
    learning_rate=0.5,    # too large
    n_estimators=100,     # too few
    max_depth=8,          # too deep
    subsample=1.0,        # no stochasticity
    random_state=42
)
gbr_bad.fit(Xtr, ytr)

pred_tr_b = gbr_bad.predict(Xtr)
pred_te_b = gbr_bad.predict(Xte)

rmse_tr_b = mean_squared_error(ytr, pred_tr_b, squared=False)
rmse_te_b = mean_squared_error(yte, pred_te_b, squared=False)
r2_tr_b   = r2_score(ytr, pred_tr_b)
r2_te_b   = r2_score(yte, pred_te_b)

print(f"Bad setup → RMSE train={rmse_tr_b:.3f}, test={rmse_te_b:.3f} | R2 train={r2_tr_b:.3f}, test={r2_te_b:.3f}")


KeyboardInterrupt: 

In [None]:
# @title Parallel experiment: heatmap (lower RMSE is better)
lrs = [0.01, 0.05, 0.1, 0.2]
Ns  = [200, 400, 800, 1200]
Z = np.zeros((len(lrs), len(Ns)))

for i, lr in enumerate(lrs):
    for j, N in enumerate(Ns):
        mdl = GradientBoostingRegressor(
            learning_rate=lr, n_estimators=N, max_depth=3, subsample=0.7, random_state=42
        ).fit(Xtr, ytr)
        Z[i, j] = mean_squared_error(yte, mdl.predict(Xte), squared=False)

heatmap(Z, Ns, lrs, xlabel="n_estimators", ylabel="learning_rate", title="Test RMSE heatmap (regression)")


In [None]:
# @title Learning curves over boosting rounds
gbr_curve = GradientBoostingRegressor(
    learning_rate=0.05, n_estimators=1500, max_depth=3, subsample=0.7, random_state=42
).fit(Xtr, ytr)

train_rmse, val_rmse = [], []
for yhat_tr, yhat_te in zip(gbr_curve.staged_predict(Xtr), gbr_curve.staged_predict(Xte)):
    train_rmse.append(mean_squared_error(ytr, yhat_tr, squared=False))
    val_rmse.append(mean_squared_error(yte, yhat_te, squared=False))

plot_learning_curve(
    np.arange(1, len(train_rmse)+1),
    train_rmse, val_rmse,
    xlabel="n_estimators", title="RMSE vs. boosting rounds (regression)"
)

best_round = int(np.argmin(val_rmse) + 1)
print("Early stopping suggestion → best n_estimators:", best_round)


In [None]:
# @title Feature importance and PDPs
imp = pd.Series(gbr_good.feature_importances_, index=X_reg.columns).sort_values(ascending=False)
print(imp.head(10))

plt.figure(); imp.plot(kind="bar"); plt.title("Feature importance (GBR)"); plt.tight_layout(); plt.show()

features_to_plot = [imp.index[0], imp.index[1]]
PartialDependenceDisplay.from_estimator(gbr_good, X_reg, features_to_plot)
plt.show()


In [None]:
# @title Classification data
X_cls, y_cls = make_classification(
    n_samples=20000, n_features=20, n_informative=8, n_redundant=4,
    class_sep=1.2, weights=[0.6, 0.4], random_state=42
)
Xtrc, Xtec, ytrc, ytec = train_test_split(X_cls, y_cls, test_size=0.25, random_state=42, stratify=y_cls)
describe_split(Xtrc, ytrc, "Train (cls)")
describe_split(Xtec, ytec, "Test (cls)")


In [None]:
# @title Good baseline classifier
gbc_good = GradientBoostingClassifier(
    learning_rate=0.05, n_estimators=800, max_depth=3, subsample=0.7, random_state=42
).fit(Xtrc, ytrc)

proba_tr = gbc_good.predict_proba(Xtrc)[:,1]
proba_te = gbc_good.predict_proba(Xtec)[:,1]

acc_tr = accuracy_score(ytrc, (proba_tr>0.5).astype(int))
acc_te = accuracy_score(ytec, (proba_te>0.5).astype(int))
ll_tr  = log_loss(ytrc, proba_tr)
ll_te  = log_loss(ytec, proba_te)
auc_tr = roc_auc_score(ytrc, proba_tr)
auc_te = roc_auc_score(ytec, proba_te)

print(f"Good baseline → acc train={acc_tr:.3f}, test={acc_te:.3f} | logloss train={ll_tr:.3f}, test={ll_te:.3f} | AUC train={auc_tr:.3f}, test={auc_te:.3f}")


In [None]:
# @title Parallel experiment: heatmap (lower logloss is better)
lrs = [0.01, 0.05, 0.1, 0.2]
Ns  = [200, 400, 800, 1200]
Z = np.zeros((len(lrs), len(Ns)))

for i, lr in enumerate(lrs):
    for j, N in enumerate(Ns):
        mdl = GradientBoostingClassifier(
            learning_rate=lr, n_estimators=N, max_depth=3, subsample=0.7, random_state=42
        ).fit(Xtrc, ytrc)
        Z[i, j] = log_loss(ytec, mdl.predict_proba(Xtec)[:,1])

heatmap(Z, Ns, lrs, xlabel="n_estimators", ylabel="learning_rate", title="Test log loss heatmap (classification)")


In [None]:
# @title Learning curves over boosting rounds (classification)
gbc_curve = GradientBoostingClassifier(
    learning_rate=0.05, n_estimators=1500, max_depth=3, subsample=0.7, random_state=42
).fit(Xtrc, ytrc)

train_ll, val_ll = [], []
train_auc, val_auc = [], []

for proba_tr_i, proba_te_i in zip(gbc_curve.staged_predict_proba(Xtrc), gbc_curve.staged_predict_proba(Xtec)):
    p_tr = proba_tr_i[:,1]; p_te = proba_te_i[:,1]
    train_ll.append(log_loss(ytrc, p_tr))
    val_ll.append(log_loss(ytec, p_te))
    train_auc.append(roc_auc_score(ytrc, p_tr))
    val_auc.append(roc_auc_score(ytec, p_te))

plot_learning_curve(np.arange(1, len(train_ll)+1), train_ll, val_ll, "n_estimators", "Log loss vs. boosting rounds (cls)")
plot_learning_curve(np.arange(1, len(train_auc)+1), train_auc, val_auc, "n_estimators", "AUC vs. boosting rounds (cls)")

best_round_ll = int(np.argmin(val_ll) + 1)
print("Early stopping suggestion (log loss) → best n_estimators:", best_round_ll)


In [None]:
# @title Feature importance & PDPs (classification)
imp_c = pd.Series(gbc_good.feature_importances_, index=[f"f{i}" for i in range(X_cls.shape[1])]).sort_values(ascending=False)
print(imp_c.head(10))

plt.figure(); imp_c.plot(kind="bar"); plt.title("Feature importance (GBC)"); plt.tight_layout(); plt.show()

top2 = [int(imp_c.index[0][1:]), int(imp_c.index[1][1:])]
PartialDependenceDisplay.from_estimator(gbc_good, X_cls, [top2[0], top2[1]])
plt.show()
