In [2]:
# ============================================================
# KNN Classification With and Without LDA (FULL CODE)
# ============================================================

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

# ------------------------------------------------------------
# 1. Load Dataset
# ------------------------------------------------------------
data = pd.read_csv("ML470_S9_Insurance_Data_Concept.csv")

# Feature columns
X = data[
    ["age", "children", "charges", "gender_n", "smoker_n", "region_n"]
]

# Target column (classification)
y = data["weight_condition_n"]

# ------------------------------------------------------------
# 2. Train-Test Split (70% â€“ 30%)
# ------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# ------------------------------------------------------------
# 3. Feature Scaling
# ------------------------------------------------------------
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ------------------------------------------------------------
# 4. BASELINE KNN (Without LDA)
# ------------------------------------------------------------
error_rates = []

for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    preds = knn.predict(X_test_scaled)
    error_rates.append(np.mean(preds != y_test))

best_k_baseline = error_rates.index(min(error_rates)) + 1

knn_baseline = KNeighborsClassifier(n_neighbors=best_k_baseline)
knn_baseline.fit(X_train_scaled, y_train)

y_pred_base = knn_baseline.predict(X_test_scaled)
y_prob_base = knn_baseline.predict_proba(X_test_scaled)

# ------------------------------------------------------------
# 5. Baseline Metrics
# ------------------------------------------------------------
baseline_metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_base),
    "Precision": precision_score(y_test, y_pred_base, average="weighted"),
    "Recall": recall_score(y_test, y_pred_base, average="weighted"),
    "F1-Score": f1_score(y_test, y_pred_base, average="weighted"),
    "ROC-AUC": roc_auc_score(
        y_test, y_prob_base, multi_class="ovr"
    )
}

# ------------------------------------------------------------
# 6. APPLY LDA
# ------------------------------------------------------------
lda = LinearDiscriminantAnalysis()

X_train_lda = lda.fit_transform(X_train_scaled, y_train)
X_test_lda = lda.transform(X_test_scaled)

# ------------------------------------------------------------
# 7. KNN WITH LDA
# ------------------------------------------------------------
error_rates_lda = []

for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_lda, y_train)
    preds = knn.predict(X_test_lda)
    error_rates_lda.append(np.mean(preds != y_test))

best_k_lda = error_rates_lda.index(min(error_rates_lda)) + 1

knn_lda = KNeighborsClassifier(n_neighbors=best_k_lda)
knn_lda.fit(X_train_lda, y_train)

y_pred_lda = knn_lda.predict(X_test_lda)
y_prob_lda = knn_lda.predict_proba(X_test_lda)

# ------------------------------------------------------------
# 8. LDA Model Metrics
# ------------------------------------------------------------
lda_metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_lda),
    "Precision": precision_score(y_test, y_pred_lda, average="weighted"),
    "Recall": recall_score(y_test, y_pred_lda, average="weighted"),
    "F1-Score": f1_score(y_test, y_pred_lda, average="weighted"),
    "ROC-AUC": roc_auc_score(
        y_test, y_prob_lda, multi_class="ovr"
    )
}

# ------------------------------------------------------------
# 9. PERFORMANCE COMPARISON
# ------------------------------------------------------------
comparison = pd.DataFrame(
    [baseline_metrics, lda_metrics],
    index=["KNN Without LDA", "KNN With LDA"]
)

print("Best K (Without LDA):", best_k_baseline)
print("Best K (With LDA):", best_k_lda)
print("\nModel Performance Comparison:\n")
print(comparison)


Best K (Without LDA): 15
Best K (With LDA): 11

Model Performance Comparison:

                 Accuracy  Precision    Recall  F1-Score   ROC-AUC
KNN Without LDA  0.557214   0.496378  0.557214  0.497019  0.631513
KNN With LDA     0.567164   0.522873  0.567164  0.519402  0.621534


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
