In [5]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
np.set_printoptions(suppress=True)


# Load the Iris dataset; `iris` itself is kept because the reports below
# read iris.target_names from it.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 25% of the samples for testing. The split is stratified on the
# class labels and seeded so that it is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)



class GaussianNaiveBayesScratch:
    """
    Multiclass Gaussian Naïve Bayes implemented from scratch.

    Assumes features are conditionally independent given the class and
    Gaussian-distributed within each class.

    Parameters
    ----------
    eps : float, default=1e-9
        Constant added to every per-class, per-feature variance so that a
        feature that is constant within a class does not cause a division
        by zero or log(0).

    Attributes (set by ``fit``)
    ---------------------------
    classes_ : ndarray
        Sorted unique class labels seen in ``y``.
    class_priors_ : dict
        Maps each class label to its empirical prior P(y = c).
    mean_ : dict
        Maps each class label to the per-feature mean vector.
    var_ : dict
        Maps each class label to the per-feature variance vector
        (MLE estimate, ddof=0, plus ``eps``).
    """

    def __init__(self, eps=1e-9):
        self.eps = eps

    def fit(self, X, y):
        """
        Estimate class priors and per-class feature means/variances.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of length n_samples

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from
            the number of labels.
        """
        # Coerce so that lists / tuples work as well as ndarrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()

        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got ndim={X.ndim}"
            )
        if X.shape[0] == 0:
            raise ValueError("Cannot fit on an empty training set")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} rows but y has {y.shape[0]} labels"
            )

        self.classes_ = np.unique(y)

        # Priors P(y): fraction of training samples carrying each label.
        self.class_priors_ = {
            c: np.mean(y == c) for c in self.classes_
        }

        # Mean and variance per class & feature.
        self.mean_ = {}
        self.var_ = {}
        for c in self.classes_:
            X_c = X[y == c]
            self.mean_[c] = X_c.mean(axis=0)
            # MLE variance (ddof=0); eps keeps it strictly positive.
            self.var_[c] = X_c.var(axis=0) + self.eps

        return self

    def _joint_log_likelihood(self, X):
        """
        For each sample x and class c, compute:
        log P(y=c) + sum_j log N(x_j | mu_cj, var_cj)

        Returns an array of shape (n_samples, n_classes) whose columns
        follow the order of ``self.classes_``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or its feature count differs from the
            one seen during ``fit``.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got ndim={X.ndim}"
            )
        # Guard against silent broadcasting (e.g. a single-column X against
        # multi-feature means), which would yield meaningless scores.
        n_fitted = np.shape(self.mean_[self.classes_[0]])[0]
        if X.shape[1] != n_fitted:
            raise ValueError(
                f"X has {X.shape[1]} features but the model was fitted "
                f"with {n_fitted}"
            )

        jll = []
        for c in self.classes_:
            mu = self.mean_[c]
            var = self.var_[c]
            # Per-feature log Gaussian density:
            # -0.5*log(2*pi*var) - (x-mu)^2/(2*var)
            log_prob = -0.5 * (np.log(2.0 * np.pi * var) + ((X - mu) ** 2) / var)
            # Sum over features, then add the log prior.
            jll.append(log_prob.sum(axis=1) + np.log(self.class_priors_[c]))
        return np.column_stack(jll)

    def predict(self, X):
        """
        Return, for every row of ``X``, the class label with the highest
        joint log likelihood.
        """
        jll = self._joint_log_likelihood(X)
        idx = np.argmax(jll, axis=1)
        return self.classes_[idx]

# Fit the from-scratch classifier on the training split, then score it
# on the held-out test split.
gnb_scratch = GaussianNaiveBayesScratch()
gnb_scratch.fit(X_train, y_train)
y_pred_scratch = gnb_scratch.predict(X_test)
acc_scratch = accuracy_score(y_true=y_test, y_pred=y_pred_scratch)

# Report accuracy, confusion matrix and per-class metrics.
print("=== 1(i) Gaussian Naïve Bayes (Scratch) ===")
print(f"Accuracy: {acc_scratch:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred_scratch))
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_scratch,
        target_names=iris.target_names,
    )
)

=== 1(i) Gaussian Naïve Bayes (Scratch) ===
Accuracy: 0.9211
Confusion Matrix:
 [[12  0  0]
 [ 0 12  1]
 [ 0  2 11]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.86      0.92      0.89        13
   virginica       0.92      0.85      0.88        13

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38



In [4]:
!pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.7 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.7 MB 1.7 MB/s eta 0:00:05
   --- ------------------------------------ 0.8/8.7 MB 1.7 MB/s eta 0:00:05
   ---- ----------------------------------- 1.0/8.7 MB 1.6 MB/s eta 0:00:05
   ------- -------------------------------- 1.6/8.7 MB 1.6 MB/s eta 0:00:05
   --------- ------------------------------ 2.1/8.

In [6]:
# Reference model: scikit-learn's GaussianNB trained and scored on the
# same train/test split as the from-scratch version.
gnb = GaussianNB().fit(X_train, y_train)
y_pred_lib = gnb.predict(X_test)
acc_lib = accuracy_score(y_true=y_test, y_pred=y_pred_lib)

# Report accuracy, confusion matrix and per-class metrics.
print("\n=== 1(ii) GaussianNB (scikit-learn) ===")
print(f"Accuracy: {acc_lib:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred_lib))
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_lib,
        target_names=iris.target_names,
    )
)


=== 1(ii) GaussianNB (scikit-learn) ===
Accuracy: 0.9211
Confusion Matrix:
 [[12  0  0]
 [ 0 12  1]
 [ 0  2 11]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.86      0.92      0.89        13
   virginica       0.92      0.85      0.88        13

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38



In [7]:
# Standardise the features, then classify with k-nearest neighbours.
# Scaling lives inside the pipeline so it is re-fitted on each CV fold.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier()),
    ]
)

# Hyper-parameters to search over (keys address the "knn" pipeline step).
param_grid = {
    # K = 1..30
    "knn__n_neighbors": list(range(1, 31)),
    "knn__weights": ["uniform", "distance"],
    # Minkowski power: 1 = Manhattan, 2 = Euclidean
    "knn__p": [1, 2],
}

# Exhaustive 5-fold cross-validated search scored by accuracy; the best
# configuration is refitted on the whole training split.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    refit=True,
)
grid.fit(X_train, y_train)

print("\n=== 2. GridSearchCV for KNN ===")
print("Best Params:", grid.best_params_)
print(f"Best CV Accuracy: {grid.best_score_:.4f}")

# Score the refitted best pipeline on the held-out test split.
best_knn = grid.best_estimator_
y_pred_knn = best_knn.predict(X_test)
acc_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)

print(f"Test Accuracy with tuned KNN: {acc_knn:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred_knn))
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_knn,
        target_names=iris.target_names,
    )
)


=== 2. GridSearchCV for KNN ===
Best Params: {'knn__n_neighbors': 16, 'knn__p': 1, 'knn__weights': 'uniform'}
Best CV Accuracy: 0.9644
Test Accuracy with tuned KNN: 0.9474
Confusion Matrix:
 [[12  0  0]
 [ 0 12  1]
 [ 0  1 12]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.92      0.92      0.92        13
   virginica       0.92      0.92      0.92        13

    accuracy                           0.95        38
   macro avg       0.95      0.95      0.95        38
weighted avg       0.95      0.95      0.95        38

