## Naive Bayes on Diabetes dataset

This notebook implements a Gaussian Naive Bayes classifier from scratch and compares its predictions with scikit-learn's `GaussianNB` on the diabetes dataset.


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB

# Read the diabetes CSV from the local data directory
df = pd.read_csv("data/diabetes (1).csv")

# The final column holds the target; every column before it is a feature
feature_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]
X, y = feature_frame.values, target_series.values

# Hold out 20% of the rows for testing, stratified on y, with a fixed seed
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Notebook cell output: shapes of the two feature matrices
X_train.shape, X_test.shape


In [None]:
class GaussianNaiveBayesScratch:
    """Very small Gaussian Naive Bayes implementation for continuous features.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training
    rows belonging to that class.

    Parameters
    ----------
    var_epsilon : float, default=1e-9
        Constant added to every per-class feature variance so that a feature
        which is constant within a class does not lead to a division by zero.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Sorted unique labels seen by ``fit``.
    class_priors_ : ndarray of shape (n_classes,)
        Relative frequency of each class in the training labels.
    means_ : ndarray of shape (n_classes, n_features)
        Per-class feature means.
    vars_ : ndarray of shape (n_classes, n_features)
        Per-class feature variances, each increased by ``var_epsilon``.
    """

    def __init__(self, var_epsilon=1e-9):
        self.var_epsilon = var_epsilon

    def fit(self, X, y):
        """Estimate class priors and per-class feature means/variances.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features; converted to a float array.
        y : array-like with n_samples entries
            Class label of each training row.

        Returns
        -------
        self
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, has no rows, or ``y`` does not have one
            entry per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got {X.ndim} dimension(s)"
            )
        if X.shape[0] == 0:
            # Without this, fit "succeeds" with zero classes and predict
            # later fails inside argmax with an unrelated message.
            raise ValueError("cannot fit on an empty training set")
        if y.shape[:1] != X.shape[:1]:
            raise ValueError(
                "X and y disagree on the number of samples: "
                f"X has {X.shape[0]} rows, y has shape {y.shape}"
            )

        self.classes_, counts = np.unique(y, return_counts=True)
        self.class_priors_ = counts / len(y)

        self.means_ = np.zeros((len(self.classes_), X.shape[1]), dtype=float)
        self.vars_ = np.zeros_like(self.means_)

        for idx, c in enumerate(self.classes_):
            X_c = X[y == c]
            self.means_[idx, :] = X_c.mean(axis=0)
            # Population variance (ddof=0) plus the smoothing constant.
            self.vars_[idx, :] = X_c.var(axis=0) + self.var_epsilon

        return self

    def _joint_log_likelihood(self, X):
        """Return log P(class) + sum over features of log N(x | mean, var).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to score; must have the feature count used in ``fit``.

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Unnormalised joint log-likelihood of each sample under each class.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or its feature count differs from the
            fitted model's.
        """
        X = np.asarray(X, dtype=float)
        n_features = self.means_.shape[1]
        # Explicit check: NumPy broadcasting would otherwise silently accept
        # a size-1 feature axis and score it against every fitted feature.
        if X.ndim != 2 or X.shape[1] != n_features:
            raise ValueError(
                f"X must have shape (n_samples, {n_features}); got {X.shape}"
            )

        # Broadcast (n_samples, 1, n_features) against (n_classes, n_features).
        log_prob = -0.5 * (
            np.log(2.0 * np.pi * self.vars_)
            + ((X[:, np.newaxis, :] - self.means_) ** 2) / self.vars_
        )
        return log_prob.sum(axis=2) + np.log(self.class_priors_)

    def predict(self, X):
        """Return, for each row of ``X``, the class with the highest joint log-likelihood.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to classify.

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels, taken from ``classes_``.
        """
        jll = self._joint_log_likelihood(X)
        indices = np.argmax(jll, axis=1)
        return self.classes_[indices]


# Smoke test: six 2-D points in two clusters, three points per class
X_toy = np.array(
    [[1.0, 2.0], [1.0, 3.0], [2.0, 2.0], [8.0, 8.0], [9.0, 8.0], [8.0, 9.0]]
)
y_toy = np.array([0] * 3 + [1] * 3)

# Fit on the toy points and score the model on those same points
gnb_scratch_toy = GaussianNaiveBayesScratch()
gnb_scratch_toy.fit(X_toy, y_toy)
y_toy_pred = gnb_scratch_toy.predict(X_toy)
toy_acc = accuracy_score(y_toy, y_toy_pred)

print("Toy accuracy (scratch NB):", toy_acc)
# The cell fails loudly if training accuracy does not exceed 0.8
assert toy_acc > 0.8
print("Sanity check passed.")


In [None]:
# --- Scratch implementation: fit on the training split, score on the test split ---
gnb_scratch = GaussianNaiveBayesScratch()
gnb_scratch.fit(X_train, y_train)
y_pred_scratch = gnb_scratch.predict(X_test)

scratch_accuracy = accuracy_score(y_test, y_pred_scratch)
print("Scratch Naive Bayes accuracy:", scratch_accuracy)
scratch_confusion = confusion_matrix(y_test, y_pred_scratch)
print("\nConfusion matrix (scratch):\n", scratch_confusion)
scratch_report = classification_report(y_test, y_pred_scratch)
print("\nClassification report (scratch):\n", scratch_report)

# --- sklearn's GaussianNB: same split, same metrics, for a side-by-side comparison ---
gnb_sklearn = GaussianNB().fit(X_train, y_train)
y_pred_sklearn = gnb_sklearn.predict(X_test)

sklearn_accuracy = accuracy_score(y_test, y_pred_sklearn)
print("\nsklearn GaussianNB accuracy:", sklearn_accuracy)
sklearn_confusion = confusion_matrix(y_test, y_pred_sklearn)
print("\nConfusion matrix (sklearn):\n", sklearn_confusion)
sklearn_report = classification_report(y_test, y_pred_sklearn)
print("\nClassification report (sklearn):\n", sklearn_report)
