In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# ------------------------------
# Step 1: Read and Preprocess Data
# ------------------------------
df = pd.read_csv("pima-indians-diabetes.csv")

# Separate features and label
features = df.drop(columns=["1"])
labels = df["1"]

# Replace 0s with column mean for selected columns (exclude columns where 0 is valid like pregnancies)
cols_to_replace_zero = ["6", "148", "72", "35", "0", "33.6", "0.627", "50"]
for col in cols_to_replace_zero:
    col_mean = features[features[col] != 0][col].mean()
    features[col] = features[col].replace(0, col_mean)

X = features.to_numpy()
y = labels.to_numpy()

# ------------------------------
# Step 2: Helper functions
# ------------------------------

def gaussian_prob(x, mean, std):
    """Probability density of normal distribution"""
    if std == 0:
        return 1.0 if x == mean else 0.0  # Avoid divide by zero
    exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
    return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

def compute_stats(X, y):
    """Compute P(y), mean and std for each feature given class"""
    stats = {}
    for cls in [0, 1]:
        X_cls = X[y == cls]
        cls_mean = X_cls.mean(axis=0)
        cls_min = X_cls.min(axis=0)
        cls_max = X_cls.max(axis=0)
        cls_std = (cls_max - cls_min) / 6
        stats[cls] = {
            "mean": cls_mean,
            "std": cls_std
        }
    prior_0 = np.mean(y == 0)
    prior_1 = np.mean(y == 1)
    return stats, prior_0, prior_1

def predict(X_test, stats, prior_0, prior_1):
    """Use Gaussian Naive Bayes with estimated stats to classify"""
    preds = []
    for x in X_test:
        prob_0 = prior_0
        prob_1 = prior_1
        for i in range(len(x)):
            prob_0 *= gaussian_prob(x[i], stats[0]['mean'][i], stats[0]['std'][i])
            prob_1 *= gaussian_prob(x[i], stats[1]['mean'][i], stats[1]['std'][i])
        pred = 1 if prob_1 > prob_0 else 0
        preds.append(pred)
    return np.array(preds)

# ------------------------------
# Step 3: Cross-Validation
# ------------------------------

accuracies = []
conf_matrix_total = np.array([[0, 0], [0, 0]])

for _ in range(5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    
    stats, prior_0, prior_1 = compute_stats(X_train, y_train)
    y_pred = predict(X_test, stats, prior_0, prior_1)
    
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    accuracies.append(acc)
    conf_matrix_total += cm

# ------------------------------
# Step 4: Output Final Metrics
# ------------------------------
average_accuracy = np.mean(accuracies)
average_conf_matrix = conf_matrix_total // 5  # Integer division

print("Average Accuracy over 5 runs:", average_accuracy)
print("Average Confusion Matrix:")
print(average_conf_matrix)


Average Accuracy over 5 runs: 0.7597402597402597
Average Confusion Matrix:
[[86 14]
 [23 31]]
