In [7]:
import numpy as np
import pandas as pd
from statistics import mean, variance
import math
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [3]:
def bayesian_univariate_classifier(data, test_sample):
    """
    Classify a scalar sample with a univariate Gaussian Bayes classifier.

    Each class is modelled as a 1-D Gaussian whose mean and (sample) variance
    are estimated from the training data; the class prior is the class
    frequency. The predicted class maximises prior * likelihood.

    data: sized iterable of (feature, label) pairs.
    test_sample: the scalar feature value to classify.

    Returns the predicted class label. Prints the per-class posterior score
    (likelihood * prior) and the prediction.

    Raises ValueError if `data` is empty.
    """
    total_samples = len(data)
    if total_samples == 0:
        raise ValueError("data must contain at least one (feature, label) pair")

    # Lower bound on the variance so a degenerate class (all values equal, or
    # a single sample) does not cause a division by zero.
    min_variance = 1e-6

    ## 1. Group the feature values by class label (single pass)

    features_by_class = {}
    for feature, label in data:
        features_by_class.setdefault(label, []).append(feature)

    y = sorted(features_by_class)

    ## 2. Per class: prior, Gaussian parameters, likelihood and posterior

    posterior_scores = {}  # {class_label: likelihood * prior}, used for display
    log_scores = {}        # {class_label: log(likelihood * prior)}, used for argmax

    for cur_class in y:
        elements = features_by_class[cur_class]

        # Prior P(Class): relative class frequency
        prior = len(elements) / total_samples

        mu = mean(elements)
        # statistics.variance needs at least two points; a single-sample class
        # has no measurable spread, so it falls back to the variance floor.
        class_variance = variance(elements) if len(elements) > 1 else 0.0
        sigma2 = class_variance if class_variance > min_variance else min_variance

        # Gaussian Likelihood (PDF): 1/sqrt(2*pi*sigma^2) * exp(-((x-mu)^2)/(2*sigma^2))
        exponent = -((test_sample - mu) ** 2) / (2 * sigma2)
        denominator = math.sqrt(2 * math.pi * sigma2)
        likelihood = (1 / denominator) * math.exp(exponent)

        posterior_scores[cur_class] = likelihood * prior
        # The same quantity in log space. exp() underflows to 0.0 for samples
        # far from every class mean, which would make all linear-space scores
        # tie at zero; the log score stays finite and keeps the ordering.
        log_scores[cur_class] = exponent - math.log(denominator) + math.log(prior)

    ## 3. Final Classification (ties resolve to the smallest label)

    predicted_class = max(y, key=log_scores.get)

    print(f"--- Classification Results for x = {test_sample} ---")
    for cur_class in y:
        print(f"Class {cur_class}: Score = {posterior_scores[cur_class]:.4e}")
    print(f"Predicted Class: {predicted_class}")

    return predicted_class



In [4]:

# Toy 1-D training set of (feature, label) pairs, grouped by class:
# class 0 clusters around ~11, class 1 around ~20.
training_data = [
    *((feature, 0) for feature in (10, 12, 9, 11, 13.5)),
    *((feature, 1) for feature in (20, 18, 21, 19.5, 22)),
]

# Classify a new sample that lies between the two clusters.
test_value = 15.0
classification_result = bayesian_univariate_classifier(
    training_data,
    test_value,
)

--- Classification Results for x = 15.0 ---
Class 0: Score = 9.4372e-03
Class 1: Score = 4.6064e-04
Predicted Class: 0


In [16]:
from numpy.linalg import det, inv

def multivariate_gaussian_pdf(x, mu, Sigma):
    """
    Calculates the Multivariate Gaussian Probability Density Function (PDF).

    P(x|Class) = 1 / (sqrt((2*pi)^D * |Sigma|)) * exp(-0.5 * (x-mu)^T * Sigma^-1 * (x-mu))

    x: 1-D feature vector of length D.
    mu: 1-D mean vector of length D.
    Sigma: (D, D) covariance matrix (a scalar variance is accepted for D == 1).
        The caller's array is never modified.

    Returns the density at x as a NumPy float.

    Raises numpy.linalg.LinAlgError if Sigma does not have a positive
    determinant even after regularization.
    """
    x = np.asarray(x, dtype=float)
    mu = np.asarray(mu, dtype=float)
    # Work on a private float copy: the regularization below must not leak
    # into the caller's matrix, and an in-place float add would fail on an
    # integer-typed covariance.
    Sigma = np.atleast_2d(np.array(Sigma, dtype=float))

    D = len(x)  # Dimension

    # Vector difference
    x_minus_mu = x - mu

    # Sign and log of the determinant. Working with log|Sigma| avoids the
    # under/overflow that a plain determinant suffers in higher dimensions.
    sign, log_det = np.linalg.slogdet(Sigma)

    # Handle singularity (e.g., if one class has too few samples)
    if sign <= 0:
        # Add a small ridge (diagonal) regularization term
        Sigma = Sigma + np.eye(D) * 1e-6
        sign, log_det = np.linalg.slogdet(Sigma)
        if sign <= 0:
            raise np.linalg.LinAlgError(
                "covariance matrix has a non-positive determinant even after regularization"
            )

    # Squared Mahalanobis distance (x-mu)^T * Sigma^-1 * (x-mu); solving the
    # linear system is more stable than forming the explicit inverse.
    mahalanobis_sq = x_minus_mu @ np.linalg.solve(Sigma, x_minus_mu)

    # log PDF = -0.5 * (D*log(2*pi) + log|Sigma| + Mahalanobis^2)
    log_pdf = -0.5 * (D * np.log(2 * np.pi) + log_det + mahalanobis_sq)

    return np.exp(log_pdf)

def bayesian_multivariate_classifier(X_train, y_train, test_sample):
    """
    Performs Multivariate Gaussian Classification on a training set.

    Each class is modelled as a multivariate Gaussian with its own mean vector
    and covariance matrix; the prior is the class frequency. The predicted
    class maximises prior * likelihood.

    X_train: Feature matrix (DataFrame or 2-D array), one row per sample.
    y_train: Label vector (Series or 1-D array), positionally aligned with
        the rows of X_train.
    test_sample: A single feature vector (numpy array) to classify, with
        features in the same order as the columns of X_train.

    Returns the predicted class label. Prints the per-class posterior score
    and the prediction.

    Raises ValueError if X_train is not 2-D, if the training set is empty, or
    if X_train and y_train differ in length.
    """
    # Convert once to plain arrays so that DataFrames/Series and NumPy arrays
    # are handled identically (the docstring promises both).
    X_arr = np.asarray(X_train, dtype=float)
    y_arr = np.asarray(y_train)

    if X_arr.ndim != 2:
        raise ValueError("X_train must be 2-dimensional (samples x features)")
    total_samples = len(y_arr)
    if total_samples == 0:
        raise ValueError("training set is empty")
    if len(X_arr) != total_samples:
        raise ValueError("X_train and y_train must have the same number of rows")

    y = np.unique(y_arr)  # sorted unique class labels

    posterior_scores = {}

    for cur_class in y:
        # 1. Separate Data
        X_c = X_arr[y_arr == cur_class]

        # 2. Calculate Prior P(Class)
        prior_p = len(X_c) / total_samples

        # 3. Calculate Class Parameters
        # Calculate Mean Vector (mu)
        mu = X_c.mean(axis=0)

        # Calculate Covariance Matrix (Sigma).
        # bias=True normalises by N (population / maximum-likelihood estimate)
        # rather than N-1. rowvar=False tells np.cov that rows are samples and
        # columns are features. atleast_2d keeps the single-feature case a
        # (1, 1) matrix instead of a 0-d scalar.
        Sigma = np.atleast_2d(np.cov(X_c, rowvar=False, bias=True))

        # 4. Compute Likelihood P(x|Class)
        likelihood = multivariate_gaussian_pdf(test_sample, mu, Sigma)

        # 5. Compute Posterior Score
        posterior_scores[cur_class] = likelihood * prior_p

    # 6. Final Classification
    predicted_class = max(posterior_scores, key=posterior_scores.get)

    # Output results
    print(f"--- Classification Results for test sample {test_sample} ---")
    for cur_class in y:
        score = posterior_scores[cur_class]
        print(f"Class {cur_class}: Posterior Score (L*P) = {score:.4e}")
    print(f"Predicted Class: {predicted_class}")

    return predicted_class

In [17]:
# Load the diabetes table from the local dataset folder and preview
# its first five rows as plain text.
df = pd.read_csv(filepath_or_buffer='./Heart Disease dataset/diabetes.csv')
print(df.head(n=5))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [18]:
# Features are every column except the target; the target is 'Outcome'.
X = df.drop(columns='Outcome')
y = df.loc[:, 'Outcome']

# Hypothetical patient to classify. Values are positional and must follow
# the column order of X:
#   Pregnancies, Glucose, BloodPressure, SkinThickness,
#   Insulin, BMI, DiabetesPedigreeFunction, Age
test_vector = np.array(
    [2.5, 110, 70, 25, 80, 30, 0.4, 35],
    dtype=float,
)

# Run the multivariate Gaussian Bayes classifier on that patient.
classification_result = bayesian_multivariate_classifier(
    X,
    y,
    test_vector,
)

--- Classification Results for test sample [  2.5 110.   70.   25.   80.   30.    0.4  35. ] ---
Class 0: Posterior Score (L*P) = 1.0609e-11
Class 1: Posterior Score (L*P) = 5.8347e-13
Predicted Class: 0
