### Gaussian Naive Bayes
--------------

In the following code, I have implemented a **Gaussian Naive Bayes classifier** using the **Wine dataset**. The data are loaded from **Google Drive**, split into **training and test sets with stratification**, and **normalized using StandardScaler**. For each class, I compute the per-feature **mean** and **variance** and the class **prior probability**, assuming that the features follow a **Gaussian distribution** and are **conditionally independent** given the class. For prediction, I apply the **MAP (Maximum A Posteriori) rule** in **logarithmic form** to avoid **numerical underflow** issues. Finally, I evaluate the model using **accuracy** and a **classification report**, and verify the correctness of the implementation by comparing the results with **sklearn’s GaussianNB**.


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from google.colab import drive

#----------------------------------------------------
#LOAD
#----------------------------------------------------

# Make Google Drive visible under /content/drive so the cached CSV can be used.
drive.mount('/content/drive')

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

file_path = '/content/drive/MyDrive/wine_dataset.csv'

# Use the CSV on Drive when present; otherwise build it from sklearn's
# bundled Wine dataset and store it for later runs.
if os.path.exists(file_path):
    print(f"File found in: {file_path}")
else:
    print("File is not in Drive. Importing from sklearn...")
    raw_data = load_wine()
    df = pd.DataFrame(raw_data.data, columns=raw_data.feature_names)
    df['target'] = raw_data.target
    df.to_csv(file_path, index=False)
    print(f"Dataset saved in: {file_path}")

# Always read back from the CSV so both branches yield the same frame.
df = pd.read_csv(file_path)

# Feature matrix = every column except the label; label vector = 'target'.
X = df.drop(columns='target').values
y = df['target'].values

# Stratified 80/20 split with a fixed seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize using statistics fitted on the training split only, then
# apply the same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print("-" * 60)

#--------------------------------------------------------
#GAUSSIAN NAÏVE BAYES
#--------------------------------------------------------
class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled, per class, as an independent univariate
    Gaussian (i.e. a diagonal covariance matrix). Prediction uses the MAP
    rule evaluated entirely in log space:

        log P(y | x) ∝ log P(y) + sum_j log N(x_j; mean_yj, var_yj)

    Attributes set by ``fit``:
        classes: sorted unique labels, shape (n_classes,).
        mean:    per-class feature means, shape (n_classes, n_features).
        var:     per-class (population) feature variances, same shape.
        priors:  class relative frequencies, shape (n_classes,).
    """

    def __init__(self, epsilon=1e-9):
        """Create an unfitted model.

        Args:
            epsilon: Small constant added to every variance so that a
                zero-variance feature does not cause a division by zero.
        """
        self.epsilon = epsilon

    def fit(self, X, y):
        """Estimate per-class means, variances and priors.

        Args:
            X: Training data of shape (n_samples, n_features).
            y: Class labels of shape (n_samples,).

        Returns:
            The fitted estimator (``self``), to allow call chaining.
        """
        # Boolean-mask indexing below needs real arrays, not lists.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        # One row per class, one column per feature.
        self.mean = np.zeros((n_classes, n_features))
        self.var = np.zeros((n_classes, n_features))
        self.priors = np.zeros(n_classes)

        for idx, c in enumerate(self.classes):
            # Rows belonging to the current class.
            X_c = X[y == c]

            self.mean[idx, :] = X_c.mean(axis=0)
            self.var[idx, :] = X_c.var(axis=0)

            # Prior probability of the class, P(y).
            self.priors[idx] = X_c.shape[0] / float(n_samples)

        return self

    def _calculate_likelihood(self, class_idx, x):
        """Return the per-feature Gaussian density of ``x`` under one class.

        Kept for inspection; ``predict`` uses the log-density instead
        because these values underflow to 0 far from the class mean.
        """
        mean = self.mean[class_idx]
        var = self.var[class_idx] + self.epsilon

        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)

        return numerator / denominator

    def _joint_log_likelihood(self, X):
        """Return log P(y) + log P(x | y) for every sample and class.

        Args:
            X: Samples of shape (n_samples, n_features).

        Returns:
            Array of shape (n_samples, n_classes).
        """
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            # Treat a 1-D input as a column of scalar samples, which is
            # what iterating over it sample-by-sample would do.
            X = X[:, np.newaxis]

        n_classes = len(self.classes)
        joint = np.empty((X.shape[0], n_classes))

        for idx in range(n_classes):
            var = self.var[idx] + self.epsilon
            # Analytic log of the Gaussian pdf. Computing exp() and then
            # log() would underflow and flatten the scores of distant points.
            log_pdf = -0.5 * (
                np.log(2.0 * np.pi * var) + (X - self.mean[idx]) ** 2 / var
            )
            # Independent features (diagonal covariance matrix): the joint
            # log-likelihood is the sum of the per-feature terms.
            joint[:, idx] = np.log(self.priors[idx]) + log_pdf.sum(axis=1)

        return joint

    def predict(self, X):
        """Predict the most probable class (MAP rule) for each sample.

        Args:
            X: Samples of shape (n_samples, n_features).

        Returns:
            Array of shape (n_samples,) holding labels drawn from
            ``self.classes``.
        """
        joint = self._joint_log_likelihood(X)
        # Class with the maximum log-posterior; ties go to the first class.
        return self.classes[np.argmax(joint, axis=1)]

# ---------------------------------------------------------
#EXECUTION & EVALUATION
# ---------------------------------------------------------

# Train the hand-written classifier on the standardized training split.
model = GaussianNaiveBayes()
model.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# Report accuracy for the custom model.
train_accuracy = accuracy_score(y_train, train_preds)
test_accuracy = accuracy_score(y_test, test_preds)
print("\n--- Custom Gaussian Naïve Bayes ---")
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy : {test_accuracy:.4f}")

# Per-class precision / recall / F1 on the held-out split.
print("\nClassification Report (Test Set):")
print(classification_report(y_test, test_preds))

# Reference run: sklearn's GaussianNB on the same data as a sanity check.
print("\n--- sklearn GaussianNB ---")
from sklearn.naive_bayes import GaussianNB
sk_model = GaussianNB()
sk_model.fit(X_train, y_train)
sk_preds = sk_model.predict(X_test)
sk_accuracy = accuracy_score(y_test, sk_preds)
print(f"Sklearn Test Accuracy: {sk_accuracy:.4f}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File found in: /content/drive/MyDrive/wine_dataset.csv
Train: (142, 13), Test: (36, 13)
------------------------------------------------------------

--- Custom Gaussian Naïve Bayes ---
Train Accuracy: 0.9789
Test Accuracy : 0.9722

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       1.00      0.93      0.96        14
           2       1.00      1.00      1.00        10

    accuracy                           0.97        36
   macro avg       0.97      0.98      0.97        36
weighted avg       0.97      0.97      0.97        36


--- sklearn GaussianNB ---
Sklearn Test Accuracy: 0.9722
