## Minimal Naive Bayes Classifier Implementation

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.base import BaseEstimator

In [21]:
class NaiveBayesClassifier(BaseEstimator):
    def __init__(self):
        self._n_classes = np.nan
        self._features_num = np.nan
        self._mean = np.nan
        self._var = np.nan
        self._priors = np.nan

    def gaussian_pdf(self, x, cls_id):
        return np.exp(-0.5 * (((x - self._mean[cls_id]) ** 2) / self._var[cls_id])) / (
            np.sqrt(2 * np.pi * self._var[cls_id])
        )

    def fit(self, X, y):
        assert X.shape[0] == y.shape[0]
        self._features_num = X.shape[1]

        self._classes = np.unique(y)
        self._mean = np.zeros((self._classes.shape[0], X.shape[1]), dtype=np.float64)
        self._var = np.zeros((self._classes.shape[0], X.shape[1]), dtype=np.float64)
        self._priors = np.zeros((self._classes.shape[0],), dtype=np.float64)

        for i, cls in enumerate(self._classes):
            self._mean[i, :] = X[y == cls].mean(axis=0)
            self._var[i, :] = X[y == cls].var(axis=0)
            self._priors[i] = X[y == cls].shape[0] / X.shape[0]

    def predict_proba_one(self, x):
        probs = np.zeros_like(self._classes)

        for i, cls in enumerate(self._classes):
            prior = np.log(self._priors[i])
            posterior = np.sum(np.log(self.gaussian_pdf(x, i)))
            probs[i] = prior + posterior

        probs = np.exp(probs)
        probs /= np.sum(probs)

        return probs

    def predict_proba(self, X):
        assert X.shape[1] == self._features_num
        return np.apply_along_axis(self.predict_proba_one, axis=1, arr=X)

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)


In [22]:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10_000, n_features=10, n_classes=3, n_informative=8)

In [23]:
clf = NaiveBayesClassifier()
clf.fit(X, y)

In [24]:
from sklearn.metrics import accuracy_score
accuracy_score(y, clf.predict(X))

0.7589

In [25]:
from sklearn.model_selection import cross_val_score
from scipy.stats import hmean
hmean(cross_val_score(NaiveBayesClassifier(), X, y, scoring='accuracy', cv=10, n_jobs=-1))

0.7582719027026683

In [26]:
from sklearn.naive_bayes import GaussianNB as sk_clf
hmean(cross_val_score(sk_clf(), X, y, scoring='accuracy', cv=10, n_jobs=-1))

0.7666419957666081