In [129]:
import numpy as np
import sklearn
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
import pandas as pd
%matplotlib widget

In [130]:
data = load_iris()
dir(data)
features = data['data']
target = data['target'][:,np.newaxis]

dataframe = pd.DataFrame(
    np.concatenate((features, target, data['target_names'][target]), axis = 1),
    columns = data['feature_names'] + ['target labels', 'target names'] 
)
dataframe.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target labels,target names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [131]:
class NormalDistr:

    def __init__(self, features: np.ndarray) -> None:
        self.mean = features.mean(axis=0)
        self.std = features.std(axis=0)

    def logprob(self, value: np.ndarray) -> np.ndarray:
        res = -0.5*np.log(2*np.pi*self.std**2) - 2*(value - self.mean)**2 / (2 * self.std**2)
        return res
    def prob(self, value: np.ndarray) -> np.ndarray:
        return np.exp(self.logprob(value))

NormalDistr(features)

<__main__.NormalDistr at 0x24789a8f230>

In [132]:
from sklearn.base import BaseEstimator, ClassifierMixin

class Bayes(ClassifierMixin, BaseEstimator):

    def fit(self, X, y, weights=None, distribution=None):

        self.labels = np.unique(y)
        if distribution is None:
            self.distribution = NormalDistr
        else:
            self.distribution = distribution

        self.fitted = {
            lbl: self.distribution(X[y == lbl, :])
            for lbl in self.labels
        }
        self.cat_probs = {
            lbl: (y == lbl).sum()/len(self.labels)
            for lbl in self.labels
        }

    def predict_log_proba(self, X):
        res = np.zeros((X.shape[0], len(self.labels)))
        for ind, lbl in enumerate(self.labels):
            res[:,ind] = self.fitted[lbl].logprob(X).sum(axis=1) - np.full(X.shape[0], self.cat_probs[lbl])
        return res


    def predict_proba(self, X):
        return np.exp(self.predict_log_proba(X))
    
    def predict(self, X):
        
        return self.labels[self.predict_proba(X).argmax(axis=1)]

In [133]:
nb = Bayes()
nb.fit(features, dataframe["target labels"])
print('log probas:\n{}'.format(nb.predict_log_proba(features[-2:])))
print('predicted labels:\n{}'.format(nb.predict(features[-2:])))
print('\nIt`s alive! More tests coming.')
print(dataframe['target labels'].shape)
print(f'{features.shape=}')

log probas:
[[-938.32133103  -51.24526701  -20.08435531]
 [-691.82628217  -25.78206256  -19.40749502]]
predicted labels:
['2' '2']

It`s alive! More tests coming.
(150,)
features.shape=(150, 4)


In [134]:
from sklearn.model_selection import train_test_split
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.25)

print(features_train.shape, target_train.reshape(-1).shape)
print(features_train)

(112, 4) (112,)
[[4.9 2.5 4.5 1.7]
 [5.5 2.4 3.7 1. ]
 [6.2 3.4 5.4 2.3]
 [6.9 3.1 5.4 2.1]
 [7.2 3.6 6.1 2.5]
 [6.1 2.8 4.7 1.2]
 [6.3 3.3 4.7 1.6]
 [6.3 3.3 6.  2.5]
 [6.1 3.  4.9 1.8]
 [6.4 2.8 5.6 2.2]
 [6.4 3.2 4.5 1.5]
 [6.3 2.5 4.9 1.5]
 [4.6 3.6 1.  0.2]
 [5.6 2.9 3.6 1.3]
 [5.8 2.7 5.1 1.9]
 [5.7 3.8 1.7 0.3]
 [7.2 3.2 6.  1.8]
 [5.1 3.4 1.5 0.2]
 [5.1 3.8 1.6 0.2]
 [6.  3.4 4.5 1.6]
 [6.1 2.9 4.7 1.4]
 [5.7 2.6 3.5 1. ]
 [4.8 3.4 1.9 0.2]
 [5.1 3.5 1.4 0.3]
 [6.2 2.9 4.3 1.3]
 [4.4 2.9 1.4 0.2]
 [5.8 2.7 3.9 1.2]
 [6.9 3.2 5.7 2.3]
 [4.6 3.4 1.4 0.3]
 [5.5 2.5 4.  1.3]
 [7.3 2.9 6.3 1.8]
 [7.9 3.8 6.4 2. ]
 [5.1 2.5 3.  1.1]
 [5.7 4.4 1.5 0.4]
 [5.2 2.7 3.9 1.4]
 [4.9 3.1 1.5 0.2]
 [5.1 3.8 1.5 0.3]
 [5.  3.3 1.4 0.2]
 [5.5 3.5 1.3 0.2]
 [5.9 3.2 4.8 1.8]
 [5.1 3.3 1.7 0.5]
 [5.  3.4 1.6 0.4]
 [5.7 2.5 5.  2. ]
 [6.7 3.3 5.7 2.1]
 [7.4 2.8 6.1 1.9]
 [5.1 3.8 1.9 0.4]
 [4.6 3.2 1.4 0.2]
 [5.1 3.7 1.5 0.4]
 [6.3 2.7 4.9 1.8]
 [6.3 2.8 5.1 1.5]
 [5.  2.  3.5 1. ]
 [7.7 3.  6.1 2

In [135]:
nb = Bayes()
nb.fit(features_train, target_train.reshape(-1))
nb_test_log_proba = nb.predict_log_proba(features_test)

In [136]:
print('Naive Bayes classifier accuracy on the train set: {}'.format(nb.score(features_train, target_train.reshape(-1))))

Naive Bayes classifier accuracy on the train set: 0.9642857142857143


In [137]:
print('Naive Bayes classifier accuracy on the test set: {}'.format(nb.score(features_test, target_test.reshape(-1))))

Naive Bayes classifier accuracy on the test set: 0.9473684210526315


In [138]:
from sklearn import naive_bayes

sklearn_nb = naive_bayes.GaussianNB()
sklearn_nb.fit(features_train, target_train)
sklearn_nb_test_log_proba = sklearn_nb.predict_log_proba(features_test)

  y = column_or_1d(y, warn=True)


In [139]:
print('sklearn implementation accuracy on the train set: {}'.format(sklearn_nb.score(features_train, target_train)))

sklearn implementation accuracy on the train set: 0.9642857142857143


In [140]:
print('sklearn implementation accuracy on the test set: {}'.format(sklearn_nb.score(features_test, target_test)))

sklearn implementation accuracy on the test set: 0.9473684210526315
