In [1]:
import numpy as np

In [37]:
# Toy training set: each row of X_train is one sample described by three
# binary (0/1) features; Y_train holds the class label of the row at the
# same position.
X_train = np.array([
    [0,1,1],
    [0,0,1],
    [0,0,0],
    [1,1,0]
])
Y_train = ['Y', 'N', 'Y', 'Y']
# Single sample to classify, kept 2-D (1 sample x 3 features).
X_test = np.array([[1,1,0]])

In [24]:
def get_label_indices(labels):
    """
    Bucket the positions of the samples by the class label they carry
    @param labels: list of labels, one per sample
    @return: defaultdict(list), {class1: [indices], class2: [indices]};
    classes appear in the order they are first seen and indices are
    in ascending order
    """
    from collections import defaultdict
    grouped = defaultdict(list)
    for position, current_label in enumerate(labels):
        # first access of an unseen label creates its empty bucket
        bucket = grouped[current_label]
        bucket.append(position)
    return grouped

In [25]:
# Group the positions of the training samples by their class label.
label_indices = get_label_indices(Y_train)
label_indices

defaultdict(list, {'Y': [0, 2, 3], 'N': [1]})

In [26]:
def get_prior(label_indices):
    """
    Estimate the class priors as the fraction of training samples
    that belong to each class
    @param label_indices: grouped sample indices by class
    @return: dictionary, with class label as key, corresponding
    prior as the value
    """
    class_sizes = {}
    for label, indices in label_indices.items():
        class_sizes[label] = len(indices)
    n_samples = sum(class_sizes.values())
    # relative frequency of every class among all samples
    return {label: size / n_samples for label, size in class_sizes.items()}

        

In [27]:
# Class priors: share of training samples belonging to each class.
prior = get_prior(label_indices)
prior

{'Y': 0.75, 'N': 0.25}

In [28]:
def get_likelihood(features, label_indices, smoothing=0):
    """
    Estimate, per class, how often each feature is switched on
    among the training samples of that class
    @param features: matrix of features (rows are samples)
    @param label_indices: grouped sample indices by class
    @param smoothing: integer, additive smoothing parameter
    @return: dictionary, with class as key, corresponding
    conditional probability P(feature|class) vector
    as value
    """
    def _class_likelihood(indices):
        # column-wise count of the class's samples, plus the smoothing term
        feature_counts = features[indices, :].sum(axis=0) + smoothing
        # the denominator grows by 2 * smoothing, mirroring the numerator
        return feature_counts / (len(indices) + 2 * smoothing)

    return {
        label: _class_likelihood(indices)
        for label, indices in label_indices.items()
    }

In [29]:
# smoothing = 1 adds one to every per-class feature count (and two to the
# class size), so no estimated likelihood is exactly 0 or 1.
smoothing = 1
likelihood = get_likelihood(X_train, label_indices, smoothing)
likelihood

{'Y': array([0.4, 0.6, 0.4]), 'N': array([0.33333333, 0.33333333, 0.66666667])}

In [None]:
def get_posterior(X, prior, likelihood):
    """
    Compute posterior of testing samples, based on prior and
    likelihood
    @param X: testing samples, an iterable of binary feature vectors
    @param prior: dictionary, with class label as key,
    corresponding prior as the value
    @param likelihood: dictionary, with class label as key,
    corresponding conditional probability P(feature=1|class)
    vector as value
    @return: list with one dictionary per testing sample, each
    with class label as key, corresponding normalized posterior
    as value. If no class gives a sample a non-zero probability
    the posterior is undefined and every label maps to NaN.
    """
    import math

    def _safe_log(p):
        # math.log raises on 0; an impossible event is -inf in log space
        return math.log(p) if p > 0 else -math.inf

    posteriors = []
    for x in X:
        # Work in log space: log posterior = log prior + sum of log
        # likelihoods (up to a constant). Multiplying many probabilities
        # directly underflows to 0 for every class on wide feature vectors.
        log_joint = {label: _safe_log(p) for label, p in prior.items()}
        for label, likelihood_label in likelihood.items():
            for index, bool_value in enumerate(x):
                p_on = likelihood_label[index]
                log_joint[label] += _safe_log(p_on if bool_value else 1 - p_on)

        if not log_joint:
            # no classes at all: nothing to normalize
            posteriors.append({})
            continue

        best = max(log_joint.values())
        if best == -math.inf:
            # every class assigns probability 0 to this sample
            posterior = {label: math.nan for label in log_joint}
        else:
            # Subtract the largest log score before exponentiating so the
            # best class maps to exp(0) = 1 and the sum can never be 0.
            weights = {label: math.exp(score - best)
                       for label, score in log_joint.items()}
            total = sum(weights.values())
            # normalize so that all sums up to 1
            posterior = {label: weight / total
                         for label, weight in weights.items()}
        posteriors.append(posterior)
    return posteriors

In [38]:
# Posterior class probabilities for X_test; the result is a list holding
# one {label: probability} dict per test sample.
posterior = get_posterior(X_test, prior, likelihood)
posterior

[{'Y': 0.9210360075805433, 'N': 0.07896399241945673}]

## Implementing Naive Bayes with sklearn

In [39]:
from sklearn.naive_bayes import BernoulliNB

In [40]:
# alpha is BernoulliNB's additive smoothing parameter (1.0 mirrors the
# smoothing = 1 used for the hand-written likelihood); fit_prior=True
# learns the class priors from the training labels.
clf = BernoulliNB(alpha=1.0, fit_prior=True)
clf.fit(X_train, Y_train)

BernoulliNB()

In [41]:
# One row per test sample; columns follow the order of clf.classes_
# (sorted labels, so 'N' comes before 'Y').
pred_prob = clf.predict_proba(X_test)
pred_prob

array([[0.07896399, 0.92103601]])

In [42]:
# Predicted class label for each row of X_test.
pred = clf.predict(X_test)
pred

array(['Y'], dtype='<U1')