# Implementing Naive Bayes from Scratch

In [1]:
import numpy as np
from collections import defaultdict

### 1. Create toy dataset

In [2]:
# Four training samples (rows), each with three binary features (columns).
X_train = np.array(
    [
        [0, 1, 1],
        [0, 0, 1],
        [0, 0, 0],
        [1, 1, 0],
    ]
)

# One class label per training row, in the same order as X_train.
Y_train = ["Y", "N", "Y", "Y"]

# A single sample to classify, kept 2-D so it can be iterated row by row.
X_test = np.array([[1, 1, 0]])

In [3]:
X_train

array([[0, 1, 1],
       [0, 0, 1],
       [0, 0, 0],
       [1, 1, 0]])

In [4]:
Y_train

['Y', 'N', 'Y', 'Y']

### 2. Group the samples by label and record their indices

In [6]:
def get_label_indices(labels):
    """Collect, for every distinct label, the positions at which it occurs.

    Args:
        labels (list) : list of labels, one per sample

    Returns:
        defaultdict: {class1: [indices], class2: [indices]}, with indices
        in ascending order and classes in order of first appearance
    """
    groups = defaultdict(list)
    for position, name in enumerate(labels):
        # Looking up an unseen label creates its empty bucket on the fly.
        bucket = groups[name]
        bucket.append(position)
    return groups


In [7]:
# Group the training-sample indices by class label; the bare name on the
# last line makes the notebook display the resulting mapping.
label_indices = get_label_indices(Y_train)
label_indices

defaultdict(list, {'Y': [0, 2, 3], 'N': [1]})

### 3. Calculate prior $P(N)$ and $P(Y)$

In [15]:
def get_prior(label_indices):
    """Estimate each class prior as its share of the training samples.

    Args:
        label_indices : grouped sample indices by class

    Returns:
        prior: dictionary, with class label as key, corresponding prior as the value
    """
    # Total number of samples across all classes.
    n_samples = sum(len(members) for members in label_indices.values())

    # Each prior is the class's sample count divided by the overall count.
    return {
        name: len(members) / n_samples
        for name, members in label_indices.items()
    }

In [18]:
# Compute the class priors from the grouped indices; the bare name on the
# last line makes the notebook display them.
prior = get_prior(label_indices)
prior

{'Y': 0.75, 'N': 0.25}

### 4. Calculate likelihood $P(x | y_{k})$

In [25]:
def get_likelihood(features, label_indices, smoothing=0):
    """Estimate per-class feature likelihoods from the training samples.

    For each class, the feature values of its samples are summed column-wise,
    `smoothing` is added to every sum, and the result is divided by the
    number of samples in the class plus `2 * smoothing`.

    Args:
        features (matrix): matrix of features, one row per sample
        label_indices (dict): grouped sample indices by class
        smoothing (int, optional): additive smoothing parameter. Defaults to 0.
    Returns:
        likelihood: dictionary, with class as key, corresponding conditional probability P(feature|class) vector as value
    """
    result = {}
    for cls, rows in label_indices.items():
        # Column-wise sum over this class's rows, plus the smoothing term.
        smoothed_counts = features[rows, :].sum(axis=0) + smoothing
        # Smoothing is added twice to the denominator (once per possible
        # value of a two-valued feature).
        denominator = len(rows) + 2 * smoothing
        result[cls] = smoothed_counts / denominator
    return result

In [27]:
# Estimate the per-class likelihoods with an additive smoothing term of 1
# and print the resulting mapping.
smoothing = 1
likelihood = get_likelihood(X_train, label_indices, smoothing)
print(likelihood)

{'Y': array([0.4, 0.6, 0.4]), 'N': array([0.33333333, 0.33333333, 0.66666667])}


### 5. Calculate posterior probability $P(y_{k} | x)$

In [None]:
def get_posterior(X, prior, likelihood):
    """Compute posterior of testing samples, based on prior and likelihood

    Features are treated as binary: a truthy feature value contributes
    P(feature=1 | class) to the product, a falsy one contributes
    1 - P(feature=1 | class).

    Args:
        X (iterable): testing samples, one sequence of binary feature values per sample
        prior (dict): dictionary, with class label as key, corresponding prior as the value
        likelihood (dict): dictionary, with class label as key, corresponding
            conditional probability P(feature=1|class) vector as value

    Returns:
        posteriors: list with one dictionary per testing sample, with class
        label as key, corresponding posterior as value. Each dictionary is
        normalized to sum to 1, unless every class has zero probability, in
        which case the zeros are returned as-is.
    """

    posteriors = []
    for x in X:
        # posterior is proportional to prior * likelihood; work on a copy so
        # the caller's prior is never modified
        posterior = prior.copy()
        for label, likelihood_label in likelihood.items():
            for index, bool_value in enumerate(x):
                p_present = likelihood_label[index]
                # Use P(feature=1|class) when the feature is present and its
                # complement when it is absent.
                posterior[label] *= p_present if bool_value else (1 - p_present)

        # Normalize so the class posteriors sum to 1. Skip when every class
        # ended up with zero probability to avoid dividing by zero.
        total = sum(posterior.values())
        if total > 0:
            for label in posterior:
                posterior[label] /= total

        posteriors.append(posterior)
    return posteriors


In [33]:
X_test

array([[1, 1, 0]])