In [36]:
import numpy as np
from sklearn.model_selection import train_test_split

# Naive Bayes for discrete case

## Generating data for Naive Bayes (binary and continuous features)

In [160]:
import numpy as np
# Seed the global NumPy RNG so every draw below is reproducible.
# The order of the draws matters: reordering them changes the generated data.
np.random.seed(42)

# Binary features: 1000 records x 100 columns, each entry drawn from {0, 1}.
binary_features = np.random.choice([0, 1], size=(1000, 100))

# Continuous features: 1000 records x 50 columns drawn from a standard normal distribution.
continuous_features = np.random.randn(1000, 50)

# Concatenate binary and continuous features along the columns.
# Columns 0-99 are the binary features and columns 100-149 the continuous ones.
X = np.concatenate([binary_features, continuous_features], axis=1)

# Random binary labels, one per record. They are drawn independently of X,
# so the features carry no information about the label.
y = np.random.choice([0, 1], size=1000)

# Display a few samples
print("Sample features:")
print(X[:5])
print("\nSample labels:")
print(y[:5])

Sample features:
[[ 0.          1.          0.          0.          0.          1.
   0.          0.          0.          1.          0.          0.
   0.          0.          1.          0.          1.          1.
   1.          0.          1.          0.          1.          1.
   1.          1.          1.          1.          1.          1.
   0.          0.          1.          1.          1.          0.
   1.          0.          0.          0.          0.          0.
   1.          1.          1.          1.          1.          0.
   1.          1.          0.          1.          0.          1.
   0.          1.          1.          0.          0.          0.
   0.          0.          0.          0.          0.          1.
   1.          0.          1.          1.          1.          1.
   0.          1.          0.          1.          1.          1.
   0.          1.          0.          1.          0.          1.
   0.          0.          1.          0.          1.      

## Algorithm for Discrete Naive Bayes

In [161]:
from sklearn.model_selection import train_test_split
from scipy.stats import bernoulli
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import norm
from scipy import stats
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from mpl_toolkits.mplot3d import Axes3D
from dataclasses import dataclass

class NaiveBayes:
    """Two-class (labels 0 and 1) Naive Bayes for mixed binary / continuous features.

    Each feature is modelled independently per class: columns whose values are
    all 0 or 1 get a Laplace-smoothed Bernoulli distribution, every other column
    gets a Gaussian. ``fit`` splits the data, fits the per-class distributions
    on the training part and prints the predictions and accuracy on the test part.

    Parameters
    ----------
    X : 2-D array of shape (records, features).
    y : 1-D array of labels; ``fit`` expects the labels 0 and 1.
    alpha : Laplace smoothing strength for the Bernoulli features.
        ``alpha=0`` disables smoothing.
    """

    # Lower bound for a fitted Gaussian standard deviation. A feature that is
    # constant within a class has std 0, for which norm.pdf evaluates to NaN.
    _MIN_STD = 1e-9

    def __init__(self, X, y, alpha=1):
        self.X = X
        self.y = y
        self.alpha = alpha

    def dataSplit(self):
        """Return a fixed-seed, shuffled 70/30 split: (X_train, X_test, y_train, y_test)."""
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y,
                                                            test_size=0.3,
                                                            random_state=24,
                                                            shuffle=True)
        return X_train, X_test, y_train, y_test

    def fitDistribution(self, X):
        """Fit a one-dimensional distribution to a single feature column.

        Parameters
        ----------
        X : 1-D array with the values of one feature for the records of one class.

        Returns
        -------
        tuple
            ``('d', frozen scipy bernoulli)`` when every value is 0 or 1,
            otherwise ``('c', frozen scipy norm)``.
        """
        values = np.asarray(X)

        # A column is binary when it only contains 0/1. Checking membership
        # (rather than "exactly two distinct values") keeps a 0/1 feature that
        # happens to be constant within one class on the Bernoulli path, where
        # smoothing applies, and keeps two-valued non-0/1 data out of it.
        if values.size > 0 and np.isin(values, (0, 1)).all():
            # P(x = 1) with Laplace smoothing. The denominator adds 2 * alpha
            # because a binary feature has two possible outcomes.
            prob = (np.sum(values) + self.alpha) / (len(values) + 2 * self.alpha)
            return ('d', bernoulli(prob))

        # Continuous feature: Gaussian with the sample mean and standard deviation.
        mean, std = np.mean(values), np.std(values)
        if std < self._MIN_STD:
            # Guard against a degenerate (zero-width) Gaussian.
            std = self._MIN_STD
        return ('c', norm(mean, std))

    def posterior(self, x, dist_x, prior, c, dist_type):
        """Return the unnormalised posterior ``prior * prod_i p(x_i | y=c)``.

        Parameters
        ----------
        x : one record (sequence of feature values).
        dist_x : dict mapping ``'dist_X{c}{i}'`` to the frozen distribution of
            feature ``i`` for class ``c``.
        prior : prior probability of class ``c``.
        c : class label used to build the dictionary keys.
        dist_type : dict mapping ``'type_X{c}{i}'`` to ``'d'`` (use pmf) or
            ``'c'`` (use pdf).

        The result is not divided by the evidence p(x), so values are only
        comparable between classes for the same record.
        """
        p = 1
        for i in range(len(x)):
            dist = dist_x[f'dist_X{c}{i}']
            if dist_type.get(f'type_X{c}{i}') == 'd':
                # Binary feature: probability mass
                p *= dist.pmf(x[i])
            else:
                # Continuous feature: probability density
                p *= dist.pdf(x[i])
        return prior * p

    def fit(self):
        """Train on the 70% split, then print per-record predictions and the test accuracy.

        For every test record the unnormalised posteriors of class 0 and class 1
        are printed (scaled by 100) together with the predicted and true class.
        Returns None.
        """
        X_train, X_test, y_train, y_test = self.dataSplit()
        classes = np.unique(y_train)

        dist_dict_0 = {}  # per-feature distributions of class 0
        dist_dict_1 = {}  # per-feature distributions of class 1
        X_class = {}      # training records grouped by class
        prior_class = {}  # class priors p(y=k)
        dist_type = {}    # 'd' / 'c' per (class, feature); selects pmf vs pdf

        # Group the training records by class and estimate the priors.
        for i in classes:
            X_class[f'X{i}'] = X_train[y_train == i]
            prior_class[f'prior_{i}'] = X_class[f'X{i}'].shape[0] / X_train.shape[0]

        # Fit p(x_j | y=k) for every feature j and class k.
        for i in classes:
            target = dist_dict_0 if i == 0 else dist_dict_1
            for j in range(X_train.shape[1]):
                dist_type[f'type_X{i}{j}'], target[f'dist_X{i}{j}'] = \
                    self.fitDistribution(X_class[f'X{i}'][:, j])

        correct_predictions = 0

        for rec, actual in zip(X_test, y_test):
            p_y0 = self.posterior(rec, dist_dict_0, prior_class['prior_0'], 0, dist_type)
            p_y1 = self.posterior(rec, dist_dict_1, prior_class['prior_1'], 1, dist_type)

            predicted_class = np.argmax([p_y0, p_y1])
            print(f"P(y=1|{rec}) = {p_y1 * 100}")
            print(f"P(y=0|{rec}) = {p_y0 * 100}")
            print(f"Model predicted class {predicted_class} and data was class {actual}")

            if predicted_class == actual:
                correct_predictions += 1
        accuracy = correct_predictions / len(y_test)
        print(f"\nModel Accuracy: {accuracy * 100:.2f}%")

### Running on the data with Laplace smoothing (alpha = 1)

In [162]:
discreteNB = NaiveBayes(X, y)
discreteNB.fit()

P(y=1|[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  1.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
  0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
  0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
  0.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00
  1.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
  0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  1.00000000e+00  1.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00
  1.00000000e+00  1.00000000e+00  1.00000000e+00  0.00000000e+00
  1.00000000e+00  1.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  0

### Running on the data without Laplace smoothing (alpha = 0)

In [163]:
discreteNB = NaiveBayes(X, y, alpha=0)
discreteNB.fit()

P(y=1|[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  1.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
  0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
  0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
  0.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00
  1.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
  0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  1.00000000e+00  1.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00
  1.00000000e+00  1.00000000e+00  1.00000000e+00  0.00000000e+00
  1.00000000e+00  1.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  0

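We don't see a noticeable difference between the two runs: every binary feature takes both values (0 and 1) within each class of the training set, so no conditional probability is estimated as zero and Laplace smoothing has nothing to correct.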