In [40]:
import bisect
import numpy as np

In [86]:
from collections import Counter
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


class NaiveBayes:
    def __init__(self):
        self.priors = {}
        self.likelihoods = {}

    def build_classifier(self, train_features, train_classes):
        self.priors = Counter(train_classes)
        self.likelihoods = np.zeros(
            shape=(len(self.priors), train_features.shape[1], 4)
        )
        for features, result_class in zip(train_features, train_classes):
            for i, feature in enumerate(features):
                self.likelihoods[result_class, i, feature] += 1
        total = self.priors.total()
        for key in self.priors.keys():
            key_occurances = self.priors[key]
            self.priors[key] /= total
            for i in range(train_features.shape[1]): #Laplace smoothing
                for j in range(4):
                    self.likelihoods[key, i, j] = (self.likelihoods[key, i, j] + 1) / (key_occurances + 4)
        print(self.likelihoods)

    def find_intervals(self, train_data):
        self.intervals = np.zeros(shape=(train_data.shape[1], 3))
        for i, features in enumerate(train_data.T):
            max_value = max(features)
            min_value = min(features)
            section_size = (max_value - min_value) / 4
            first_section_limit = min_value + section_size
            second_section_limit = first_section_limit + section_size
            third_section_limit = second_section_limit + section_size
            self.intervals[i] = np.array(
                [first_section_limit, second_section_limit, third_section_limit]
            )
            
    @staticmethod
    def data_discretization(data, intervals):
        discretize = (
            lambda x: 0
            if x < intervals[0]
            else 1
            if x < intervals[1]
            else 2
            if x < intervals[2]
            else 3
        )
        return [discretize(x) for x in data]

    def predict(self, sample):
        max_probability = 0
        prediction = None
        for key in self.priors.keys():
            probability = self.priors[key]
            for i, feature in enumerate(sample):
                probability *= self.likelihoods[key, i, feature]
            if probability > max_probability:
                prediction = key
                max_probability = probability
        return prediction


class GaussianNaiveBayes:
    def __init__(self):
        self.priors = {}
        self.likelihoods = {}

    def build_classifier(self, train_features, train_classes):
        self.priors = Counter(train_classes)
        self.likelihoods = np.zeros(
            shape=(len(self.priors), train_features.shape[1], 2)
        )
        total = self.priors.total()
        for key in self.priors.keys():
            self.priors[key] /= total
            indices = np.where(train_classes == key)
            for i, feature in enumerate(train_features[indices].T):
                mean = np.mean(feature)
                deviation = np.std(feature, ddof=1) #unbiased estimator of the variance of the infinite population
                self.likelihoods[key, i, 0] = mean
                self.likelihoods[key, i, 1] = deviation

    @staticmethod
    def normal_dist(x, mean, std):
        return (
            1 / (std * np.sqrt(2 * np.sqrt(2 * np.pi))) * np.e ** (-0.5 * ((x - mean) / std) ** 2)
        )

    def predict(self, sample):
        max_probability = 0
        prediction = None
        for key in self.priors.keys():
            probability = self.priors[key]
            for i, feature in enumerate(sample):
                mean = self.likelihoods[key, i, 0]
                deviation = self.likelihoods[key, i, 1]
                probability *= GaussianNaiveBayes.normal_dist(feature, mean, deviation)
            if probability > max_probability:
                prediction = key
                max_probability = probability
        return prediction

In [74]:

iris = load_iris()

def run_naive_bayes(test_size, random_state, verbose = False, x = iris.data, y = iris.target):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    nb = NaiveBayes()
    nb.find_intervals(x_train)
    intervals = nb.intervals
    x_train = np.array([NaiveBayes.data_discretization(features, intervals[i]) for i, features in enumerate(x_train.T)]).T
    x_test = np.array([NaiveBayes.data_discretization(features, intervals[i]) for i, features in enumerate(x_test.T)]).T
    
    if verbose:
        print(f"Testing discrete naive bayes classifier")
        print(f"test size = {test_size}; random_state = {random_state}")
        print("result of classification of the test set:")
    good = 0
    total = 0
    nb = NaiveBayes()
    nb.build_classifier(x_train, y_train)
    for test_x, test_y in zip(x_test, y_test):
        prediction = nb.predict(test_x)
        if verbose:
            print(f"Prediction: {prediction}, True class: {test_y}")
        if prediction == test_y:
            good += 1
        total += 1
    if verbose:
        print(f"Accuracy: {good/total:.3f}")
    return good/total

def run_gaussian_naive_bayes(test_size, random_state, verbose = False, x = iris.data, y = iris.target):
    x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=random_state
)
    if verbose:
        print(f"Testing gaussian naive bayes classifier")
        print(f"test size = {test_size}; random_state = {random_state}")
        print("result of classification of the test set:")
    good = 0
    total = 0
    gaussian_naive_bayes = GaussianNaiveBayes()
    gaussian_naive_bayes.build_classifier(x_train, y_train)
    
    for test_x, test_y in zip(x_test, y_test):
        prediction = gaussian_naive_bayes.predict(test_x)
        if verbose:
            print(f"Prediction: {prediction}, True class: {test_y}")
        if prediction == test_y:
            good += 1
        total += 1
    if verbose:
        print(f"Accuracy: {good/total:.3f}")
    return good/total

In [75]:
result_of_naive_bayes = run_naive_bayes(0.1, 123, verbose=True)

Testing discrete naive bayes classifier
test size = 0.1; random_state = 123
result of classification of the test set:
Prediction: 1, True class: 1
Prediction: 2, True class: 2
Prediction: 2, True class: 2
Prediction: 1, True class: 1
Prediction: 0, True class: 0
Prediction: 1, True class: 2
Prediction: 1, True class: 1
Prediction: 0, True class: 0
Prediction: 0, True class: 0
Prediction: 1, True class: 1
Prediction: 2, True class: 2
Prediction: 0, True class: 0
Prediction: 1, True class: 1
Prediction: 2, True class: 2
Prediction: 2, True class: 2
Accuracy: 0.933


In [87]:
for i in range(250, 253):
    run_naive_bayes(0.1, i, verbose=True)
    print("\n")

Testing discrete naive bayes classifier
test size = 0.1; random_state = 250
result of classification of the test set:
[[[0.69387755 0.26530612 0.02040816 0.02040816]
  [0.02040816 0.24489796 0.59183673 0.14285714]
  [0.93877551 0.02040816 0.02040816 0.02040816]
  [0.93877551 0.02040816 0.02040816 0.02040816]]

 [[0.10204082 0.57142857 0.30612245 0.02040816]
  [0.24489796 0.63265306 0.10204082 0.02040816]
  [0.02040816 0.2244898  0.73469388 0.02040816]
  [0.02040816 0.28571429 0.67346939 0.02040816]]

 [[0.04081633 0.20408163 0.51020408 0.24489796]
  [0.08163265 0.63265306 0.26530612 0.02040816]
  [0.02040816 0.02040816 0.42857143 0.53061224]
  [0.02040816 0.02040816 0.28571429 0.67346939]]]
Prediction: 1, True class: 1
Prediction: 2, True class: 2
Prediction: 0, True class: 0
Prediction: 2, True class: 1
Prediction: 2, True class: 2
Prediction: 0, True class: 0
Prediction: 1, True class: 1
Prediction: 1, True class: 1
Prediction: 1, True class: 1
Prediction: 0, True class: 0
Prediction

In [45]:
for a in range(1, 5):
    print(a)

1
2
3
4
