In [5]:
import math
import torch
import torch.nn as nn

In [2]:
import pandas as pd

path = 'C:/Users/jeanb/Documents/Python Scripts/DiveIntoDL/data/'
data = pd.read_csv(path + 'iris.csv')
data.shape # 150, 5
data.keys() # 'sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
label = 'species'
train_idx = list(range(0, 40)) + list(range(50, 90)) + list(range(100, 140))
val_idx = list(range(40, 50)) + list(range(90, 100)) + list(range(140, 150))
type(data['sepal_length'][val_idx])

# from text labels to id labels
label_idx = {}
idx = 0
for i in range(data.shape[0]):
    label_name = data[label][i]
    if label_name not in label_idx:
        label_idx[label_name] = idx
        idx += 1
print(label_idx)

{'setosa': 0, 'versicolor': 1, 'virginica': 2}


In [None]:
X_train = torch.zeros((len(train_idx), len(features)))
y_train = torch.zeros(len(train_idx))

X_val = torch.zeros((len(val_idx), len(features)))
y_val = torch.zeros(len(val_idx))

for i, k in enumerate(features):
    X_train[:, i] = torch.tensor(data[k][train_idx].values, dtype=torch.float32)
    X_val[:, i] = torch.tensor(data[k][val_idx].values, dtype=torch.float32)


for i, j in enumerate(train_idx):
    idx = label_idx[ data[label][j] ]
    y_train[i] = torch.tensor(idx)

for i, j in enumerate(val_idx):
    idx = label_idx[ data[label][j] ]
    y_val[i] = torch.tensor(idx)

print("Training:", X_train.shape, y_train.shape)
print("Evaluation:", X_val.shape, y_val.shape)

In [14]:
class NaiveBayes():
    """
    Not multinomial or Bernoulli since features are not finite but continuous
    """
    def __init__(self, X_train, y_train):
        assert len(X_train.shape) == 2 
        assert X_train.shape[0] == y_train.shape[0]
        self.X_train = X_train
        self.y_train = y_train
        self.nb_features = X_train.shape[1]
        print(f"Number of features: {self.nb_features}")

        ### Calculate class probability in training dataset
        self.class_probability = {}
        N = len(y_train)
        for i in range(N):
            c = int(y_train[i].item())
            if c in self.class_probability: # increase the count of class c by 1/total
                self.class_probability[c] += 1./N
            else: # create class c
                self.class_probability[c] = 1./N
        print("Class probabilities:", self.class_probability)

        ### Calculate feature mean and variance per class
        self.features_per_class = {}
        for c in self.class_probability.keys():
            self.features_per_class[c] = [] # create the classes
        for i in range(N):
            c = int(y_train[i].item()) # class
            self.features_per_class[c].append( X_train[i] )
        
        for c in self.features_per_class.keys():
            self.features_per_class[c] = torch.vstack(self.features_per_class[c])
            print(c, self.features_per_class[c].shape)

        self.mean_feature_per_class = {}
        self.variance_feature_per_class = {}
        for c in self.features_per_class.keys():
            self.mean_feature_per_class[c] = self.features_per_class[c].mean(dim=0)
            self.variance_feature_per_class[c] = self.features_per_class[c].std(dim=0)**2
            print(c, self.mean_feature_per_class[c], self.variance_feature_per_class[c])


    def _Gaussian(self, mean, variance, xi):
        """Returns the Gaussian conditional probability  P( x_i | C )
        Args: mean and variance of feature x_i in class C, xi value of feature x_i in the sample """
        return torch.exp(-(xi - mean)**2 / (2*variance)) / torch.sqrt(2*math.pi * variance)
    
    def _Gaussian_product(self, X, c):
        """Returns the product of the Gaussian probabilites for each feature xi of X knowing class c"""
        assert len(X.shape) == 1
        assert X.shape[0] == self.nb_features
        prod = self.class_probability[c]
        for i in range(X.shape[0]):
            mean = self.mean_feature_per_class[c][i]
            variance = self.variance_feature_per_class[c][i]
            prod *= self._Gaussian(mean, variance, X[i])
        return prod


    def predict(self, X:torch.Tensor):
        """X: dim (batch, number_of_features)"""
        assert len(X.shape) == 2 
        assert X.shape[1] == self.nb_features

        proba = []
        classes = torch.zeros(X.shape[0])
        for sample_id in range(X.shape[0]):
            proba.append( [] )
            for c in self.class_probability.keys():
                proba[sample_id].append(self._Gaussian_product(X[sample_id], c))
            proba[sample_id] = torch.hstack(proba[sample_id])
            classes[sample_id] = torch.argmax(proba[sample_id]) # Maximum A Posteriori
        print(proba)
        return classes

    
model = NaiveBayes(X_train=X_train, y_train=y_train)

Number of features: 4
Class probabilities: {0: 0.33333333333333354, 1: 0.33333333333333354, 2: 0.33333333333333354}
0 torch.Size([40, 4])
1 torch.Size([40, 4])
2 torch.Size([40, 4])
0 tensor([5.0375, 3.4400, 1.4625, 0.2325]) tensor([0.1311, 0.1327, 0.0296, 0.0099])
1 tensor([6.0100, 2.7800, 4.3175, 1.3500]) tensor([0.2737, 0.1109, 0.2035, 0.0431])
2 tensor([6.6225, 2.9600, 5.6075, 1.9900]) tensor([0.4679, 0.1132, 0.3453, 0.0743])


In [15]:
# Make predictions on the training set
predictions = model.predict(X_train)

# Calculate accuracy
accuracy = (predictions == y_train).float().mean()
print(f"Training Accuracy: {accuracy.item() * 100:.2f}%")

[tensor([3.2199e+00, 1.9678e-18, 1.6726e-23]), tensor([1.4864e+00, 7.8348e-18, 3.0087e-23]), tensor([1.1808e+00, 4.2286e-19, 3.1423e-24]), tensor([1.0775e+00, 6.3159e-18, 2.7884e-23]), tensor([2.9922e+00, 6.9194e-19, 7.0984e-24]), tensor([9.5790e-02, 1.2676e-15, 9.5040e-21]), tensor([1.3310e+00, 5.5624e-18, 4.0407e-23]), tensor([3.4167e+00, 1.0375e-17, 6.1412e-23]), tensor([2.3446e-01, 7.6173e-19, 3.6241e-24]), tensor([9.0727e-01, 1.5499e-18, 7.7728e-24]), tensor([1.6231e+00, 4.2186e-18, 4.3397e-23]), tensor([2.0610e+00, 1.7960e-17, 9.5237e-23]), tensor([5.6191e-01, 3.1635e-19, 1.7287e-24]), tensor([1.0150e-02, 2.3901e-22, 4.2850e-27]), tensor([3.6910e-02, 5.3501e-21, 2.4359e-25]), tensor([5.1775e-03, 3.0130e-19, 9.6920e-24]), tensor([1.5905e-01, 4.9912e-18, 8.1586e-23]), tensor([2.7003e+00, 2.5292e-17, 1.7420e-22]), tensor([1.3167e-01, 5.4059e-16, 4.5573e-21]), tensor([1.7523e+00, 9.8286e-18, 9.3317e-23]), tensor([8.2151e-01, 4.8948e-16, 2.1102e-21]), tensor([6.7887e-01, 2.4023e-16, 1

In [16]:
# Make predictions on the validation set
predictions = model.predict(X_val)

# Calculate accuracy
accuracy = (predictions == y_val).float().mean()
print(f"Validation Accuracy: {accuracy.item() * 100:.2f}%")

[tensor([1.8638e+00, 4.1446e-18, 3.6280e-23]), tensor([4.7186e-03, 1.5207e-18, 2.5977e-24]), tensor([3.8706e-01, 8.5371e-20, 8.3217e-25]), tensor([2.9869e-03, 1.4968e-13, 6.7654e-19]), tensor([2.1662e-02, 1.7169e-14, 7.8489e-20]), tensor([1.0804e+00, 6.5917e-17, 2.1455e-22]), tensor([1.5545e+00, 2.9786e-18, 2.9013e-23]), tensor([1.2852e+00, 1.1057e-18, 7.0733e-24]), tensor([2.0599e+00, 3.3148e-18, 3.3064e-23]), tensor([3.0606e+00, 4.2403e-18, 2.6000e-23]), tensor([0.0000e+00, 2.1064e-01, 6.0928e-05]), tensor([0.0000, 0.3274, 0.0037]), tensor([0.0000e+00, 2.4808e-01, 2.2261e-05]), tensor([6.2287e-40, 5.3887e-04, 1.2266e-09]), tensor([0.0000e+00, 3.4736e-01, 1.2799e-04]), tensor([0.0000e+00, 2.5991e-01, 7.7842e-05]), tensor([0.0000e+00, 3.8215e-01, 2.0901e-04]), tensor([0.0000, 0.4408, 0.0006]), tensor([2.1472e-35, 5.4523e-04, 1.9356e-09]), tensor([0.0000e+00, 3.7490e-01, 1.2436e-04]), tensor([0.0000e+00, 6.6577e-09, 6.7294e-02]), tensor([0.0000e+00, 4.8130e-07, 6.9768e-02]), tensor([0.0