In [62]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import norm

class GaussianNaiveBayes:
    def __init__(self):
        self.class_prior = {}
        self.mean = {}
        self.variance = {}

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.classes = np.unique(y)

        for c in self.classes:
            X_c = X[y == c]
            self.class_prior[c] = len(X_c) / n_samples
            self.mean[c] = X_c.mean(axis=0)
            self.variance[c] = X_c.var(axis=0)

    def _calculate_likelihood(self, X, c):
        class_mean = self.mean[c]
        class_var = self.variance[c]
        numerator = np.exp(-(X - class_mean) ** 2 / (2 * class_var))
        denominator = np.sqrt(2 * np.pi * class_var)
        return np.prod(numerator / denominator, axis=1)

    def _predict_single_sample(self, x):
        posteriors = {}
        for c in self.classes:
            likelihood = self._calculate_likelihood(x, c)
            posterior = likelihood * self.class_prior[c]
            posteriors[c] = posterior
        return max(posteriors, key=posteriors.get)

    def predict(self, X):
        y_pred = [self._predict_single_sample(x) for x in X]
        return np.array(y_pred)

data_set = pd.read_csv('./data/survey lung cancer.csv')
data_set = data_set.replace('x', np.nan).dropna()

for i in data_set.columns[2:-1]:
    data_set[i] = data_set[i].astype(int)

data_set['LUNG_CANCER'] = data_set['LUNG_CANCER'].map({'YES': 1, 'NO': 0})
data_set['LUNG_CANCER'] = data_set['LUNG_CANCER'].map({'M': 1, 'F': 0})

X = data_set.drop(columns=['LUNG_CANCER'])
y = data_set['LUNG_CANCER']

# Initialize Gaussian Naive Bayes classifier
nb_classifier = GaussianNaiveBayes()

# Train the classifier
nb_classifier.fit(X.to_numpy(), y.to_numpy())

# Predict on the test set
y_pred = nb_classifier.predict(X.to_numpy())

# Calculate accuracy
accuracy = np.mean(y_pred == y.to_numpy())
print("Accuracy:", accuracy)
# data_set.iloc[:, -1] = data_set.iloc[:, -1].map({'YES': 1, 'NO': 0})

# mean_values = data_set.iloc[:, 2:].mean()
# variance_values = data_set.iloc[:, 2:].var()

# gaussian_params = {}
# for column in data_set.columns[2:-1]:
#     mean, std = norm.fit(data_set[column])
#     gaussian_params[column] = {'mean': mean, 'std': std}



  self.mean[c] = X_c.mean(axis=0)


ZeroDivisionError: division by zero