In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split

class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Every feature is modelled, independently per class, as a normal
    distribution whose mean and (population) variance are estimated from the
    training data. Prediction picks the class with the largest log-posterior.

    Attributes:
        classes: Sorted array of the distinct labels seen by ``fit`` (``None``
            before fitting).
        mean: Maps each class label to its per-feature mean.
        var: Maps each class label to its raw per-feature variance (ddof=0).
            Smoothing is applied only at prediction time, so these values stay
            the plain sample statistics.
        priors: Maps each class label to its relative frequency in the
            training labels.
        var_smoothing: Fraction of the largest overall feature variance that
            is added to every variance during prediction for stability.
    """

    def __init__(self, var_smoothing=1e-9):
        self.classes = None
        self.mean = {}
        self.var = {}
        self.priors = {}
        self.var_smoothing = var_smoothing
        # Absolute variance floor derived from the data in fit().
        self._epsilon = 0.0

    def fit(self, x, y):
        """Estimate per-class means, variances and priors.

        Args:
            x: Training features, one row per sample (DataFrame, 2-D array or
                nested sequence).
            y: Training labels, one per row of ``x``.

        Raises:
            ValueError: If the training set is empty or ``x`` and ``y`` have
                different lengths.
        """
        n_samples = len(y)
        if n_samples == 0:
            raise ValueError("cannot fit NaiveBayes on an empty training set")
        if len(x) != n_samples:
            raise ValueError(
                f"x has {len(x)} rows but y has {n_samples} labels"
            )

        # Plain sequences support neither boolean masking nor `y == c`.
        if not hasattr(x, "shape"):
            x = np.asarray(x, dtype=float)
        if not hasattr(y, "shape"):
            y = np.asarray(y)

        self.classes = np.unique(y)
        # Drop statistics left over from a previous fit.
        self.mean, self.var, self.priors = {}, {}, {}

        # Scale the smoothing term by the largest feature variance so it is
        # negligible for well-behaved features but still prevents a division
        # by zero when a feature is constant within a class.
        overall_var = np.asarray(x, dtype=float).var(axis=0)
        largest = float(overall_var.max()) if overall_var.size else 0.0
        self._epsilon = self.var_smoothing * (largest if largest > 0 else 1.0)

        for c in self.classes:
            x_c = x[y == c]
            self.mean[c] = np.mean(x_c, axis=0)
            self.var[c] = np.var(x_c, axis=0)
            self.priors[c] = len(x_c) / n_samples

    def gaussian_prob(self, x, mean, var):
        """Return the normal density of ``x`` for the given mean and variance.

        Kept for callers that want raw densities. ``predict`` does not use it:
        it works in log space instead, because densities underflow to zero for
        points far from the mean and are undefined when ``var`` is zero.
        """
        e = -((x - mean) ** 2) / (2 * var)
        return (1 / np.sqrt(2 * np.pi * var)) * np.exp(e)

    def _joint_log_likelihood(self, rows):
        """Return an (n_rows, n_classes) array of unnormalised log-posteriors."""
        scores = np.empty((rows.shape[0], len(self.classes)))
        for k, c in enumerate(self.classes):
            mean = np.asarray(self.mean[c], dtype=float)
            var = np.asarray(self.var[c], dtype=float) + self._epsilon
            # log N(x; mean, var), computed directly to avoid exp() underflow.
            log_pdf = -0.5 * (np.log(2 * np.pi * var) + (rows - mean) ** 2 / var)
            scores[:, k] = np.log(self.priors[c]) + log_pdf.sum(axis=1)
        return scores

    def predict(self, x):
        """Predict a class label for every row of ``x``.

        Args:
            x: Samples to classify, one row per sample (DataFrame or 2-D
                array-like) with the same feature order used in ``fit``.

        Returns:
            A NumPy array holding one predicted label per row.

        Raises:
            ValueError: If the model has not been fitted or ``x`` is not 2-D.
        """
        if self.classes is None:
            raise ValueError("NaiveBayes.predict called before fit")
        rows = np.asarray(x, dtype=float)
        if rows.ndim != 2:
            raise ValueError(
                f"expected 2-D input (samples x features), got {rows.ndim}-D"
            )
        best = np.argmax(self._joint_log_likelihood(rows), axis=1)
        return self.classes[best]

# Read the salary records from disk
df = pd.read_csv('Salaries.csv')

# Separate the target column from the feature columns
output = df['salary_more_than_100k']
input = df.drop(columns='salary_more_than_100k')

# One label encoder per categorical column; each adds a numeric "<name>_n" column
le_company, le_job, le_degree = LabelEncoder(), LabelEncoder(), LabelEncoder()
for _col, _enc in zip(('company', 'job', 'degree'),
                      (le_company, le_job, le_degree)):
    input[_col + '_n'] = _enc.fit_transform(input[_col])

# Keep only the encoded columns
input = input.drop(columns=['company', 'job', 'degree'])

# Force every remaining column to numeric; unparseable entries become NaN
input = input.apply(pd.to_numeric, errors='coerce')

# Replace any NaN produced above with 0
if input.isnull().values.any():
    print("NaN values found, filling with 0")
    input.fillna(0, inplace=True)

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    input,
    output,
    test_size=0.2,
    random_state=42,
)

# Fit the classifier on the training split
nb = NaiveBayes()
nb.fit(x_train, y_train)

# Show the per-class statistics
for c in nb.classes:
    print(f"Class: {c}, Mean: {nb.mean[c]}, Variance: {nb.var[c]}")

# Classify the held-out rows
y_pred = nb.predict(x_test)


Class: 0, Mean: company_n    0.50
job_n        1.00
degree_n     0.25
dtype: float64, Variance: company_n    0.7500
job_n        0.5000
degree_n     0.1875
dtype: float64
Class: 1, Mean: company_n    1.125
job_n        0.625
degree_n     0.625
dtype: float64, Variance: company_n    0.359375
job_n        0.734375
degree_n     0.234375
dtype: float64
