# Naive Baye from Scratch

In [17]:
import re
import string
from math import log
from functools import reduce
from pathlib import Path
import itertools
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:

column_names = [
    "age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
    "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
    "hours_per_week", "native_country", "income"
]
data = pd.read_csv("adult/adult.data", names=column_names, na_values=" ?")
data.dropna(inplace=True)

with open("adult/adult.test", "r") as test_file:
    test_data = [line.strip() for line in test_file.readlines()]
test_df = pd.DataFrame([line.split(", ") for line in test_data], columns=column_names)
test_df['income'] = test_df['income'].str.replace('.', '')


In [3]:
categorical = [var for var in data.columns if data[var].dtype=='O']

print('There are {} categorical variables\n'.format(len(categorical)))

print('The categorical variables are :\n\n', categorical)

There are 9 categorical variables

The categorical variables are :

 ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income']


In [4]:
data[categorical].isnull().sum()

workclass         0
education         0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
native_country    0
income            0
dtype: int64

In [5]:
X = data.drop(['income'], axis=1)

y = data['income']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [7]:
import category_encoders as ce

In [8]:
encoder = ce.OneHotEncoder(cols=['workclass', 'education', 'marital_status', 'occupation', 'relationship', 
                                 'race', 'sex', 'native_country'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [9]:
print(X_train)

       age  workclass_1  workclass_2  workclass_3  workclass_4  workclass_5  \
30461   34            1            0            0            0            0   
18186   51            1            0            0            0            0   
24974   50            1            0            0            0            0   
25659   52            0            1            0            0            0   
13876   31            1            0            0            0            0   
...    ...          ...          ...          ...          ...          ...   
14169   66            0            0            1            0            0   
21206   50            1            0            0            0            0   
10646   50            1            0            0            0            0   
11687   49            0            1            0            0            0   
2976    34            1            0            0            0            0   

       workclass_6  workclass_7  fnlwgt  education_

In [10]:
import numpy as np

# Calculate prior probabilities for each class
def calc_prior(y):
    class_counts = y.value_counts()
    prior_prob = class_counts / len(y)
    return prior_prob



In [11]:
# Calculate likelihood using Laplace smoothing
def calc_likelihood(X, y, feature, value, alpha=1):
    total_samples = len(y)
    class_counts = y.value_counts()
    feature_counts = X[y == value][feature].sum()
    class_feature_counts = X[y == value][feature].sum()
    likelihood = (feature_counts + alpha) / (class_feature_counts + alpha * 2)
    
    return likelihood


In [12]:

# Calculate posterior probabilities for each class
def calc_posterior(X, y, prior_prob, likelihoods):
    unique_classes = y.unique()
    posteriors = {}
    
    for class_label in unique_classes:
        posterior = np.log(prior_prob[class_label])
        
        for feature in X.columns:
            posterior += np.log(likelihoods[feature][class_label])
        
        posteriors[class_label] = posterior
    
    return posteriors


In [13]:
def predict(X, y, prior_prob, likelihoods):
    predicted_labels = []
    
    for idx, row in X.iterrows():
        posteriors = calc_posterior(X, y, prior_prob, likelihoods)
        predicted_label = max(posteriors, key=posteriors.get)
        predicted_labels.append(predicted_label)
    
    return predicted_labels

In [14]:
# Calculate accuracy score
def accuracy(y_true, y_pred):
    return accuracy_score(y_true, y_pred)

In [25]:
# Calculate prior probabilities and likelihoods
prior_prob = calc_prior(y_train)
likelihoods = {}

for feature in X_train.columns:
    likelihoods[feature] = {}
    for class_label in y_train.unique():
        likelihoods[feature][class_label] = calc_likelihood(X_train, y_train, feature, 1)

# Make predictions
y_pred = predict(X_test, y_train, prior_prob, likelihoods)

accuracy_value = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=' <=50K',zero_division=1)
recall = recall_score(y_test, y_pred, pos_label=' <=50K',zero_division=1)
confusion_matrix_result = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy_value)
print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", confusion_matrix_result)


Accuracy: 0.7512017238521466
Precision: 0.7512017238521466
Recall: 1.0
Confusion Matrix:
 [[4532    0]
 [1501    0]]
