In [1]:
from ucimlrepo import fetch_ucirepo 
  
# Download the dataset registered under UCI repository id 105.
congressional_voting_records = fetch_ucirepo(id=105)
voting_data = congressional_voting_records.data

# Feature and target tables (pandas DataFrames).
X, y = voting_data.features, voting_data.targets

# Uncomment either line below to inspect the dataset description.
# metadata:
# print(congressional_voting_records.metadata)
# variable information:
# print(congressional_voting_records.variables)


In [2]:
# Report (rows, columns) for the feature table first, then the target table.
for frame in (X, y):
    print(frame.shape)

(435, 16)
(435, 1)


In [3]:
import numpy as np
import pandas as pd

# features_count = X.shape[1]

# classes = np.unique(y)

# probs = pd.DataFrame(0.0, index=X.columns, columns=classes)
# #probs.assoc.loc[X.columns[0], classes[0]] = 42

# for index, row in X.iterrows():
#     for attr in X.columns:
#         if(row[attr] == "y"):
#             probs.loc[attr,y.loc[index,"Class"]] += 1.0

# #print(probs)

# aprior_log_prob = pd.Series(0.0, index=classes)

# for c in classes:
#     aprior_log_prob[c] = y[y["Class"] == c].shape[0]


# for index, row in probs.iterrows():
#     for c in probs.columns:
#         row[c] /= aprior_log_prob[c]

# aprior_log_prob /= X.shape[0]
# aprior_log_prob = np.log(aprior_log_prob)

# probs = np.log(probs)

# print(aprior_log_prob)
# print(probs)

class NaiveBayes:
    """Bernoulli Naive Bayes classifier over "y" / not-"y" features.

    Every feature is treated as binary: a cell equal to the string "y" counts
    as a "yes"; any other value (for example "n" or a missing value) counts as
    a "no". Class labels are read from the "Class" column of the target frame.
    """

    def __init__(self, m_lambda=.5):
        """Create an unfitted model.

        Parameters
        ----------
        m_lambda : float, default 0.5
            Additive (Lidstone) smoothing constant applied to the per-class
            "yes" counts. Must be non-negative; 0 disables smoothing and can
            produce probabilities of exactly 0 or 1 (and -inf log scores).

        Raises
        ------
        ValueError
            If ``m_lambda`` is negative.
        """
        if m_lambda < 0:
            raise ValueError("m_lambda must be non-negative")
        # DataFrame (features x classes) holding the smoothed
        # P(feature == "y" | class); populated by fit().
        self.probs = None
        # Series (indexed by class) holding log P(class); populated by fit().
        self.aprior_log_probs = None
        # Class labels in order of first appearance in the training targets.
        self.classes = None
        # One accuracy value per accuracy() call, in call order.
        self.m_accuracy = np.array([])
        self.m_lambda = m_lambda

    def fit(self, X, y):
        """Estimate class priors and per-feature "yes" probabilities.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features; one column per attribute.
        y : pandas.DataFrame
            Training targets with a "Class" column, sharing X's index.

        Any previously learned parameters are replaced; ``m_accuracy`` is
        left untouched.
        """
        labels = y["Class"]
        self.classes = labels.unique()

        # Number of training rows per class, ordered like self.classes.
        per_class = labels.value_counts()
        class_counts = pd.Series(
            [float(per_class.loc[c]) for c in self.classes], index=self.classes
        )

        # Count the "y" cells per (feature, class) in one grouped pass instead
        # of touching every cell from Python.
        yes_by_class = (X == "y").groupby(labels).sum()
        yes_counts = pd.DataFrame(
            yes_by_class.T.reindex(
                index=X.columns, columns=self.classes, fill_value=0
            ).to_numpy(dtype=float),
            index=X.columns,
            columns=self.classes,
        )

        # A feature has exactly two outcomes ("y" / not "y"), so the smoothed
        # denominator is count + 2 * lambda. Scaling lambda by the number of
        # classes is only correct when there happen to be two classes.
        feature_outcomes = 2
        self.probs = (yes_counts + self.m_lambda).div(
            class_counts + feature_outcomes * self.m_lambda, axis="columns"
        )

        self.aprior_log_probs = np.log(class_counts / len(y))

    def predict(self, X):
        """Return the most probable class for every row of X.

        fit() must have been called first. When several classes tie, the one
        that appears first in ``self.classes`` is chosen.

        Parameters
        ----------
        X : pandas.DataFrame
            Features with the same columns that were used for fitting.

        Returns
        -------
        numpy.ndarray
            Object array of predicted class labels, one per row of X.
        """
        class_labels = np.asarray(self.classes, dtype=object)

        # Log-likelihood tables, shape (n_features, n_classes); computed once
        # rather than once per cell.
        p_yes = self.probs.loc[X.columns].to_numpy(dtype=float)
        log_yes = np.log(p_yes)
        log_no = np.log(1.0 - p_yes)

        is_yes = (X == "y").to_numpy(dtype=bool)  # (n_rows, n_features)
        # Select (rather than multiply) so that a -inf entry for the outcome
        # that did not occur can never turn into NaN.
        per_feature = np.where(
            is_yes[:, :, None], log_yes[None, :, :], log_no[None, :, :]
        )
        scores = per_feature.sum(axis=1) + self.aprior_log_probs.to_numpy(dtype=float)
        return class_labels[scores.argmax(axis=1)]

    def accuracy(self, X, y):
        """Score the model on (X, y), print the result and record it.

        Parameters
        ----------
        X : pandas.DataFrame
            Features to classify.
        y : pandas.DataFrame
            True labels in a "Class" column, looked up by X's index.

        Returns
        -------
        float
            Fraction of rows whose predicted class equals the true class. The
            same value is printed and appended to ``m_accuracy``.

        Raises
        ------
        ZeroDivisionError
            If X has no rows.
        """
        predicted = self.predict(X)
        actual = y.loc[X.index, "Class"].to_numpy(dtype=object)
        guessed = int(np.count_nonzero(predicted == actual))

        currentSampleAccuracy = guessed / X.shape[0]
        print(f"Accuracy: {currentSampleAccuracy}")
        self.m_accuracy = np.append(self.m_accuracy, currentSampleAccuracy)
        return currentSampleAccuracy

    def getMeanAccuracy(self):
        """Return the mean of all accuracies recorded by accuracy() so far."""
        return np.mean(self.m_accuracy)

    


In [5]:
from sklearn.model_selection import KFold

# Cross-validation settings: 10 shuffled folds with a fixed seed so the split
# is reproducible.
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# A single model instance is reused for every fold: fit() replaces the learned
# parameters each time, while accuracy() keeps appending to the model's own
# per-fold accuracy history.
model = NaiveBayes()

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    model.accuracy(X_test, y_test)

# Mean accuracy over all folds.
print(model.getMeanAccuracy())

Accuracy: 0.8863636363636364
Accuracy: 0.9090909090909091
Accuracy: 0.9545454545454546
Accuracy: 0.9318181818181818
Accuracy: 0.8409090909090909
Accuracy: 0.9767441860465116
Accuracy: 0.8604651162790697
Accuracy: 0.8604651162790697
Accuracy: 0.9069767441860465
Accuracy: 0.8604651162790697
0.898784355179704
