# Naive Bayes Classifier

In [39]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Download dataset id 105 (congressional voting records) from the UCI ML repository.
congressional_voting_records = fetch_ucirepo(id=105)

# Feature table: one column per vote. Missing votes are replaced by the
# literal "?" so that every cell holds one of the tokens 'y', 'n' or '?'.
X: pd.DataFrame = congressional_voting_records.data.features.fillna("?")

# Target table with the party label of every row.
y: pd.DataFrame = congressional_voting_records.data.targets


In [40]:
X

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,n,n,y,y,y,y,n,n,y,y,n,y,y,y,n,y
431,n,n,y,n,n,n,y,y,y,y,n,n,n,n,n,y
432,n,?,n,y,y,y,n,n,n,n,y,y,y,y,n,y
433,n,n,n,y,y,y,?,?,?,?,n,y,y,y,n,y


In [41]:
y

Unnamed: 0,Class
0,republican
1,republican
2,democrat
3,democrat
4,democrat
...,...
430,republican
431,democrat
432,republican
433,republican


In [42]:
import math
from typing import Dict, List

In [43]:
class NaiveBayes:
    """Two-class Naive Bayes classifier over categorical features.

    The model distinguishes the classes ``'democrat'`` and ``'republican'``.
    Every feature is treated as taking one of three values: ``'y'``, ``'n'``
    or ``'?'``; any other value (including NaN) is counted as ``'?'``, both
    while training and while predicting.

    Priors and conditional probabilities use additive smoothing with
    strength ``LAMBDA``, so no probability is ever exactly zero.
    """

    A: int = 3 # count of different values a feature could take
    LAMBDA: float = 1.0 # additive smoothing strength
    K: int = 2 # the number of classes

    def __init__(self, X: pd.DataFrame,
                       y: pd.DataFrame) -> None:
        """Count feature values per class from the training data.

        Args:
            X: Feature table, one row per sample and one column per feature.
            y: Label table with a ``"Class"`` column. Rows are matched to
               ``X`` by index label. Rows whose label is neither
               ``'democrat'`` nor ``'republican'`` are ignored.
        """
        n_features: int = len(X.columns)

        # <class>_distribution[value][feature_idx] is the number of training
        # rows of that class whose feature `feature_idx` equals `value`.
        self.dem_distribution: Dict[str, List[int]] = {key: [0] * n_features
                                                       for key in ['y', 'n', '?']}
        self.rep_distribution: Dict[str, List[int]] = {key: [0] * n_features
                                                       for key in ['y', 'n', '?']}
        # Number of training rows seen for each class.
        self.dem_cnt: int = 0
        self.rep_cnt: int = 0

        for row_num, row in X.iterrows():
            label = y.loc[row_num, "Class"]
            if label == "republican":
                counts = self.rep_distribution
                self.rep_cnt += 1
            elif label == "democrat":
                counts = self.dem_distribution
                self.dem_cnt += 1
            else:
                continue
            for idx, col in enumerate(X.columns):
                counts[self._normalize(row[col])][idx] += 1

    @staticmethod
    def _normalize(value) -> str:
        """Map a raw feature value onto one of the tokens 'y', 'n' or '?'."""
        return value if value in ('y', 'n') else '?'

    def prior_prob(self, class_type: str) -> float:
        """
        P(class == class_type), smoothed:

            (count(class) + LAMBDA) / (N + K * LAMBDA)

        The smoothing term appears in the numerator as well as in the
        denominator, so the two priors sum to one and a class without any
        training rows still gets a non-zero prior.

        Any ``class_type`` other than ``'democrat'`` is treated as
        ``'republican'``.
        """
        denom: float = self.dem_cnt + self.rep_cnt + NaiveBayes.K * NaiveBayes.LAMBDA
        cnt: int = self.dem_cnt if class_type == 'democrat' else self.rep_cnt
        return (cnt + NaiveBayes.LAMBDA) / denom

    def get_prob(self,
                 feature_idx: int,
                 feature_val: str,
                 class_val: str) -> float:
        """
        P(feature@feature_idx == feature_val | class == class_val), smoothed:

            (count(feature_val, class) + LAMBDA) / (count(class) + A * LAMBDA)

        ``feature_val`` is normalized exactly as during training, so values
        other than 'y' and 'n' are looked up as '?'.

        Raises:
            ValueError: if ``class_val`` is not 'democrat' or 'republican'.
        """
        if class_val == 'democrat':
            counts, class_cnt = self.dem_distribution, self.dem_cnt
        elif class_val == 'republican':
            counts, class_cnt = self.rep_distribution, self.rep_cnt
        else:
            raise ValueError(f"unknown class: {class_val!r}")

        value = self._normalize(feature_val)
        return (counts[value][feature_idx] + NaiveBayes.LAMBDA) / \
               (class_cnt + NaiveBayes.A * NaiveBayes.LAMBDA)

    def _log_score(self, X: pd.Series, class_val: str) -> float:
        """Log of prior times product of per-feature conditionals for one class."""
        return sum(math.log(self.get_prob(f_id, f_val, class_val))
                   for f_id, f_val in enumerate(X)) + math.log(self.prior_prob(class_val))

    def predict(self, X: pd.Series) -> str:
        """Return the more probable class for one sample.

        Args:
            X: Feature values of a single sample, in the same column order as
               the training table.

        Returns:
            ``'democrat'`` when its log-score is strictly greater, otherwise
            ``'republican'`` (ties go to ``'republican'``).
        """
        prob_dem: float = self._log_score(X, "democrat")
        prob_rep: float = self._log_score(X, "republican")

        return "democrat" if prob_dem > prob_rep else "republican"


## Measuring Accuracy

In [44]:
# Fit on the full dataset and score on those same rows, so this is the
# training accuracy rather than a held-out estimate.
temp: NaiveBayes = NaiveBayes(X, y)
mistakes: int = sum(
    temp.predict(features) != label
    for (_, features), label in zip(X.iterrows(), y["Class"])
)

print(f"Accuracy: {1 - mistakes / len(X)}")

Accuracy: 0.903448275862069


* _**10-fold cross validation**_

In [53]:
import time

# shuffle data
def cross_val_accuracy(X, y, k, random_state=None, model_factory=None) -> float:
    """Estimate accuracy with shuffled k-fold cross-validation.

    The rows of ``X`` and ``y`` are shuffled with one shared permutation and
    split into exactly ``k`` contiguous folds whose sizes differ by at most
    one. Each fold serves once as the test set while a model is trained on
    the remaining rows.

    Args:
        X: Feature DataFrame, one row per sample.
        y: Label DataFrame with a ``"Class"`` column, row-aligned with ``X``
           by position.
        k: Number of folds; must satisfy ``2 <= k <= len(X)``.
        random_state: Optional seed forwarded to pandas' ``sample`` for a
            reproducible shuffle. ``None`` (default) draws a fresh shuffle on
            every call.
        model_factory: Callable ``(X_train, y_train) -> model`` where the
            model has a ``predict(row)`` method. Defaults to the notebook's
            ``NaiveBayes`` class.

    Returns:
        The mean of the ``k`` per-fold accuracies.

    Raises:
        ValueError: if ``X`` and ``y`` differ in length or ``k`` is out of range.
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(f"X has {n_samples} rows but y has {len(y)}")
    if not 2 <= k <= n_samples:
        raise ValueError(f"k must be between 2 and len(X)={n_samples}, got {k}")
    if model_factory is None:
        model_factory = NaiveBayes

    # One permutation of row positions, applied to both tables, keeps every
    # sample paired with its label. A wall-clock seed is avoided because
    # calls made within the same second would shuffle identically.
    order = pd.Series(range(n_samples)).sample(frac=1, random_state=random_state).to_numpy()
    X = X.iloc[order].reset_index(drop=True)
    y = y.iloc[order].reset_index(drop=True)

    # k + 1 boundaries -> exactly k folds. Stepping by len(X) // k instead
    # would add an extra short fold whenever len(X) is not a multiple of k.
    bounds = [fold * n_samples // k for fold in range(k + 1)]

    acc = []
    for begin, end in zip(bounds, bounds[1:]):
        X_train = pd.concat([X.iloc[:begin], X.iloc[end:]])
        y_train = pd.concat([y.iloc[:begin], y.iloc[end:]])

        X_test = X.iloc[begin:end]
        y_test = y.iloc[begin:end]

        model = model_factory(X_train, y_train)
        mistakes = sum(
            model.predict(row) != actual
            for (_, row), actual in zip(X_test.iterrows(), y_test["Class"])
        )
        acc.append(1 - mistakes / len(X_test))

    return sum(acc) / len(acc)

# Run 10-fold cross-validation nine times and print the mean of the nine
# scores, rounded to two decimals.
accs = [cross_val_accuracy(X, y, k=10) for _ in range(9)]

print(f"{sum(accs)/len(accs):.2f}")

[0.8887949260042284, 0.8887949260042284, 0.9090909090909091, 0.9090909090909091, 0.9069767441860465, 0.9069767441860465, 0.9069767441860465, 0.8930232558139536, 0.8930232558139536]
0.90


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder

# Baseline for comparison: scikit-learn's CategoricalNB on ordinal-encoded votes.
encoder = OrdinalEncoder()
# Separate encoder for the labels, so `encoder` stays fitted on the features
# instead of being refitted (and overwritten) on `y`.
target_encoder = OrdinalEncoder()
nb_classifier = CategoricalNB()

X_encoded = pd.DataFrame(encoder.fit_transform(X), columns=X.columns)
y_encoded = pd.DataFrame(target_encoder.fit_transform(y), columns=y.columns)

# With an integer `cv`, cross_validate builds unshuffled folds, so repeating
# the call returns the same score every time. A shuffled StratifiedKFold with
# a different seed per repetition makes the nine runs distinct yet reproducible.
accs = [
    np.mean(
        cross_validate(
            nb_classifier,
            X_encoded,
            y_encoded.values.ravel(),
            cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=seed),
        )['test_score']
    )
    for seed in range(9)
]

print(f"{sum(accs) / len(accs):.2f}")

0.90
