# Naive Bayes Classifier

In [230]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

congressional_voting_records = fetch_ucirepo(id=105)

X: pd.core.frame.DataFrame = congressional_voting_records.data.features 
y: pd.core.frame.DataFrame = congressional_voting_records.data.targets 


In [231]:
X

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,n,y,n,y,y,y,n,n,n,y,,y,y,y,n,y
1,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,
2,,y,y,,y,y,n,n,n,n,y,n,y,y,n,n
3,n,y,y,n,,y,n,n,n,n,y,n,y,n,n,y
4,y,y,y,n,y,y,n,n,n,n,y,,y,y,y,y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,n,n,y,y,y,y,n,n,y,y,n,y,y,y,n,y
431,n,n,y,n,n,n,y,y,y,y,n,n,n,n,n,y
432,n,,n,y,y,y,n,n,n,n,y,y,y,y,n,y
433,n,n,n,y,y,y,,,,,n,y,y,y,n,y


In [232]:
y

Unnamed: 0,Class
0,republican
1,republican
2,democrat
3,democrat
4,democrat
...,...
430,republican
431,democrat
432,republican
433,republican


In [233]:
import math
from typing import Dict, List

In [241]:
class NaiveBayes:
    """Bernoulli 
       Naive Bayes Classifier"""
    
    A: int = 2 # count of different values a feature could take
    LAMBDA: float = 1
    K: int = 2 # the number of classes


    def __init__(self, X: pd.core.frame.DataFrame, 
                       y: pd.core.frame.DataFrame) -> None:

        self.dem_distribution = {key: [0 for i in range(len(X.columns))] 
                                 for key in ['y', 'n', '?']}
        self.rep_distribution = {key: [0 for i in range(len(X.columns))] 
                                 for key in ['y', 'n', '?']}        
        self.dem_cnt = 0
        self.rep_cnt = 0

        for row_num, row in X.iterrows():
            match y.loc[row_num, "Class"]:
                case "republican":
                    for idx, col in enumerate(X.columns):
                        val = row[col] if row[col] in ['y', 'n'] else "?"
                        self.rep_distribution[val][idx] += 1
                    self.rep_cnt += 1
                case "democrat":
                    for idx, col in enumerate(X.columns):
                        val = row[col] if row[col] in ['y', 'n'] else "?"
                        self.dem_distribution[val][idx] += 1
                    self.dem_cnt += 1

# X.groupby([X.columns[0]])[X.columns[0]].transform("count").head()
    def get_prob(self, 
                 feature_idx: int, 
                 feature_val: str, 
                 class_val: str) -> float:
        a_new: int = NaiveBayes.A + 1 if feature_val == "?" else NaiveBayes.A
        feature_val = "?" if feature_val not in ["y", "n"] else feature_val

        match class_val:
            case 'democrat':
                return (self.dem_distribution[feature_val][feature_idx] + NaiveBayes.LAMBDA) / \
                       (self.dem_cnt + a_new * NaiveBayes.LAMBDA) 
            case 'republican':
                return (self.rep_distribution[feature_val][feature_idx] + NaiveBayes.LAMBDA) / \
                       (self.rep_cnt + a_new * NaiveBayes.LAMBDA)
            

    def predict(self, X: pd.core.series.Series) -> str:
        prob_dem: float = sum(math.log(self.get_prob(f_id, f_val, "democrat")) 
                              for f_id, f_val in enumerate(X))
        prob_rep: float = sum(math.log(self.get_prob(f_id, f_val, "republican"))
                              for f_id, f_val in enumerate(X))
        
        return "democrat" if prob_dem > prob_rep else "republican"


## Measuring Accuracy

In [257]:
temp: NaiveBayes = NaiveBayes(X, y)
mistakes: int = 0
for i in range(0, len(X)):
    if temp.predict(X.iloc[i]) != y.iloc[i]["Class"]:
        mistakes += 1

print(f"Accuracy: {1 - mistakes / len(X)}")

Accuracy: 0.9103448275862069


* ***10-fold cross validation***

In [259]:
X = X.sample(frac=1, random_state=42).reset_index(drop=True)
y = y.sample(frac=1, random_state=42).reset_index(drop=True)

chunks = 10
chunk_size: int = len(X) // chunks

acc: List[float] = []
for begin in range(0, len(X), chunk_size):
    end = begin + chunk_size
   
    X_train = pd.concat([X.iloc[:begin], X.iloc[end:]])
    y_train = pd.concat([y.iloc[:begin], y.iloc[end:]])
    

    X_test = X.iloc[begin:end]
    y_test = y.iloc[begin:end]
   
    model: NaiveBayes = NaiveBayes(X_train, y_train)
    mistakes: int = 0

    for row_num, row in X_test.iterrows():
        y_val = y_test.loc[row_num, "Class"]
        pr_val = model.predict(row)
        if y_val != pr_val:
            mistakes += 1
    
    acc.append(1 - mistakes / len(X_test))

sum(acc) / len(acc)

0.9112050739957717