In [1]:
from csv import reader
import matplotlib.pyplot as plt
import random
import math

# Naive Bayes Classifier
## Ivan Makaveev, 2MI0600203

In [2]:
file_path = "./house-votes-84.data"
random.seed(1337)

laplace_smoothing = 1
use_gauss = True
preprocess = False

In [3]:
def load_dataset(path):
    dataset = []
    with open(path, 'r') as file:
        csv = reader(file)
        for row in csv:
            if row:
                dataset.append(row)
    return dataset

In [4]:
def get_mode(dataset, label, attr_idx):
    data = [record[attr_idx] for record in filter(lambda rec: rec[0] == label , dataset)]
    return max(data, key = data.count)

def fill_missing(dataset, missing = "?"):
    return [[attribute if attribute != missing else get_mode(dataset, record[0] , idx) for idx, attribute in enumerate(record)] for record in dataset]

In [5]:
def transform_continous(dataset, value_map):
    return [[value_map[attribute] if attribute in value_map.keys() else attribute for attribute in record] for record in dataset]

In [6]:
dataset = load_dataset(file_path)

if(use_gauss):
    dataset = transform_continous(dataset, {"y": 2, "?": 1, "n": 0})
elif(preprocess):
    dataset = fill_missing(dataset)

In [7]:
class NaiveBayesClassifier:
    def __init__(self, smoothing):
        if smoothing < 0:
            raise ValueError("Smoothing factor must be >= 0")
            
        self.classes = set()
        self.class_prob = dict()
        self.feature_conditional_prob = dict()
        self.smoothing = smoothing

    def extract_classes(self, data_train):
        return set([record[0] for record in data_train])

    def extract_features(self, data_train):
        return [set([record[featureId] for record in data_train]) for featureId in range(1, len(data_train[0]))]
    
    def train(self, data_train):
        self.classes = self.extract_classes(data_train)
        self.class_prob = dict()
        self.feature_conditional_prob = dict()
        
        data_size = len(data_train)
        cols_count = len(data_train[0]) - 1
        feature_values = self.extract_features(data_train)
        
        for class_name in self.classes:
            class_records_count = len([record for record in data_train if record[0] == class_name])
            self.class_prob[class_name] = (class_records_count + self.smoothing) / (data_size + self.smoothing * len(self.classes))
            
            self.feature_conditional_prob[class_name] = list()
            for feature_id in range(0, cols_count):
                value_conditional_prob = dict()
                for value in feature_values[feature_id]:
                    value_records_count = len([record for record in data_train if record[0] == class_name and record[feature_id + 1] == value])
                    value_conditional_prob[value] =  (value_records_count + self.smoothing) / (class_records_count + self.smoothing * len(feature_values[feature_id]))
                self.feature_conditional_prob[class_name].append(value_conditional_prob)
                
    def predict(self, instance):
        instance_probs = dict()
        features = len(instance)
        
        probs = dict()
        for class_name in self.classes:
            class_conditional_prob = math.log(self.class_prob[class_name])
            for feature_id in range(0, features):
                class_conditional_prob += math.log(self.feature_conditional_prob[class_name][feature_id][instance[feature_id]])
            probs[class_name] = class_conditional_prob
        
        return max(probs, key=probs.get)

In [8]:
def evaluate_mean(value_list):
    return sum(value_list) / len(value_list)

def evaluate_sd(value_list):
    mean = evaluate_mean(value_list)
    sd = 0
    for val in value_list:
        sd += (val - mean) * (val - mean)
    sd /= len(value_list)
    
    return math.sqrt(sd)

def gauss_probability(value, mean, sd):
    return (1 / math.sqrt(2 * sd * sd * math.pi)) * math.pow(math.e, -(value - mean)*(value - mean)/(2*sd*sd))

def log_gauss_probability(value, mean, sd):
    return -math.log(sd) -0.5 * math.log(2 * math.pi) - (value - mean) * (value - mean) / (2 * sd * sd)

class NaiveBayesClassifier_Gaussian:
    def __init__(self, smoothing):
        if smoothing < 0:
            raise ValueError("Smoothing factor must be >= 0")
            
        self.classes = set()
        self.class_prob = dict()
        self.feature_conditional_prob = dict()
        self.smoothing = smoothing

    def extract_classes(self, data_train):
        return set([record[0] for record in data_train])
    
    def train(self, data_train):
        self.classes = self.extract_classes(data_train)
        self.class_prob = dict()
        self.feature_conditional_prob = dict()
        
        data_size = len(data_train)
        cols_count = len(data_train[0]) - 1
        
        for class_name in self.classes:
            class_records_count = len([record for record in data_train if record[0] == class_name])
            self.class_prob[class_name] = (class_records_count + self.smoothing) / (data_size + self.smoothing * len(self.classes))
            
            self.feature_conditional_prob[class_name] = list()
            for feature_id in range(0, cols_count):
                feature_values = [record[feature_id + 1] for record in data_train if record[0] == class_name]
                feature_mean = evaluate_mean(feature_values)
                feature_sd = evaluate_sd(feature_values)
                self.feature_conditional_prob[class_name].append((feature_mean, max(feature_sd, 1e-9)))
                
    def predict(self, instance):
        instance_probs = dict()
        features = len(instance)
        
        probs = dict()
        for class_name in self.classes:
            class_conditional_prob = math.log(self.class_prob[class_name])
            for feature_id in range(0, features):
                feature_mean, feature_sd = self.feature_conditional_prob[class_name][feature_id]
                class_conditional_prob += log_gauss_probability(instance[feature_id], feature_mean, feature_sd)
            probs[class_name] = class_conditional_prob
        
        return max(probs, key=probs.get)

In [9]:
def train_test_split(dataset, test_percentage):
    if(test_percentage > 100 or test_percentage < 0):
        raise ValueError("Test percentage must be between 0 and 100")
    
    random.shuffle(dataset)
    classes = set([record[0] for record in dataset])
    group_dataset = [[record for record in dataset if record[0] == class_name] for class_name in classes]
    
    train = list()
    test = list()
    for group in group_dataset:
        test_count = len(group) // test_percentage
        test.extend(group[0:test_count])
        train.extend(group[test_count:])
        
    return train, test

In [10]:
def evalute_accuracy(model, test_instances, test_labels):
    correct = 0
    for idx, instance in enumerate(test_instances):
        correct += model.predict(instance) == test_labels[idx]
    
    return correct * 100 / len(test_instances)

In [11]:
def evalute_test_performance(model, data_test):
    return evalute_accuracy(model, [record[1:] for record in data_test], [record[0] for record in data_test])

In [12]:
def evalute_train_performance(model, data_train, folds_count = 10):
    folds = list()
    fold_size = len(data_train) // folds_count
    for idx in range(0, folds_count - 1):
        fold_test = data_train[fold_size * idx : fold_size * (idx + 1)]
        fold_train = data_train[0:fold_size * idx] + data_train[fold_size * (idx + 1):]
        folds.append((fold_train, fold_test))
    
    last_fold = (data_train[0:fold_size * (folds_count - 1)], data_train[fold_size * (folds_count-1):])
    folds.append(last_fold)
    
    accuracies = list()
    for fold in folds:
        model.train(fold[0])
        accuracies.append(evalute_test_performance(model, fold[1]))
    
    model.train(data_train)
    return accuracies, evalute_accuracy(model, [record[1:] for record in data_train], [record[0] for record in data_train])

In [13]:
train, test = train_test_split(dataset, 20)

In [14]:
model = NaiveBayesClassifier(1)
if(use_gauss):
    model = NaiveBayesClassifier_Gaussian(1)

cross_val_accuracies, train_result = evalute_train_performance(model, train)

print(f"Train set accuracy: {train_result:.2f}%")
print()

print("10-fold Cross Validation:")
for idx, accuracy in enumerate(cross_val_accuracies):
    print(f"Fold {idx + 1}: {accuracy:.2f}%")

print(f"Mean: {evaluate_mean(cross_val_accuracies):.2f}%")
print(f"SD: {evaluate_sd(cross_val_accuracies):.2f}")

Train set accuracy: 94.44%

10-fold Cross Validation:
Fold 1: 92.68%
Fold 2: 97.56%
Fold 3: 92.68%
Fold 4: 92.68%
Fold 5: 95.12%
Fold 6: 97.56%
Fold 7: 92.68%
Fold 8: 90.24%
Fold 9: 90.24%
Fold 10: 97.78%
Mean: 93.92%
SD: 2.76


In [15]:
test_result = evalute_test_performance(model, test)
print(f"Test set accuracy: {test_result:.2f}%")

Test set accuracy: 95.24%
