In [365]:
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
import copy

In [366]:
seed(1)

In [367]:
# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

In [368]:
# Convert string column to float
def str_column_to_float(input_dataset, column):
    dataset = copy.deepcopy(input_dataset)
    for row in dataset:
        row[column] = float(row[column].strip())

    return dataset

In [369]:
# Convert string column to float
def int_column_to_float(input_dataset, column):
    dataset = copy.deepcopy(input_dataset)
    for row in dataset:
        row[column] = float(row[column])

    return dataset

In [370]:
# Convert string column to integer
def str_column_to_int(input_dataset, column):
    dataset = copy.deepcopy(input_dataset)
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    
    for row in dataset:
        row[column] = lookup[row[column]]
    
    return dataset

In [371]:
# Split the dataset by class values, returns a dictionary
def separate_dataset_by_class(dataset, seperation_column):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[seperation_column]
        if class_value not in separated:
            separated[class_value] = list()
        
        separated[class_value].append(vector)
    return separated

In [372]:
# Calculate the mean of a list of numbers
def mean(numbers):
    return sum(numbers) / float(len(numbers))

In [373]:
# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x - avg)**2 for x in numbers]) / float(len(numbers) - 1)
    return sqrt(variance)

In [374]:
# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset, target_column = -1):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[target_column])
    return summaries

In [375]:
# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset, target_column = -1):
    separated = separate_dataset_by_class(dataset, target_column)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
        
    return summaries

In [376]:
# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

In [377]:
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2] / float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

In [378]:
# Predict the class for a given row
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    
    return best_label

In [379]:
# Naive Bayes Algorithm
def naive_bayes(train, test, target_column):
    summarize = summarize_by_class(train, target_column)
    predictions = list()
    for row in test:
        output = predict(summarize, row)
        predictions.append(output)
    
    return(predictions)

In [380]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        
        dataset_split.append(fold)

    return dataset_split

In [381]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    
    return correct / float(len(actual)) * 100.0

In [382]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, target_column = -1, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[target_column] = None
            
        predicted = algorithm(train_set, test_set, target_column, *args)
        actual = [row[target_column] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
        
    return scores

In [383]:
# Load iris data
data = load_csv('data/iris.csv')
data

[['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'],
 ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
 ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa'],
 ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa'],
 ['4.6', '3.4', '1.4', '0.3', 'Iris-setosa'],
 ['5.0', '3.4', '1.5', '0.2', 'Iris-setosa'],
 ['4.4', '2.9', '1.4', '0.2', 'Iris-setosa'],
 ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa'],
 ['5.4', '3.7', '1.5', '0.2', 'Iris-setosa'],
 ['4.8', '3.4', '1.6', '0.2', 'Iris-setosa'],
 ['4.8', '3.0', '1.4', '0.1', 'Iris-setosa'],
 ['4.3', '3.0', '1.1', '0.1', 'Iris-setosa'],
 ['5.8', '4.0', '1.2', '0.2', 'Iris-setosa'],
 ['5.7', '4.4', '1.5', '0.4', 'Iris-setosa'],
 ['5.4', '3.9', '1.3', '0.4', 'Iris-setosa'],
 ['5.1', '3.5', '1.4', '0.3', 'Iris-setosa'],
 ['5.7', '3.8', '1.7', '0.3', 'Iris-setosa'],
 ['5.1', '3.8', '1.5', '0.3', 'Iris-setosa'],
 ['5.4', '3.4', '1.7', '0.2', 'Iris-setosa'],
 ['5.1', '3.7', '1.5', '0.4', 'Iri

In [384]:
# Prepeare data
attributes = str_column_to_int(data, len(data[0]) - 1)
for i in range(0, len(data[0]) - 1):
    attributes = str_column_to_float(attributes, i)

print(attributes[:5])

[[5.1, 3.5, 1.4, 0.2, 0], [4.9, 3.0, 1.4, 0.2, 0], [4.7, 3.2, 1.3, 0.2, 0], [4.6, 3.1, 1.5, 0.2, 0], [5.0, 3.6, 1.4, 0.2, 0]]


In [385]:
# Train model and evaluate the model
n_folds = 10
scores = evaluate_algorithm(attributes, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [86.66666666666667, 100.0, 93.33333333333333, 100.0, 100.0, 100.0, 100.0, 86.66666666666667, 86.66666666666667, 100.0]
Mean Accuracy: 95.333%


In [386]:
predicted = naive_bayes(attributes, [[5.0, 3.6, 1.4, 0.2]], target_column=-1)
print(predicted)

[0]


In [387]:
votes_data = load_csv('data/house-votes-84.data')
# Move the first column to the last
votes_data = [row[1:] + [row[0]] for row in votes_data]
votes_data

[['n',
  'y',
  'n',
  'y',
  'y',
  'y',
  'n',
  'n',
  'n',
  'y',
  '?',
  'y',
  'y',
  'y',
  'n',
  'y',
  'republican'],
 ['n',
  'y',
  'n',
  'y',
  'y',
  'y',
  'n',
  'n',
  'n',
  'n',
  'n',
  'y',
  'y',
  'y',
  'n',
  '?',
  'republican'],
 ['?',
  'y',
  'y',
  '?',
  'y',
  'y',
  'n',
  'n',
  'n',
  'n',
  'y',
  'n',
  'y',
  'y',
  'n',
  'n',
  'democrat'],
 ['n',
  'y',
  'y',
  'n',
  '?',
  'y',
  'n',
  'n',
  'n',
  'n',
  'y',
  'n',
  'y',
  'n',
  'n',
  'y',
  'democrat'],
 ['y',
  'y',
  'y',
  'n',
  'y',
  'y',
  'n',
  'n',
  'n',
  'n',
  'y',
  '?',
  'y',
  'y',
  'y',
  'y',
  'democrat'],
 ['n',
  'y',
  'y',
  'n',
  'y',
  'y',
  'n',
  'n',
  'n',
  'n',
  'n',
  'n',
  'y',
  'y',
  'y',
  'y',
  'democrat'],
 ['n',
  'y',
  'n',
  'y',
  'y',
  'y',
  'n',
  'n',
  'n',
  'n',
  'n',
  'n',
  '?',
  'y',
  'y',
  'y',
  'democrat'],
 ['n',
  'y',
  'n',
  'y',
  'y',
  'y',
  'n',
  'n',
  'n',
  'n',
  'n',
  'n',
  'y',
  'y',
  '?',
  

In [388]:
votes_data_scaled = copy.deepcopy(votes_data)
for i in range(len(votes_data[0])):
    votes_data_scaled = str_column_to_int(votes_data_scaled, i)

for i in range(len(votes_data[0]) - 1):
    votes_data_scaled = int_column_to_float(votes_data_scaled, i)
    
votes_data_scaled

[[0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  2.0,
  1.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1],
 [0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  2.0,
  1],
 [2.0,
  1.0,
  1.0,
  2.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0],
 [0.0,
  1.0,
  1.0,
  0.0,
  2.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0],
 [1.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  2.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0],
 [0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0],
 [0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  1.0,
  1.0,
  1.0,
  0],
 [0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  2.0,
  1.0,
  1],
 [0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,


In [389]:
scores = evaluate_algorithm(votes_data_scaled, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [97.67441860465115, 97.67441860465115, 90.69767441860465, 90.69767441860465, 95.34883720930233, 97.67441860465115, 93.02325581395348, 86.04651162790698, 93.02325581395348, 100.0]
Mean Accuracy: 94.186%
