# In [15]:
# -*- coding: utf-8 -*-
"""
Sample code for bayesian approach
"""

import csv
import random
import math

filename = 'sampleDataset.csv'

# Read the whole CSV file into memory and convert every value to float.
# The `with` statement guarantees the file is closed once reading is done, and
# newline="" is what the csv module documentation recommends for files handed
# to csv.reader.
# Blank lines in the file are skipped: csv.reader yields them as empty lists,
# which would otherwise end up as rows with no label in the dataset.
with open(filename, "r", newline="") as csv_file:
    dataset = [[float(x) for x in row] for row in csv.reader(csv_file) if row]
# The comprehension above is equivalent to reading all rows into a list and
# then converting each dataset[i][j] to float with two nested "for" loops.

# Prepare the training and testing datasets by drawing rows at random,
# without replacement, until the training set holds its share of the data.
splitRatio = 0.67 # 2/3 for training and 1/3 for testing
trainSize = int(len(dataset) * splitRatio)
trainSet = []
copy = list(dataset) # work on a copy of the list so `dataset` keeps all of its rows
for _ in range(trainSize):
    # pick a random position among the rows that have not been drawn yet
    index = random.randrange(len(copy))
    # pop() removes the row at that position; append() adds it to the training set
    picked = copy.pop(index)
    trainSet.append(picked)
#    print(index) # un-comment (inside the loop) to see the indexes selected
testSet = copy # whatever was not drawn becomes the testing set
print('Split %d rows into train=%d and test=%d rows' % (len(dataset), len(trainSet), len(testSet)))
    
# prepare model
#    group the training rows by class
# `separated` maps each label to the list of training rows carrying that label,
# e.g. {1.0: [[6.0, 119.0, 50.0, 22.0, 176.0, 27.1, 1.318, 33.0, 1.0], ...], 0.0: [...]}
separated = {}
for vector in trainSet:
    label = vector[-1] # the last element of a row is its label
    # setdefault() creates an empty list the first time a label is seen
    separated.setdefault(label, []).append(vector)

# mean and std summarized by class
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    ``numbers`` must support both ``sum()`` and ``len()``; an empty
    sequence raises ZeroDivisionError.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count
 
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by
    ``len(numbers) - 1``, so a single-element input raises
    ZeroDivisionError.
    """
    avg = mean(numbers)
    squared_deviations = [pow(value - avg, 2) for value in numbers]
    variance = sum(squared_deviations) / float(len(numbers) - 1)
    return math.sqrt(variance)

# Learn the distribution parameters from the training set: for every class,
# one (mean, stdev) pair per attribute/feature.
summaries = {}
for classValue, instances in separated.items():
    # zip(*instances) transposes the rows of this class into columns, so each
    # `column` holds every value of one attribute. Example of zip():
    #     >>> x = [1, 2, 3]
    #     >>> y = [4, 5, 6]
    #     >>> list(zip(x, y))
    #     [(1, 4), (2, 5), (3, 6)]
    #     >>> x2, y2 = zip(*list(zip(x, y)))
    #     >>> x == list(x2) and y == list(y2)
    #     True
    columns = zip(*instances)
    stats = [(mean(column), stdev(column)) for column in columns]
    stats.pop() # the last column is the label, not a feature: drop its summary
    summaries[classValue] = stats
# print(summaries) # to see what is stored

# Prior probability of each label, P(class), estimated as the fraction of the
# training rows that carry that label (e.g., P(0) and P(1)).
train_count = len(trainSet)
probability_label_0 = len(separated[0]) / train_count
probability_label_1 = len(separated[1]) / train_count
# indexed by the integer label: probability_label[0] is P(0), probability_label[1] is P(1)
probability_label = [probability_label_0, probability_label_1]

# test model
def calculateProbability(x, mean, stdev):
    """Return the Gaussian density of ``x`` for the given ``mean`` and ``stdev``.

    This is the value used as P(x | class) for a single feature. A ``stdev``
    of zero raises ZeroDivisionError.
    """
    squared_deviation = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stdev, 2)
    exponent = math.exp(-(squared_deviation / twice_variance))
    normaliser = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normaliser * exponent
 
# Predict the label of every row in testSet with the trained model.
# testSet = trainSet # un-comment to switch the datasets when calculating the accuracy for trainSet
predictions = []
for row in testSet: # each row is one datapoint
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        # start from the prior P(class) ...
        score = 1 * probability_label[int(classValue)]
        # ... and multiply by P(x_i | class) for every feature:
        # P(class) * P(x1 | class) * P(x2 | class) ...
        # The parameters are unpacked into `mu`/`sigma` rather than
        # `mean`/`stdev` so that this loop does not overwrite the module-level
        # functions of the same names.
        for i, (mu, sigma) in enumerate(classSummaries): # one (mean, stdev) pair per feature
            x = row[i] # value of the i-th feature of this datapoint
            score *= calculateProbability(x, mu, sigma)
        probabilities[classValue] = score
    # predict the label: Decision Rule (a tie goes to label 1)
    if probabilities[0] > probabilities[1]:
        predictions.append(0)
    else:
        predictions.append(1)
 
# Accuracy: percentage of test rows whose true label (the last element of the
# row) equals the label predicted for that row.
correct = sum(1 for i, row in enumerate(testSet) if row[-1] == predictions[i])
accuracy = 100.0 * (correct / float(len(testSet)))
print('Accuracy for testing dataset: %f' % accuracy)


""" Question: What are the labels you predict using the trained Bayesian model for the new datasets below?
[10, 139, 80, 0, 0, 27.1, 1.441, 57]
[10, 115, 70, 10, 100, 40.1, 0.421, 30]
[5, 196, 65, 30, 55, 37.1, 0.247, 41]
[7, 97, 75, 31, 213, 24.5, 0.845, 37]
[6, 109, 76, 0, 52, 43.2, 2.041, 33]
"""
#########################################################################

# New, unlabelled datapoints (features only) to classify with the trained model.
New_prediction = [
    [10, 139, 80, 0, 0, 27.1, 1.441, 57],
    [10, 115, 70, 10, 100, 40.1, 0.421, 30],
    [5, 196, 65, 30, 55, 37.1, 0.247, 41],
    [7, 97, 75, 31, 213, 24.5, 0.845, 37],
    [6, 109, 76, 0, 52, 43.2, 2.041, 33],
]
predictions2 = []
for row in New_prediction:
    probabilities2 = {}
    for classValue, classSummaries in summaries.items():
        # prior P(class) multiplied by P(x_i | class) for every feature
        score = 1 * probability_label[int(classValue)]
        # The parameters are unpacked into `mu`/`sigma` rather than
        # `mean`/`stdev` so that this loop does not overwrite the module-level
        # functions of the same names.
        for i, (mu, sigma) in enumerate(classSummaries):
            x = row[i]
            score *= calculateProbability(x, mu, sigma)
        probabilities2[classValue] = score
    # Decision rule: pick the label with the larger score (a tie goes to label 1)
    if probabilities2[0] > probabilities2[1]:
        predictions2.append(0)
    else:
        predictions2.append(1)


print()
print("Test result for H.W dataset is", predictions2)

# Output:
# Split 768 rows into train=514 and test=254 rows
# Accuracy for testing dataset: 75.196850
#
# Test result for H.W dataset is [1, 1, 1, 0, 1]
