In [45]:
import csv
 
def loadCsv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [x for x in dataset[i]]
    return dataset

In [46]:
def convertCatToNum(dataset):
    for row in dataset:
        if (row[0] == 'vhigh'): row[0] = 1;
        elif (row[0] == 'high'): row[0] = 2;
        elif (row[0] == 'med'): row[0] = 3;
        elif (row[0] == 'low'): row[0] = 4;
        
        if (row[1] == 'vhigh'): row[1] = 1;
        elif (row[1] == 'high'): row[1] = 2;
        elif (row[1] == 'med'): row[1] = 3;
        elif (row[1] == 'low'): row[1] = 4;
        
        if (row[2] == '2'): row[2] = 2;
        elif (row[2] == '3'): row[2] = 3;
        elif (row[2] == '4'): row[2] = 4;
        elif (row[2] == '5'): row[2] = 5;
        elif (row[2] == 'more' or row[2] == '5more'): row[2] = 6;
        
        if (row[3] == '2'): row[3] = 2;
        elif (row[3] == '4'): row[3] = 4;
        elif (row[3] == 'more'): row[3] = 5;
        
        if (row[4] == 'small'): row[4] = 1;
        elif (row[4] == 'med'): row[4] = 2;
        elif (row[4] == 'big'): row[4] = 3;
        
        if (row[5] == 'low'): row[5] = 1;
        elif (row[5] == 'med'): row[5] = 2;
        elif (row[5] == 'high'): row[5] = 3;
        
        if (row[6] == 'unacc'): row[6] = 1;
        elif (row[6] == 'acc'): row[6] = 2;
        elif (row[6] == 'good'): row[6] = 3;
        elif (row[6] == 'vgood'): row[6] = 4;

In [47]:
import random

def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

In [48]:
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

In [49]:
import math

def mean(numbers):
    return sum(numbers)/float(len(numbers))
 
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

In [50]:
def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

In [51]:
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

In [52]:
def calculateProbability(x, mean, stdev):
    
    if stdev == 0:
        return 1. if x == mean else .1
    
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * math.pow(stdev, 2))) * exponent

In [53]:
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

In [54]:
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [55]:
carDS = loadCsv("carData.csv")

In [56]:
convertCatToNum(carDS)

In [57]:
splittedCarDS = splitDataset(carDS, 0.70)

In [58]:
trainSet = splittedCarDS[0]
testSet = splittedCarDS[1]

In [59]:
summary = summarizeByClass(trainSet)

In [60]:
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

In [62]:
def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [94]:
accs = []

for i in range(100):
    predictions = getPredictions(summary, testSet)
    accuracy = getAccuracy(testSet, predictions)
    accs.append(accuracy)

In [96]:
maxAccuracy = max(accs)

print('Em 100 iterações, a melhor acurácia do NaiveBayes implementado foi: {0}'.
      format(maxAccuracy))

Em 100 iterações, a melhor acurácia do NaiveBayes implementado foi: 70.32755298651252


In [97]:
from sklearn.naive_bayes import GaussianNB

In [98]:
nb = GaussianNB()

In [99]:
X_train = []
X_test = []
y_train = []
y_test = []

for i in range(len(trainSet)):
    newList = list(trainSet[i])
    del newList[-1]
    X_train.append(newList)

for i in range(len(trainSet)):
    newList = list(trainSet[i])
    y_train.append(newList[-1])
    
for i in range(len(testSet)):
    newList = list(testSet[i])
    del newList[-1]
    X_test.append(newList)
    
for i in range(len(testSet)):
    newList = list(testSet[i])
    y_test.append(newList[-1])

In [100]:
nb.fit(X_train, y_train)

GaussianNB(priors=None)

In [101]:
accs = []

for i in range(100):
    predictions = nb.predict(X_test)
    accuracy = getAccuracy(testSet, predictions)
    accs.append(accuracy)

In [102]:
maxAccuracy = max(accs)

print('Em 100 iterações, a melhor acurácia do NaiveBayes do scikit-learn foi: {0}'.
      format(maxAccuracy))

Em 100 iterações, a melhor acurácia do NaiveBayes do scikit-learn foi: 76.878612716763
