# Naive Bayes

## Questão 1

### Classificador

In [193]:
import csv
import math
import random
from collections import defaultdict

def try_int(not_int_dict):
    """Build a converter that parses integers and falls back to a lookup table.

    Args:
        not_int_dict: Mapping from non-numeric tokens (e.g. ``'5more'``) to the
            integer that should stand in for them.

    Returns:
        A one-argument function. It returns ``int(x)`` when that succeeds and
        ``not_int_dict[x]`` otherwise.

    Raises:
        KeyError: Raised by the returned function when ``x`` can neither be
            parsed as an integer nor be found in ``not_int_dict``.
    """

    def _try_int(x):
        try:
            return int(x)
        except (ValueError, TypeError):
            # Only conversion failures trigger the fallback; a bare ``except``
            # would also swallow KeyboardInterrupt and SystemExit.
            return not_int_dict[x]

    return _try_int

# One converter per CSV column, in column order. Each callable turns the raw
# string token of its column into an integer code and raises KeyError for a
# token it does not know.
preprocess_list = [
    {'vhigh': 4, 'high': 3, 'med': 2, 'low': 1}.__getitem__,
    {'vhigh': 4, 'high': 3, 'med': 2, 'low': 1}.__getitem__,
    try_int({'5more': 6}),  # plain integers, plus the '5more' token
    try_int({'more': 5}),   # plain integers, plus the 'more' token
    {'big': 3, 'med': 2, 'small': 1}.__getitem__,
    {'high': 3, 'med': 2, 'low': 1}.__getitem__,
    {'vgood': 4, 'good': 3, 'acc': 2, 'unacc': 1}.__getitem__,
]

def preprocess_row(row):
    """Encode one raw CSV row with the positional converters in ``preprocess_list``.

    The value at index ``i`` is passed through ``preprocess_list[i]``; the
    converted values are returned as a new list in the same order.
    """
    encoded = []
    for position, raw_value in enumerate(row):
        convert = preprocess_list[position]
        encoded.append(convert(raw_value))
    return encoded

def loadCsv(filename):
    """Read a CSV file and return its rows encoded by ``preprocess_row``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one encoded row (itself a list) per non-empty CSV record.
    """
    # ``with`` closes the handle as soon as reading is done (it was previously
    # left open); ``newline=""`` lets the csv module handle line endings itself.
    with open(filename, "r", newline="") as handle:
        # csv.reader yields [] for blank lines; skip them so no empty vector
        # reaches the code that indexes the class label with ``row[-1]``.
        return [preprocess_row(row) for row in csv.reader(handle) if row]

def splitDataset(dataset, splitRatio):
    """Randomly partition ``dataset`` into a training part and a remainder.

    ``int(len(dataset) * splitRatio)`` rows are drawn one at a time, without
    replacement, using ``random.randrange``. The input list is not modified.

    Returns:
        ``[trainSet, remainder]``: two lists that together hold every row of
        ``dataset`` exactly once.
    """
    wanted = int(len(dataset) * splitRatio)
    remaining = list(dataset)
    drawn = []
    for _ in range(wanted):
        pick = random.randrange(len(remaining))
        drawn.append(remaining.pop(pick))
    return [drawn, remaining]

def separateByClass(dataset):
    """Group rows by their class label, which is the last element of each row.

    Returns:
        A dict mapping each label to the list of rows that carry it, in the
        order the rows appear in ``dataset``.
    """
    by_label = {}
    for row in dataset:
        by_label.setdefault(row[-1], []).append(row)
    return by_label

def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count
 
def stdev(numbers):
    """Return the sample standard deviation (n - 1 denominator) of ``numbers``.

    A single observation has no spread to estimate, so 0.0 is returned for it
    instead of dividing by zero; ``calculateProbability`` has a dedicated
    branch for a zero deviation.

    Args:
        numbers: Sized collection of numeric values.

    Returns:
        The standard deviation as a float (0.0 for a single value).

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    count = len(numbers)
    # Same formula as ``mean``; an empty input still raises here.
    avg = sum(numbers) / float(count)
    if count < 2:
        return 0.0
    variance = sum((x - avg) ** 2 for x in numbers) / float(count - 1)
    return math.sqrt(variance)

def summarize(dataset):
    """Return ``(mean, stdev)`` for every column of ``dataset`` except the last.

    Statistics are computed column by column over all rows; the entry for the
    final column (the class label) is then discarded.
    """
    summaries = []
    for attribute in zip(*dataset):
        summaries.append((mean(attribute), stdev(attribute)))
    # The last column holds the label, not a feature.
    del summaries[-1]
    return summaries

def summarizeByClass(dataset):
    """Compute the per-feature ``(mean, stdev)`` summaries for each class.

    Returns:
        A dict mapping every class label found in ``dataset`` to the result of
        ``summarize`` applied to the rows of that class.
    """
    return {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }

def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian density with the given mean and deviation at ``x``.

    When ``stdev`` is 0 the density is undefined, so fixed scores are used
    instead: 1.0 if ``x`` equals ``mean``, otherwise 0.1.
    """
    if stdev == 0:
        if x == mean:
            return 1.
        return .1

    variance = stdev ** 2
    squared_distance = (x - mean) ** 2
    exponent = math.exp(-squared_distance / (2 * variance))
    normaliser = ((2 * math.pi) * variance) ** .5
    return (1 / normaliser) * exponent

def calculateClassProbabilities(summaries, inputVector):
    """Score ``inputVector`` against every class in ``summaries``.

    For each class the score is the product, over that class's feature
    summaries, of ``calculateProbability(inputVector[i], mean_i, stdev_i)``.

    Returns:
        A dict mapping class label to its score.
    """
    scores = {}
    for label, feature_stats in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(feature_stats):
            likelihood *= calculateProbability(inputVector[position], mu, sigma)
        scores[label] = likelihood
    return scores

def predict(summaries, inputVector):
    """Return the highest-scoring class for ``inputVector`` and its score.

    Returns:
        ``(label, score)``. On a tie the class met first while iterating the
        scores wins; with no classes at all the result is ``(None, -1)``.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    winner, winner_score = None, -1
    for label, score in scores.items():
        nothing_chosen_yet = winner is None
        if nothing_chosen_yet or score > winner_score:
            winner, winner_score = label, score
    return winner, winner_score

def getPredictions(summaries, testSet):
    """Run ``predict`` on every row of ``testSet``.

    Returns:
        ``(predictions, probs)``: two lists aligned with ``testSet`` holding
        the predicted label and the winning score of each row.
    """
    outcomes = [predict(summaries, row) for row in testSet]
    predictions = [label for label, _ in outcomes]
    probs = [score for _, score in outcomes]
    return predictions, probs

def getAccuracy(testSet, predictions):
    """Return the percentage (0-100) of rows whose label was predicted correctly.

    The true label of a row is its last element; ``predictions[i]`` is
    compared with the label of ``testSet[i]``.

    Raises:
        ZeroDivisionError: If ``testSet`` is empty.
    """
    hits = sum(
        1 for position, row in enumerate(testSet)
        if row[-1] == predictions[position]
    )
    return (hits / float(len(testSet))) * 100.0

def train_and_test(filepath, runs=100, splitRatio=.67):
    """Repeatedly train and score the classifier on random splits of one file.

    Args:
        filepath: CSV file handed to ``loadCsv``.
        runs: Number of independent random splits to evaluate. Defaults to
            100, the value that used to be hard-coded.
        splitRatio: Fraction of the rows placed in the training part of each
            split. Defaults to .67, the value that used to be hard-coded.

    Returns:
        A list with one accuracy percentage (0-100) per run.
    """
    # The file is parsed once; only the random split changes between runs.
    ds = loadCsv(filepath)

    accs = []
    for _ in range(runs):
        train, test = splitDataset(ds, splitRatio=splitRatio)
        summaries = summarizeByClass(train)
        predictions, _probs = getPredictions(summaries, test)
        accs.append(getAccuracy(test, predictions))

    return accs

# pandas is imported here because this cell uses ``pd`` before the import cell
# further down; without it a top-to-bottom run fails with NameError.
import pandas as pd

# Accuracy (in percent) of the hand-written classifier over 100 random splits.
accs = train_and_test("carData.csv")

# One-row table with the summary statistics of those accuracies.
pd.Series(accs).describe().to_frame().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,100.0,77.073555,1.723154,71.978984,76.007005,77.145359,78.283713,81.260946


## Questão 2

In [173]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [194]:
# Load the encoded rows and wrap them in a DataFrame whose columns are
# labelled by position, 0 through 6.
ds = loadCsv("carData.csv")
df = pd.DataFrame(ds, columns=range(7))

# Columns 0-5 are the model inputs; column 6 is the target.
X = df[[0, 1, 2, 3, 4, 5]]
y = df[6]

In [195]:
# Accuracy of scikit-learn's GaussianNB over 100 independent random splits.
accs = []

for x in range(100):

    # Draw a fresh random split with 33% of the rows held out for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33)

    # A new, unfitted model for every repetition.
    nb = GaussianNB()
    nb.fit(X_train, y_train)

    predicted = nb.predict(X_test)
    acc = accuracy_score(y_test, predicted)
    accs.append(acc)

In [196]:
# One-row table with the summary statistics of the collected accuracies.
pd.Series(accs).describe().to_frame().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,100.0,0.76718,0.014778,0.732049,0.758319,0.767075,0.777583,0.798599
