# Naive Bayes - Trabalho

## QuestÃ£o 1

Implemente um classifacor Naive Bayes para o problema de predizer a qualidade de um carro. Para este fim, utilizaremos um conjunto de dados referente a qualidade de carros, disponÃ­vel no [UCI](https://archive.ics.uci.edu/ml/datasets/car+evaluation). Este dataset de carros possui as seguintes features e classe:

** Attributos **
1. buying: vhigh, high, med, low
2. maint: vhigh, high, med, low
3. doors: 2, 3, 4, 5, more
4. persons: 2, 4, more
5. lug_boot: small, med, big
6. safety: low, med, high

** Classes **
1. unacc, acc, good, vgood

## QuestÃ£o 2
Crie uma versÃ£o de sua implementaÃ§Ã£o usando as funÃ§Ãµes disponÃ­veis na biblioteca SciKitLearn para o Naive Bayes ([veja aqui](http://scikit-learn.org/stable/modules/naive_bayes.html)) 

## QuestÃ£o 3

Analise a acurÃ¡cia dos dois algoritmos e discuta a sua soluÃ§Ã£o.

In [9]:
import numpy as np
import pandas as pd
import random

In [2]:
def numColumns(dataset):
    return len(dataset.columns)

In [10]:
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    testSet = list(dataset)
    
    while len(trainSet) < trainSize:
        index = random.randrange(len(testSet))
        trainSet.append(testSet.pop(index))
        
    return [trainSet, testSet]

In [11]:
def separateFeaturesByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset.iloc[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

In [12]:
def tableFrequency(dataset):
    dic = {}
    attributes = dataset.columns
    for attribute in attributes:
        for value in dataset[attribute].unique():
            if value not in dic.keys():
                dic[value] = []
            dic[value].append((dataset.groupby(attribute)[attribute].count()[value], attribute))
    for value in dataset[dataset.columns[-1]].unique(): # Delete class (the last attribute)
        del dic[value]
        
    return dic

In [21]:
def frequencyByClass(dataset):
    separated = separateFeaturesByClass(dataset)
    frequency = {}
    for classValue, instances in separated.items():
        frequency[classValue] = tableFrequency(pd.DataFrame(instances))
        
    for classValue in dataset.classe.unique():
        for attr in dataset.columns[:-1]:
            for value in dataset[attr]:
                if value not in frequency[classValue]:
                    frequency[classValue][value] = [(0, attr)]    

    return frequency

In [14]:
def calculateProbByAttribute(dataset, attr):
    frequencyClass = frequencyByClass(dataset)
    frequencyClassValue = {key: [] for key in frequencyClass.keys()}
    totalValuesByClass = {key: 0 for key in frequencyClass.keys()}
    freqByValue = {key: 0 for key in dataset[:][attr]}

    frequenciesClassKeys, frequenciesClassValues = zip(*frequencyByClass(dataset).items())
    for classIndex in range(len(frequenciesClassKeys)):
    
        keys = frequenciesClassValues[classIndex].keys()

        for key in keys:
            for freq in frequenciesClassValues[classIndex][key]:                      
                if freq[1] == attr:
                    frequencyClassValue[frequenciesClassKeys[classIndex]].append((key, freq[0]))
                    freqByValue[key] += freq[0]
                    totalValuesByClass[frequenciesClassKeys[classIndex]] += freq[0]
            
    totalValues = sum(totalValuesByClass.values())
    probByClass = {key: totalValuesByClass[key]/totalValues for key in totalValuesByClass}
    probByValue = {key: value/totalValues for key, value in freqByValue.items()}
    probByClassValue = {key: [] for key in totalValuesByClass}
        
    for key in frequenciesClassKeys:
        for valueTuple, freqTuple in frequencyClassValue[key]:
            probByClassValue[key].append((valueTuple, freqTuple/totalValuesByClass[key]))
        
    return [probByClass, probByValue, probByClassValue]   

In [15]:
def tableProbability(dataset):
    tableProb = {}
    for attr in dataset.columns[:-1]:
        tableProb[attr] = calculateProbByAttribute(dataset, attr)
        
    return tableProb

In [16]:
def predict(summaries, inputVector):

    higherProbability = -1
    classHigherProbability = None
    
    attributes = list(summaries.keys())
    randomAttribute = attributes[0]
    classes = list(summaries[randomAttribute][0].keys())
    
    for classValue in classes:
        prob = 1
        for index in range(len(attributes)):
            probClass = summaries[attributes[index]][0][classValue]
            probValue = summaries[attributes[index]][1][inputVector[index]]
            valueInClass = summaries[attributes[index]][2][classValue]

            for probValueClass in valueInClass:
                if probValueClass[0] == inputVector[index]:
                    probIntersValueClass = probValueClass[1]
                    
            prob = prob * (probIntersValueClass / probValue)
        prob = prob * probClass
        
        if prob > higherProbability:
            higherProbability = prob
            classHigherProbability = classValue
            
    return classHigherProbability

In [17]:
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet.iloc[i])
        predictions.append(result)
    return predictions

In [18]:
def getAccuracy(testSet, predictions):
    correct = 0

    for i in range(len(testSet)):
        if testSet.iloc[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [19]:
def main():
    filename = 'carData.csv'

    datasetColumns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'classe']
    df = pd.read_csv(filename, sep=',', names=datasetColumns)
    dataset = df.values.tolist()
    
    splitRatio = 0.75
    trainSet, testSet = splitDataset(dataset, splitRatio)
    
    print(('Dataset {0}, Training {1}, Test {2}').format(len(dataset), len(trainSet), len(testSet)))

    trainingSet = pd.DataFrame(trainSet, columns=datasetColumns)
    testingSet = pd.DataFrame(testSet, columns=datasetColumns)
    
    summary = tableProbability(trainingSet)
    predictions = getPredictions(summary, testingSet)
    accuracy = getAccuracy(testingSet, predictions)
    
    print(('AcurÃ¡cia de {:.3f}%').format(accuracy))

In [22]:
main()

Dataset 1728, Training 1296, Test 432
AcurÃ¡cia de 74.306%
