# Naive Bayes - Trabalho

## Questão 1

Implemente um classifacor Naive Bayes para o problema de predizer a qualidade de um carro. Para este fim, utilizaremos um conjunto de dados referente a qualidade de carros, disponível no [UCI](https://archive.ics.uci.edu/ml/datasets/car+evaluation). Este dataset de carros possui as seguintes features e classe:

** Attributos **
1. buying: vhigh, high, med, low
2. maint: vhigh, high, med, low
3. doors: 2, 3, 4, 5, more
4. persons: 2, 4, more
5. lug_boot: small, med, big
6. safety: low, med, high

** Classes **
1. unacc, acc, good, vgood





In [8]:
import csv
import pandas as pd
import random

In [9]:
def loadCsv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [x for x in dataset[i]]
    return dataset

In [10]:
filename = 'carData.csv'
dataset = loadCsv(filename)
print(('O arquivo {0} foi carregado com {1} linhas').format(filename, len(dataset)))

O arquivo carData.csv foi carregado com 1728 linhas


In [12]:
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    testSet = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(testSet))
        trainSet.append(testSet.pop(index))
    return [trainSet, testSet]

In [13]:
splitRatio = 0.7
newDF = df.values.tolist()
train, teste = splitDataset(newDF, splitRatio)

In [14]:
def separateFeaturesByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        row = dataset.iloc[i]
        if (row[-1] not in separated):
            separated[row[-1]] = []
        separated[row[-1]].append(row)
    return separated

In [23]:
def tableFrequency(dataset):
    dic = {}
    attributes = dataset.columns
    for attribute in attributes:
        for value in dataset[attribute].unique():
            if value not in dic.keys():
                dic[value] = []
            dic[value].append((dataset.groupby(attribute)[attribute].count()[value], attribute))
    for value in dataset[dataset.columns[-1]].unique(): # Delete class (the last attribute)
        del dic[value]
        
    return dic

In [16]:
def frequencyByClass(dataset):
    separated = separateFeaturesByClass(dataset)
    frequency = {}
    for classValue, instances in separated.items():
        frequency[classValue] = tableFrequency(pd.DataFrame(instances))
        
    for classValue in dataset.classe.unique():
        #index = 0
        for attr in dataset.columns[:-1]:
            for value in dataset[attr]:
                if value not in frequency[classValue]:
                    frequency[classValue][value] = [(0, attr)]
            #index += 1

    return frequency

In [17]:
def calculateProbByAttribute(dataset, attr):
    frequencyClass = frequencyByClass(dataset)
    frequencyClassValue = {key: [] for key in frequencyClass.keys()}
    totalValuesByClass = {key: 0 for key in frequencyClass.keys()}
    freqByValue = {key: 0 for key in dataset[:][attr]}

    frequenciesClassKeys, frequenciesClassValues = zip(*frequencyByClass(dataset).items())
    for classIndex in range(len(frequenciesClassKeys)):
    
        keys = frequenciesClassValues[classIndex].keys()

        for key in keys:
            for freq in frequenciesClassValues[classIndex][key]:                      
                if freq[1] == attr:
                    frequencyClassValue[frequenciesClassKeys[classIndex]].append((key, freq[0]))
                    freqByValue[key] += freq[0]
                    totalValuesByClass[frequenciesClassKeys[classIndex]] += freq[0]
            
    totalValues = sum(totalValuesByClass.values())
    probByClass = {key: totalValuesByClass[key]/totalValues for key in totalValuesByClass}
    probByValue = {key: value/totalValues for key, value in freqByValue.items()}
    probByClassValue = {key: [] for key in totalValuesByClass}
        
    for key in frequenciesClassKeys:
        for valueTuple, freqTuple in frequencyClassValue[key]:
            probByClassValue[key].append((valueTuple, freqTuple/totalValuesByClass[key]))
        
    return [probByClass, probByValue, probByClassValue]   

In [18]:
def tableProbability(dataset):
    tableProb = {}
    for attr in dataset.columns[:-1]:
        tableProb[attr] = calculateProbByAttribute(dataset, attr)
        
    return tableProb

In [25]:
def predict(summaries, inputVector):

    higherProbability = -1
    classHigherProbability = None
    
    attributes = list(summaries.keys())
    randomAttribute = attributes[0]
    classes = list(summaries[randomAttribute][0].keys())
    
    for classValue in classes:
        prob = 1
        for index in range(len(attributes)):
            probClass = summaries[attributes[index]][0][classValue]
            probValue = summaries[attributes[index]][1][inputVector[index]]
            valueInClass = summaries[attributes[index]][2][classValue]

            for probValueClass in valueInClass:
                if probValueClass[0] == inputVector[index]:
                    probIntersValueClass = probValueClass[1]
                    
            prob = prob * (probIntersValueClass / probValue)
        prob = prob * probClass
        
        if prob > higherProbability:
            higherProbability = prob
            classHigherProbability = classValue
            
    return classHigherProbability

In [19]:
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet.iloc[i])
        predictions.append(result)
    return predictions

In [20]:
def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet.iloc[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [28]:
def main():
    filename = 'carData.csv'

    datasetColumns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'classe']
    df = pd.read_csv(filename, sep=',', names=datasetColumns)
    dataset = df.values.tolist()
    
    splitRatio = 0.75
    trainSet, testSet = splitDataset(dataset, splitRatio)
    
    print(('Dataset {0}, Training {1}, Test {2}').format(len(dataset), len(trainSet), len(testSet)))

    trainingSet = pd.DataFrame(trainSet, columns=datasetColumns)
    testingSet = pd.DataFrame(testSet, columns=datasetColumns)
    
    summary = tableProbability(trainingSet)
    predictions = getPredictions(summary, testingSet)
    accuracy = getAccuracy(testingSet, predictions)
    
    print(('Acurácia de {:.3f}%').format(accuracy))

In [29]:
main()

Dataset 1728, Training 1296, Test 432
Acurácia de 74.769%


## Questão 2
Crie uma versão de sua implementação usando as funções disponíveis na biblioteca SciKitLearn para o Naive Bayes ([veja aqui](http://scikit-learn.org/stable/modules/naive_bayes.html)) 

In [1]:
import pandas as pd
import numpy as np
import sklearn
df = pd.read_csv("carData.csv",names=["buying","maint","doors","persons","lug_boot","safety","Classes"])

In [2]:
#Transform data into numerical
df = df.replace('vhigh', 4)
df = df.replace('high', 3)
df = df.replace('med', 2)
df = df.replace('low', 1)
df = df.replace('more', 6)
df = df.replace('big', 3)
df = df.replace('small', 1)
df = df.replace('unacc', 1)
df = df.replace('acc', 2)
df = df.replace('good', 3)
df = df.replace('vgood', 4)
df = df.replace('5more', 6)
df = df.replace('2', 2)
df = df.replace('3', 3)
df = df.replace('4', 4)

In [3]:
from sklearn.model_selection import train_test_split
x = df.drop(["Classes"],axis=1)
y = df["Classes"]

In [5]:
x_train, x_test, y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=101)

In [6]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [7]:
y_pred = gnb.fit(x_train, y_train).predict(x_test)

In [8]:
from sklearn.metrics import accuracy_score
final = accuracy_score(y_pred, y_test)
final

0.7167630057803468

## Questão 3

Analise a acurácia dos dois algoritmos e discuta a sua solução.

R: Como a acurácio dos dois foram parecidas, é possível concluir que o algoritmo implementado não se difere muito do proposto pelo scikitlearn