# Naive Bayes - Trabalho
### Aluno : Joel Oliveira Ribeiro - 371822
------------------------------------------

In [1]:
#Import de bibliotecas 
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
import math 
from sklearn.model_selection import train_test_split
import random

## Questão 1

Implementar um classifacor Naive Bayes para o problema de predizer a qualidade de um carro. Para este fim, utilizar um conjunto de dados referente a qualidade de carros, disponível no [UCI](https://archive.ics.uci.edu/ml/datasets/car+evaluation). 

### 1.1 Carregando o dataFrame e tratando os dados

In [2]:
# Carregando o dataFrame
df = pd.read_csv("carData.csv")
print(df)

      vhigh vhigh.1      2   2.1  small   low  unacc
0     vhigh   vhigh      2     2  small   med  unacc
1     vhigh   vhigh      2     2  small  high  unacc
2     vhigh   vhigh      2     2    med   low  unacc
3     vhigh   vhigh      2     2    med   med  unacc
4     vhigh   vhigh      2     2    med  high  unacc
5     vhigh   vhigh      2     2    big   low  unacc
6     vhigh   vhigh      2     2    big   med  unacc
7     vhigh   vhigh      2     2    big  high  unacc
8     vhigh   vhigh      2     4  small   low  unacc
9     vhigh   vhigh      2     4  small   med  unacc
10    vhigh   vhigh      2     4  small  high  unacc
11    vhigh   vhigh      2     4    med   low  unacc
12    vhigh   vhigh      2     4    med   med  unacc
13    vhigh   vhigh      2     4    med  high  unacc
14    vhigh   vhigh      2     4    big   low  unacc
15    vhigh   vhigh      2     4    big   med  unacc
16    vhigh   vhigh      2     4    big  high  unacc
17    vhigh   vhigh      2  more  small   low 

In [3]:
# Tratamento dos dados, necessário passar algumas features de caracteres para números
df['vhigh'] = pd.factorize(df['vhigh'])[0]
df['vhigh.1'] = pd.factorize(df['vhigh.1'])[0]
df['small'] = pd.factorize(df['small'])[0]
df['low'] = pd.factorize(df['low'])[0]
df['unacc'] = pd.factorize(df['unacc'])[0]

print(df)

df['2'] = pd.factorize(df['2'])[0]
df['2.1'] = pd.factorize(df['2.1'])[0]

print('--------------------')
print(df)

# Mudando para tipo numpy array
df = df.values
print('--------------------')
print(df)

      vhigh  vhigh.1      2   2.1  small  low  unacc
0         0        0      2     2      0    0      0
1         0        0      2     2      0    1      0
2         0        0      2     2      1    2      0
3         0        0      2     2      1    0      0
4         0        0      2     2      1    1      0
5         0        0      2     2      2    2      0
6         0        0      2     2      2    0      0
7         0        0      2     2      2    1      0
8         0        0      2     4      0    2      0
9         0        0      2     4      0    0      0
10        0        0      2     4      0    1      0
11        0        0      2     4      1    2      0
12        0        0      2     4      1    0      0
13        0        0      2     4      1    1      0
14        0        0      2     4      2    2      0
15        0        0      2     4      2    0      0
16        0        0      2     4      2    1      0
17        0        0      2  more      0    2 

### 1.2 Criando as funções para o classificador do tipo Naive Bayes
Adaptando as funções do próprio notebook da aula 

In [9]:
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
        
    return [trainSet, copy]
    
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

def mean(numbers):
    return sum(numbers)/float(len(numbers))
 
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)
    
def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries
    
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries
    
def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * math.pow(stdev, 2))) * exponent
    
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities
    
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel
    
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions
    
def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [10]:
# Separando treino (67%) de teste (33%)
X_train,X_test = splitDataset(df,0.67)

In [13]:
summ = summarize(X_train)
print(summ)
print('--------------------------------------------------------')
summBy = summarizeByClass(X_train)
print(summBy)

[(1.4935177182368193, 1.1095688643707255), (1.497839239412273, 1.1056807537218771), (1.5073465859982713, 1.1095634736654678), (1.0051858254105446, 0.81506622156905), (1.0008643042350907, 0.8230035798648657), (1.0121002592912705, 0.8096683269265)]
--------------------------------------------------------
{0: [(1.3506815365551426, 1.1215004157709034), (1.3605947955390334, 1.1105555550338533), (1.4745972738537794, 1.1167733364496824), (0.78934324659231725, 0.8306984952291961), (0.92812887236679054, 0.8274384780660208), (1.1970260223048328, 0.8495166210242404)], 1: [(1.5521235521235521, 1.0039579271859078), (1.5945945945945945, 1.0462788110792012), (1.5598455598455598, 1.0922957720211002), (1.501930501930502, 0.5009643210501636), (1.1158301158301158, 0.8033966085081455), (0.53281853281853286, 0.4998877489153622)], 3: [(2.6222222222222222, 0.49031014715590004), (2.6666666666666665, 0.4767312946227963), (1.4888888888888889, 1.1000459127241053), (1.4666666666666666, 0.5045249791095131), (1.0, 

In [16]:
print(getPredictions(summBy,X_test))

ZeroDivisionError: float division by zero

## Questão 2
Criar uma versão de sua implementação usando as funções disponíveis na biblioteca SciKitLearn para o Naive Bayes ([veja aqui](http://scikit-learn.org/stable/modules/naive_bayes.html)) 

In [24]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(df, df[:,6]).predict(df)
print("Number of mislabeled points out of a total %d points : %d"% (df.shape[0],(df[:,6] != y_pred).sum()))

Number of mislabeled points out of a total 7 points : 0
