In [1]:
import math
import random
import pandas as pd
import numpy as np 
from sklearn.utils import shuffle
from sklearn.naive_bayes import MultinomialNB
import json
#from sklearn.preprocessing import LabelEncoder,MinMaxScaler

#### Lendo os dados do arquivo para um dataframe
##### Obs: Editei o csv e inseri os nomes da colunas diretamente no arquivo por convenciência

In [2]:
#df = pd.read_csv('pima-indians-diabetes.csv',header=None)
df = pd.read_csv('carData.csv')

In [3]:
df = shuffle(df)
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,quality
80,vhigh,vhigh,4,more,big,high,unacc
985,med,high,2,4,med,med,unacc
352,vhigh,low,3,2,small,med,unacc
1393,low,vhigh,5more,4,big,med,acc
473,high,vhigh,3,4,med,high,unacc
699,high,med,3,more,big,low,unacc
594,high,high,4,2,small,low,unacc
363,vhigh,low,3,4,med,low,unacc
570,high,high,3,2,med,low,unacc
196,vhigh,high,5more,2,big,med,unacc


In [4]:
attrs = df.columns[:-1].values
attrs

array(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'],
      dtype=object)

In [5]:
classes = pd.unique(df['quality'])
classes

array(['unacc', 'acc', 'good', 'vgood'], dtype=object)

#### Função para calcular a probabilidade multinomial

In [6]:
def calculateProbabilityTable(dataset,attr):
    
    
    #Valores unicos do atributo
    poss_vals = pd.unique(dataset[attr])
    
    classes = pd.unique(dataset['quality'])
 
    tabela_frequencia = {}

    for poss_val in poss_vals:
        tabela_frequencia[poss_val] = {}
        
        for classe in classes: 
            
            count_classe = float(len(dataset[dataset['quality'] == classe]))
                        
            print("Numero de caras com a classe %s: %f" % (classe,count_classe))
            
            sub_dataset = dataset.query(attr + "=='" +  poss_val + "' &" + "quality =='" + classe + "'")
            
            count_attr = float(len(sub_dataset))
            
            print("Numero de caras com %s = %s e classe = %s : %f" % (attr,poss_val,classe,count_attr))
            
            count = float(count_attr/count_classe)*1.0
            
            print("%f/%f = %f" % (count_attr,count_classe,count))
            
            tabela_frequencia[poss_val][classe] = count
            
        
    return tabela_frequencia

In [7]:
def calculateProbabilityTables(dataset):
    
    tables = {}
    
    for attr in df.columns[:-1]:
        
        tables[attr] = calculateProbabilityTable(dataset,attr)
    
    return tables

#### Com isso podemos calcular as probabilidades das classes

In [8]:
def calculateClassProbabilities(probabilityTables, inputVector, attrs,classes):
    
    probabilities = {}
    
    for classe in classes:
        
        classProb = 1.0
        
        for index,attr in enumerate(attrs[:-1]):
            
            attrTable = probabilityTables[attr]
            
            
            #print(inputVector)
            
            #print("Indice do atributo %s é %d e a classe é %s" % (attr,index,classe))
  
            
            classAttrProb = attrTable[inputVector[index]][classe]
            
            classProb *= classAttrProb
            
        probabilities[classe] = classProb
        
    
    return probabilities

#### Agora finalmente podemos fazer a função de predição

In [9]:
def predict(probabilityTables, inputVector, attrs, classes):
    probabilities = calculateClassProbabilities(probabilityTables, inputVector,attrs,classes)
    bestLabel, bestProb = None, -1
    for classe in classes:
        classProb = probabilities[classe]
        if bestLabel is None or classProb > bestProb:
            bestProb = classProb
            bestLabel = classe
    return bestLabel

In [10]:
def getPredictions(probabilityTables, testSet, attrs,classes):
    predictions = []
    for i in range(len(testSet)):
        inputVector = testSet.iloc[i].values.tolist()
        #print(inputVector)
        result = predict(probabilityTables, inputVector,attrs,classes)
        predictions.append(result)
    return predictions

#### Função para dividir o dataset entre treino e teste

In [11]:
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = dataset.iloc[:trainSize-1]
    testSet = dataset.iloc[trainSize:]
    
    return [trainSet,testSet]

In [12]:
[train,test] = splitDataset(df,0.7)
train

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,quality
80,vhigh,vhigh,4,more,big,high,unacc
985,med,high,2,4,med,med,unacc
352,vhigh,low,3,2,small,med,unacc
1393,low,vhigh,5more,4,big,med,acc
473,high,vhigh,3,4,med,high,unacc
699,high,med,3,more,big,low,unacc
594,high,high,4,2,small,low,unacc
363,vhigh,low,3,4,med,low,unacc
570,high,high,3,2,med,low,unacc
196,vhigh,high,5more,2,big,med,unacc


#### Função para calcular a acurácia

In [13]:
def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet.iloc[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [14]:
probabilityTables = calculateProbabilityTables(df)

Numero de caras com a classe unacc: 1210.000000
Numero de caras com buying = vhigh e classe = unacc : 360.000000
360.000000/1210.000000 = 0.297521
Numero de caras com a classe acc: 384.000000
Numero de caras com buying = vhigh e classe = acc : 72.000000
72.000000/384.000000 = 0.187500
Numero de caras com a classe good: 69.000000
Numero de caras com buying = vhigh e classe = good : 0.000000
0.000000/69.000000 = 0.000000
Numero de caras com a classe vgood: 65.000000
Numero de caras com buying = vhigh e classe = vgood : 0.000000
0.000000/65.000000 = 0.000000
Numero de caras com a classe unacc: 1210.000000
Numero de caras com buying = med e classe = unacc : 268.000000
268.000000/1210.000000 = 0.221488
Numero de caras com a classe acc: 384.000000
Numero de caras com buying = med e classe = acc : 115.000000
115.000000/384.000000 = 0.299479
Numero de caras com a classe good: 69.000000
Numero de caras com buying = med e classe = good : 23.000000
23.000000/69.000000 = 0.333333
Numero de caras c

In [15]:
json.dumps(probabilityTables["buying"])

'{"vhigh": {"unacc": 0.2975206611570248, "acc": 0.1875, "good": 0.0, "vgood": 0.0}, "med": {"unacc": 0.22148760330578512, "acc": 0.2994791666666667, "good": 0.3333333333333333, "vgood": 0.4}, "low": {"unacc": 0.21322314049586777, "acc": 0.23177083333333334, "good": 0.6666666666666666, "vgood": 0.6}, "high": {"unacc": 0.26776859504132233, "acc": 0.28125, "good": 0.0, "vgood": 0.0}}'

In [16]:
predictions = getPredictions(probabilityTables,test,attrs,classes)

In [18]:
accuracy = getAccuracy(test,predictions)
accuracy

60.115606936416185

In [None]:
skNB = MultinomialNB()
