# Igor Menezes Chaves Moura - 374184
# Lucas Primo Fernandes Muraro - 374192

In [1]:
import math
import random
import pandas as pd
import numpy as np 
from sklearn.utils import shuffle
from sklearn.naive_bayes import MultinomialNB
import json
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

#### Lendo os dados do arquivo para um dataframe
##### Obs: Editei o csv e inseri os nomes da colunas diretamente no arquivo por convenciência

In [2]:
#df = pd.read_csv('pima-indians-diabetes.csv',header=None)
df = pd.read_csv('carData.csv')

In [3]:
df = shuffle(df)
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,quality
118,vhigh,high,2,4,small,med,unacc
438,high,vhigh,2,2,big,low,unacc
1275,med,low,5more,2,big,low,unacc
552,high,high,2,4,med,low,unacc
590,high,high,3,more,med,high,acc
9,vhigh,vhigh,2,4,small,low,unacc
303,vhigh,med,5more,2,big,low,unacc
558,high,high,2,more,small,low,unacc
467,high,vhigh,3,2,big,high,unacc
111,vhigh,high,2,2,med,low,unacc


In [4]:
attrs = df.columns[:-1].values
attrs

array(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'],
      dtype=object)

In [5]:
classes = pd.unique(df['quality'])
classes

array(['unacc', 'acc', 'good', 'vgood'], dtype=object)

#### Função para calcular a probabilidade multinomial

In [6]:
def calculateProbabilityTable(dataset,attr):
    
    
    #Valores unicos do atributo
    poss_vals = pd.unique(dataset[attr])
    
    classes = pd.unique(dataset['quality'])
 
    tabela_frequencia = {}

    for poss_val in poss_vals:
        tabela_frequencia[poss_val] = {}
        
        for classe in classes: 
            
            count_classe = float(len(dataset[dataset['quality'] == classe]))
                        
            #print("Numero de caras com a classe %s: %f" % (classe,count_classe))
            
            sub_dataset = dataset.query(attr + "=='" +  poss_val + "' &" + "quality =='" + classe + "'")
            
            count_attr = float(len(sub_dataset))
            
            #print("Numero de caras com %s = %s e classe = %s : %f" % (attr,poss_val,classe,count_attr))
            
            count = float(count_attr/count_classe)*1.0
            
            #print("%f/%f = %f" % (count_attr,count_classe,count))
            
            tabela_frequencia[poss_val][classe] = count
            
        
    return tabela_frequencia

In [7]:
def calculateProbabilityTables(dataset):
    
    tables = {}
    
    for attr in df.columns[:-1]:
        
        tables[attr] = calculateProbabilityTable(dataset,attr)
    
    return tables

#### Com isso podemos calcular as probabilidades das classes

In [8]:
def calculateClassProbabilities(probabilityTables, inputVector, attrs,classes):
    
    probabilities = {}
    
    for classe in classes:
        
        classProb = 1.0
        
        for index,attr in enumerate(attrs[:-1]):
            
            attrTable = probabilityTables[attr]
            
            
            #print(inputVector)
            
            #print("Indice do atributo %s é %d e a classe é %s" % (attr,index,classe))
  
            
            classAttrProb = attrTable[inputVector[index]][classe]
            
            classProb *= classAttrProb
            
        probabilities[classe] = classProb
        
    
    return probabilities

#### Agora finalmente podemos fazer a função de predição

In [9]:
def predict(probabilityTables, inputVector, attrs, classes):
    probabilities = calculateClassProbabilities(probabilityTables, inputVector,attrs,classes)
    bestLabel, bestProb = None, -1
    for classe in classes:
        classProb = probabilities[classe]
        if bestLabel is None or classProb > bestProb:
            bestProb = classProb
            bestLabel = classe
    return bestLabel

In [10]:
def getPredictions(probabilityTables, testSet, attrs,classes):
    predictions = []
    for i in range(len(testSet)):
        inputVector = testSet.iloc[i].values.tolist()
        #print(inputVector)
        result = predict(probabilityTables, inputVector,attrs,classes)
        predictions.append(result)
    return predictions

#### Função para dividir o dataset entre treino e teste

In [11]:
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = dataset.iloc[:trainSize-1]
    testSet = dataset.iloc[trainSize:]
    
    return [trainSet,testSet]

In [12]:
[train,test] = splitDataset(df,0.7)
train

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,quality
118,vhigh,high,2,4,small,med,unacc
438,high,vhigh,2,2,big,low,unacc
1275,med,low,5more,2,big,low,unacc
552,high,high,2,4,med,low,unacc
590,high,high,3,more,med,high,acc
9,vhigh,vhigh,2,4,small,low,unacc
303,vhigh,med,5more,2,big,low,unacc
558,high,high,2,more,small,low,unacc
467,high,vhigh,3,2,big,high,unacc
111,vhigh,high,2,2,med,low,unacc


#### Função para calcular a acurácia

In [13]:
def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet.iloc[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [14]:
probabilityTables = calculateProbabilityTables(df)

In [15]:
predictions = getPredictions(probabilityTables,test,attrs,classes)

In [16]:
accuracy = getAccuracy(test,predictions)
accuracy

61.849710982658955

In [17]:
skNB = MultinomialNB()
train_data = train[train.columns[:-1]]
train_target = train[train.columns[-1]]

test_data = test[test.columns[:-1]]

#y_pred = skNB.fit(train_data.values, train_target.values).predict(test_data)

In [18]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [19]:
train_data_categorical = MultiColumnLabelEncoder().fit_transform(train_data)
train_target_categorical = LabelEncoder().fit_transform(train_target)
test_data_categorical = MultiColumnLabelEncoder().fit_transform(test_data)
test_categorical = MultiColumnLabelEncoder().fit_transform(test)

In [20]:
y_pred = skNB.fit(train_data_categorical, train_target_categorical).predict(test_data_categorical)

In [21]:
skAccuracy = getAccuracy(test_categorical,y_pred)
skAccuracy

70.52023121387283

In [22]:
print("Nossa acurácia é de %f e a do sklearn é de %f" % (accuracy,skAccuracy))

Nossa acurácia é de 61.849711 e a do sklearn é de 70.520231
