In [27]:
import math
import random
import pandas as pd
import numpy as np 
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

#### Lendo os dados do arquivo para um dataframe
##### Obs: Editei o csv e inseri os nomes da colunas diretamente no arquivo por convenciência

In [53]:
#df = pd.read_csv('pima-indians-diabetes.csv',header=None)
df = pd.read_csv('carData.csv')

In [29]:
df.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,quality
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


#### Agora temos que converter esses dados para valores numéricos para podermos fazer cálculos em cima deles, podemos fazer isso utilizando a seguinte classe

In [30]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns 

    def fit(self,X,y=None):
        return self 

    def transform(self,X):
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [31]:
df_categorical = MultiColumnLabelEncoder(columns=df.columns).fit_transform(df)
#scaler = MinMaxScaler()
#scaler.fit(df_categorical.values)
#df_categorical_normal = scaler.transform(df_categorical.values)
df_categorical

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,quality
0,4,4,1,1,3,2,3
1,4,4,1,1,3,3,3
2,4,4,1,1,3,1,3
3,4,4,1,1,2,2,3
4,4,4,1,1,2,3,3
5,4,4,1,1,2,1,3
6,4,4,1,1,1,2,3
7,4,4,1,1,1,3,3
8,4,4,1,1,1,1,3
9,4,4,1,2,3,2,3


In [32]:
safety = df_categorical[df_categorical['quality'] == 3]['safety']
np.mean(safety.values)

2.0661157024793386

#### Separando os dados por classe

In [33]:
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

#### Agora precisamos da média e do desvio padrão

In [34]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))
 
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

#### Agora podemos unir todas essas funções e sumarizar o dataset por classes

In [35]:
def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

In [36]:
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

#### Agora estamos prontos para fazer predições

#### Função para calcular a probabilidade multinomial

In [58]:
def calculateProbabilityTable(dataset,attr):
    
    #Valores unicos do atributo
    poss_vals = pd.unique(dataset[attr])
    
    classes = pd.unique(dataset['quality'])
 
    tabela_frequencia = {}

    for poss_val in poss_vals:
        tabela_frequencia[poss_val] = {}
        
        for classe in classes: 
            
            count_classe = len(dataset[dataset['quality'] == classe])
            
            sub_dataset = dataset.query(attr + "=='" +  poss_val + "' &" + "quality =='" + classe + "'")
            
            count_attr = len(sub_dataset)
            
            count = count_attr/count_classe
            
            tabela_frequencia[poss_val][classe] = '%d / %d' % (count_attr,count_classe)
            
        
    return tabela_frequencia
    
calculateProbabilityTable(df,'safety')

{'low': {'unacc': '576 / 1210',
  'acc': '0 / 384',
  'vgood': '0 / 65',
  'good': '0 / 69'},
 'med': {'unacc': '357 / 1210',
  'acc': '180 / 384',
  'vgood': '0 / 65',
  'good': '39 / 69'},
 'high': {'unacc': '277 / 1210',
  'acc': '204 / 384',
  'vgood': '65 / 65',
  'good': '30 / 69'}}

In [None]:
def calculateProbabilityTables(dataset):
    
    tables = [calculateProbabilityTable(dataset,attr) for attr in df.columns[:-1]]
    
    return tables

#### Com isso podemos calcular as probabilidades das classes

In [38]:
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

#### Agora finalmente podemos fazer a função de predição

In [39]:
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [40]:
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

#### Função para dividir o dataset entre treino e teste

In [41]:
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

In [42]:
[train,test] = splitDataset(df_categorical.values,0.7)

#### Função para calcular a acurácia

In [43]:
def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [44]:
summaries = summarizeByClass(train)
summaries

{3: [(2.550415183867141, 1.1780894365578845),
  (2.5361803084223014, 1.1695089518053632),
  (2.4709371293001188, 1.1156615367971836),
  (1.7627520759193358, 0.8202821013965731),
  (2.052194543297746, 0.8196687240677468),
  (2.0640569395017794, 0.7134114999426915)],
 1: [(2.386861313868613, 1.0910771985625842),
  (2.4124087591240877, 1.0697262842178086),
  (2.5875912408759123, 1.0900781472456387),
  (2.4744525547445257, 0.5002606202939549),
  (1.8795620437956204, 0.7961113075978663),
  (1.9197080291970803, 0.9985953076364137)],
 4: [(2.357142857142857, 0.4849656045851727),
  (2.142857142857143, 0.7830966850386145),
  (2.8333333333333335, 1.0573013661495898),
  (2.4761904761904763, 0.5054867366041313),
  (1.4047619047619047, 0.496795772414547),
  (1.0, 0.0)],
 2: [(2.32, 0.4712120714991613),
  (2.28, 0.45355736761107285),
  (2.44, 1.0909647481163873),
  (2.52, 0.5046720495044485),
  (1.9, 0.7889543583705187),
  (2.16, 0.9971387638065795)]}

In [45]:
predictions = getPredictions(summaries,test)

ZeroDivisionError: float division by zero