In [1]:
import math
import random
import pandas as pd
import numpy as np 
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

#### Lendo os dados do arquivo para um dataframe
##### Obs: Editei o csv e inseri os nomes da colunas diretamente no arquivo por convenciência

In [2]:
#df = pd.read_csv('pima-indians-diabetes.csv',header=None)
df = pd.read_csv('carData.csv')

In [3]:
df.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,quality
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


#### Agora temos que converter esses dados para valores numéricos para podermos fazer cálculos em cima deles, podemos fazer isso utilizando a seguinte classe

In [4]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns 

    def fit(self,X,y=None):
        return self 

    def transform(self,X):
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [5]:
df_categorical = MultiColumnLabelEncoder(columns=df.columns).fit_transform(df)
#scaler = MinMaxScaler()
#scaler.fit(df_categorical.values)
#df_categorical_normal = scaler.transform(df_categorical.values)
df_categorical[:10]

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,quality
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2
5,3,3,0,0,1,0,2
6,3,3,0,0,0,1,2
7,3,3,0,0,0,2,2
8,3,3,0,0,0,0,2
9,3,3,0,1,2,1,2


#### Separando os dados por classe

In [6]:
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

#### Agora precisamos da média e do desvio padrão

In [7]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))
 
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

#### Agora podemos unir todas essas funções e sumarizar o dataset por classes

In [8]:
def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

In [9]:
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

#### Agora estamos prontos para fazer predições

#### Função para calcular a probabilidade gaussiana

In [10]:
def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * math.pow(stdev, 2))) * exponent

#### Com isso podemos calcular as probabilidades das classes

In [11]:
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

#### Agora finalmente podemos fazer a função de predição

In [12]:
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [13]:
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

#### Função para dividir o dataset entre treino e teste

In [14]:
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

In [15]:
[train,test] = splitDataset(df_categorical.values,0.7)

#### Função para calcular a acurácia

In [16]:
def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [17]:
summaries = summarizeByClass(train)
summaries

{0: [(1.4483985765124556, 1.1076064595066355),
  (1.398576512455516, 1.0812681552645593),
  (1.6334519572953736, 1.0711234949498842),
  (1.5053380782918149, 0.5008635146373359),
  (0.8790035587188612, 0.7925684310168004),
  (0.9181494661921709, 0.9984227520773886)],
 2: [(1.5435041716328963, 1.1826924592407242),
  (1.5756853396901074, 1.1668368911141813),
  (1.4600715137067939, 1.1270234498247351),
  (0.7842669845053635, 0.8361980131394481),
  (1.0524433849821215, 0.821614693955221),
  (1.0548271752085816, 0.7158935117845941)],
 1: [(1.2727272727272727, 0.45051063346696685),
  (1.3181818181818181, 0.47115529818794427),
  (1.7272727272727273, 1.0861418033172827),
  (1.5454545454545454, 0.5036862005093148),
  (0.9772727272727273, 0.8757358729884833),
  (1.1363636363636365, 1.0021119347706249)],
 3: [(1.3777777777777778, 0.4903101471559001),
  (1.1777777777777778, 0.747386017119573),
  (1.6666666666666667, 1.0871146130092182),
  (1.488888888888889, 0.5055250296034369),
  (0.35555555555555

In [18]:
predictions = getPredictions(summaries,test)

ZeroDivisionError: float division by zero