In [34]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, Imputer, MinMaxScaler
from sklearn.cross_validation import train_test_split
import pandas as pd
import math

In [35]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"

nameCol = ['BI-RADS assessment', 'Age', 'Shape', 'Margin', 'Density', 'Severity']

dataset = pd.read_csv(url, names=nameCol)

In [36]:
#pré-processamento dos dados

In [37]:
type(dataset)


pandas.core.frame.DataFrame

In [38]:
dataset = dataset.apply(lambda x: x.replace('?',np.nan))

In [39]:
previsores = dataset.iloc[:,0:5].values
classe = dataset.iloc[:,5].values

In [40]:
imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis = 0)
imputer = imputer.fit(previsores[:,0:5])
previsores[:,0:5] = imputer.transform(previsores[:,0:5])

In [41]:
previsores

array([[5.0, 67.0, 3.0, 5.0, 3.0],
       [4.0, 43.0, 1.0, 1.0, 3.0],
       [5.0, 58.0, 4.0, 5.0, 3.0],
       ...,
       [4.0, 64.0, 4.0, 5.0, 3.0],
       [5.0, 66.0, 4.0, 5.0, 3.0],
       [4.0, 62.0, 3.0, 3.0, 3.0]], dtype=object)

In [42]:
onehotencoder = OneHotEncoder(categorical_features=[3,4])

In [43]:
previsores = onehotencoder.fit_transform(previsores).toarray()

In [44]:
#escalonamento dos atributos

In [45]:
scaler = MinMaxScaler()

In [46]:
previsores = scaler.fit_transform(previsores)

In [47]:
previsores

array([[0.        , 0.        , 0.        , ..., 0.09090909, 0.62820513,
        0.66666667],
       [1.        , 0.        , 0.        , ..., 0.07272727, 0.32051282,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.09090909, 0.51282051,
        1.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.07272727, 0.58974359,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.09090909, 0.61538462,
        1.        ],
       [0.        , 0.        , 1.        , ..., 0.07272727, 0.56410256,
        0.66666667]])

In [48]:
#separação das bases

In [49]:
x_train, x_test, y_train, y_test = train_test_split(previsores, classe, test_size=0.25, random_state=0)

In [50]:
rows, columns = x_train.shape
temp = np.zeros((rows,columns+1))
temp[:,:-1] = x_train
temp[:,-1] = y_train

x_train = temp

In [51]:
rows2, columns2 = x_test.shape
temp2 = np.zeros((rows2,columns2+1))
temp2[:,:-1] = x_test
temp2[:,-1] = y_test

x_test = temp2

In [52]:
#implementação do knn

In [58]:
# Distância euclidiana
def euclidianDis(x1, x2):
    tam = len(x1) - 1
    soma = 0
    for i in range(tam):
        soma += math.pow(x1[i] - x2[i], 2)
    return math.sqrt(soma)

In [67]:
# algoritmo knn
def knn(train, instN, k):
    class0 = 0 
    class1 = 0
    nei = {}
    tamTrain = len(train)
    
    for i in range(tamTrain):
        e = euclidianDis(train[i], instN)
        nei[i] = e
    kNeibo = sorted(nei, key = nei.get)[:k]
    
    for j in kNeibo:
        if train[j][-1] == 0:
            class0 += 1
        elif train[j][-1] == 1:
            class1 += 1
    if class0 >= class1:
        return 0
    elif class0 < class1:
        return 1
    

In [68]:
acerto0 = 0
erro0 = 0
acerto1 = 0
erro1 = 0
k = 9

In [69]:
# continuação do programa
for sample in x_test:
    clas = knn(x_train, sample, k)
    if clas == 1:
        if sample[-1] == clas:
            acerto1 += 1
        else:
            erro1 += 1
    else:
        if sample[-1] == clas:
            acerto0 += 1
        else:
            erro0 += 1

In [70]:
#métricas
print('Taxa de acerto: %.2f%%' % (100 * (acerto1+acerto0) / len(x_test)))
print('Taxa de erro: %.2f%%' % (100 * (erro1+erro0) / len(x_test)))
print('Precisão - classe 0: %.2f%%' %(100 * acerto1/(acerto1+erro1)))
print('Precisão - classe 1: %.2f%%' %(100 * acerto0/(acerto0+erro0)))

Taxa de acerto: 77.18%
Taxa de erro: 22.82%
Precisão - classe 0: 74.36%
Precisão - classe 1: 79.84%
