In [1]:
import numpy as np
import pandas
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Load dataset

In [14]:
X, Y = load_digits(return_X_y = True)
classes = load_digits().target_names
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size = 0.30, random_state = 0)

In [20]:
print(X.shape, X, sep = '\n')
print()
print(Y.shape, Y, sep = '\n')

(1797, 64)
[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]

(1797,)
[0 1 2 ... 8 9 8]


# K-Neighbors Classifier

## SciKitLearn KNN

In [104]:
model = KNeighborsClassifier()
model.fit(Xtrain, Ytrain)
Ypred = model.predict(Xtest)
print('Accuracy', accuracy_score(Ytest, Ypred))
m = confusion_matrix(Ytest, Ypred)
df = pandas.DataFrame(m, index = classes, columns = classes)
df

Accuracy 0.9814814814814815


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,45,0,0,0,0,0,0,0,0,0
1,0,51,0,0,0,1,0,0,0,0
2,0,0,52,0,0,0,0,1,0,0
3,0,0,1,53,0,0,0,0,0,0
4,0,0,0,0,47,0,0,1,0,0
5,0,0,0,0,0,55,1,0,0,1
6,0,0,0,0,0,0,60,0,0,0
7,0,0,0,0,0,0,0,53,0,0
8,0,1,0,1,0,0,1,0,58,0
9,0,0,0,0,0,1,0,0,0,56


## My KNN

In [105]:
class myKNeighborsClassifier:
    def __init__(self, k = 5):
        self.k = k
        self.Xtrain = None
        self.Ytrain = None
    def fit(self, Xtrain, Ytrain):
        self.Xtrain = Xtrain
        self.Ytrain = Ytrain
    def predict(self, X):
        pred = np.zeros(len(X))
        for iX, x in enumerate(X):
            
            #1. Calcular la distancia de los datos X a los datos Xtrain
            results = []
            for xT in Xtrain:
                d = np.abs(xT - x)
                results.append(np.mean(d))            
                
            #2. Ordenar las distancias de menor a mayor
            results = np.argsort(results)
            
            #3. Calcular la clase de acuerdo a la mayoría de los k vecinos más cercanos
            results = Ytrain[results]
            countKNN = np.zeros(len(np.unique(Ytest)))
            for i in results:
                countKNN[i] += 1
                if countKNN[i] == 5:
                    pred[iX] = i
                    break
                    
        #4. Regresar valores
        return pred

In [106]:
model = myKNeighborsClassifier()
model.fit(Xtrain, Ytrain)
Ypred = model.predict(Xtest)
print('Accuracy', accuracy_score(Ytest, Ypred))
m = confusion_matrix(Ytest, Ypred)
df = pandas.DataFrame(m, index = classes, columns = classes)
df

Accuracy 0.9703703703703703


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,45,0,0,0,0,0,0,0,0,0
1,0,52,0,0,0,0,0,0,0,0
2,0,1,51,0,0,0,0,1,0,0
3,0,0,1,51,0,0,0,0,2,0
4,0,0,0,0,47,0,0,1,0,0
5,0,0,0,0,0,54,1,0,0,2
6,0,0,0,0,0,0,60,0,0,0
7,0,0,0,0,0,0,0,53,0,0
8,0,3,0,2,0,0,0,1,55,0
9,0,0,0,0,0,1,0,0,0,56


# Nearest Centroid Classifier

## SciKitLearn NC

In [99]:
model = NearestCentroid()
model.fit(Xtrain, Ytrain)
Ypred = model.predict(Xtest)
print('Accuracy', accuracy_score(Ytest, Ypred))
m = confusion_matrix(Ytest, Ypred)
df = pandas.DataFrame(m, index = classes, columns = classes)
df

Accuracy 0.8925925925925926


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,45,0,0,0,0,0,0,0,0,0
1,0,42,1,0,0,1,1,0,1,6
2,1,2,44,3,0,0,0,2,0,1
3,0,0,1,44,0,0,0,2,2,5
4,0,1,0,0,44,0,0,3,0,0
5,0,0,0,0,0,47,1,0,0,9
6,0,1,0,0,0,0,59,0,0,0
7,0,0,0,0,0,0,0,52,1,0
8,0,3,0,1,0,1,0,1,51,4
9,0,0,0,0,0,1,0,2,0,54


## My NC

In [187]:
class myNearestCentroid:
    def __init__(self):
        self.centroids = None
    def fit(self, Xtrain, Ytrain):

        # 1. calcular el número de clases
        nClasses = len(np.unique(Ytrain))
        
        # 2. calcular los centroides de cada clase y guardarlo en self.centroids
        self.centroids = np.zeros([nClasses, Xtrain.shape[1]])
        for i in range(nClasses):
            iTrain = Xtrain[Ytrain == i]
            self.centroids[i, :] = np.sum(iTrain, axis = 0)/len(iTrain)
    def predict(self,X):
        pred = np.zeros(len(X))
        # 1. Calcular la distancia de cada muestra en X a cada uno de los centroides
        for i, x in enumerate(X):
            distances = []
            for c in self.centroids:
                d = np.abs(c - x)
                d = np.sum(d)
                distances.append(d)
        
            # 2. asignar la clase de acuerdo al centroide más cercano
            distances = np.argsort(distances)
            pred[i] = distances[0]
        return pred

In [189]:
model = myNearestCentroid()
model.fit(Xtrain, Ytrain)
Ypred = model.predict(Xtest)
print('Accuracy', accuracy_score(Ytest, Ypred))
m = confusion_matrix(Ytest, Ypred)
df = pandas.DataFrame(m, index = classes, columns = classes)
df

Accuracy 0.8740740740740741


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,44,0,0,0,0,0,1,0,0,0
1,0,43,1,0,0,1,2,0,0,5
2,1,3,43,4,0,0,0,2,0,0
3,0,0,1,46,0,0,0,2,2,3
4,0,1,0,0,43,0,0,4,0,0
5,0,0,0,1,0,46,3,0,0,7
6,0,1,0,0,0,0,59,0,0,0
7,0,0,0,0,1,0,0,51,1,0
8,1,5,0,2,0,1,0,1,46,5
9,0,0,0,3,0,1,0,2,0,51
