In [1]:
%matplotlib widget
import matplotlib.pyplot as plt
from sklearn.neighbors import KDTree
from data_utils import load_dataset
from nearest import knn
import numpy as np

# Here I just extend the KNN regression class I have and add new methods
class knn_class(knn):
    """
    k-nearest-neighbour classifier layered on top of the ``knn`` regression class.

    The methods below read two attributes from the instance:
    self.k - number of neighbours that vote on each prediction
    self.dist_type - 'l2' selects the euclidean metric, any other value selects manhattan
    """

    def cl(self, x_train, y_train, x_test):
        """
        Classify - predict a label row for every point of x_test by majority vote
        among its k nearest training points.
        x_train - model feature data, one row per training point
        y_train - model target data, one label row per training point (e.g. one hot encoded)
        x_test - feature data to classify; either a 2-D batch or a single 1-D point

        Returns an array with one predicted label row per test point.
        Ties are broken in favour of the label row that np.unique sorts first.
        """
        # KDTree.query only accepts 2-D input, so promote a lone point to a batch of one.
        x_test = np.atleast_2d(x_test)

        metric = 'euclidean' if self.dist_type == 'l2' else 'manhattan'
        tree = KDTree(x_train, metric=metric)
        _, ind = tree.query(x_test, k=self.k)

        # Fancy indexing gathers the labels of the k neighbours of every test point.
        set_nn = y_train[ind]

        out = []
        for neighbours in set_nn:
            # Count identical label rows and keep the most frequent one.
            vote, count = np.unique(neighbours, axis=0, return_counts=True)
            out.append(vote[np.argmax(count)])

        return np.array(out)

    def cl_validation(self, xdata, ydata, xvalid, yvalid):
        """
        Perform classification validation with the validation set
        xdata - model feature data, used for prediction
        ydata - model target data, used for prediction
        xvalid - validation feature data, the points being classified
        yvalid - validation target data, used for the accuracy calculation

        Returns the fraction of validation points whose predicted label row
        matches the true row exactly (1.0 means every point is correct).
        """
        # NOTE(review): the prediction is only a local here; code that reads
        # ``self.pred`` afterwards sees whatever the base class left there - confirm.
        pred = self.cl(xdata, ydata, xvalid)

        # A point counts as correct only if every entry of its label row matches.
        count = np.sum(np.all(np.equal(pred, yvalid), axis=1))

        # Normalize by the number of validation points.
        score = count / yvalid.shape[0]
        return score

    def cl_cv(self, xdata, ydata, folds):
        """
        Run cross validation on set
        This is not used any where, I was just able to rip it from nearest.py
        This helped to give me some insight during some explorations
        xdata - feature data, split row-wise into ``folds`` parts
        ydata - target data, split the same way
        folds - number of folds; must be at least 2 so a training set remains

        Returns the fraction of all points classified correctly when each fold
        is predicted from the remaining folds.
        """
        xfolds = np.array_split(xdata, folds, axis=0)
        yfolds = np.array_split(ydata, folds, axis=0)

        correct = 0
        for i in range(folds):
            validx = xfolds[i]
            validy = yfolds[i]
            if validx.shape[0] == 0:
                # array_split yields empty folds when folds exceeds the number of rows.
                continue

            trainx = np.vstack([xfolds[j] for j in range(folds) if j != i])
            trainy = np.vstack([yfolds[j] for j in range(folds) if j != i])

            # Classify the whole fold at once: one KD-tree per fold, not one per point.
            pred = self.cl(trainx, trainy, validx)
            correct += np.sum(np.all(np.equal(pred, validy), axis=1))

        return correct / ydata.shape[0]

    
    



# Iris data set

In [2]:
# L2 (euclidean) distance on iris.
# Same exploration as Question 1: score every candidate k on the validation set,
# plot accuracy against k, and read off the best k. Comparing this sweep with the
# L1 sweep decides which distance metric to use (higher accuracy for a given k wins).
xtrain, xvalid, xtest, ytrain, yvalid, ytest = load_dataset('iris')

folds = 100  # exclusive upper bound of the k sweep (k = 1 .. folds - 1)
scores, predicted = [], []
for k in range(1, folds):
    nn = knn_class(k=k)
    accuracy = nn.cl_validation(xtrain, ytrain, xvalid, yvalid)
    scores.append(accuracy)
    predicted.append(nn.pred)

fig, ax = plt.subplots()
ax.plot(list(range(1, folds)), scores)
ax.set_xlabel("k")
ax.set_ylabel("Fraction of Correct Classifications")
ax.set_title("Correct Classifications compared to Knn")
plt.show()

for j in range(folds - 1):
    print(f'{j + 1}: Correct Classifications: {scores[j]}')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

1: Correct Classifications: 0.7741935483870968
2: Correct Classifications: 0.8064516129032258
3: Correct Classifications: 0.8064516129032258
4: Correct Classifications: 0.8064516129032258
5: Correct Classifications: 0.8387096774193549
6: Correct Classifications: 0.8064516129032258
7: Correct Classifications: 0.8709677419354839
8: Correct Classifications: 0.8387096774193549
9: Correct Classifications: 0.8709677419354839
10: Correct Classifications: 0.8709677419354839
11: Correct Classifications: 0.8709677419354839
12: Correct Classifications: 0.8387096774193549
13: Correct Classifications: 0.9032258064516129
14: Correct Classifications: 0.8387096774193549
15: Correct Classifications: 0.8709677419354839
16: Correct Classifications: 0.9032258064516129
17: Correct Classifications: 0.8387096774193549
18: Correct Classifications: 0.8387096774193549
19: Correct Classifications: 0.8709677419354839
20: Correct Classifications: 0.8709677419354839
21: Correct Classifications: 0.8387096774193549
2

In [4]:
# L1 (manhattan) distance on iris: same validation sweep over k as the L2 cell,
# so the two accuracy curves can be compared directly.
xtrain, xvalid, xtest, ytrain, yvalid, ytest = load_dataset('iris')

folds = 100  # exclusive upper bound of the k sweep (k = 1 .. folds - 1)
scores, predicted = [], []
for k in range(1, folds):
    nn = knn_class(k=k, dist_type='l1')
    accuracy = nn.cl_validation(xtrain, ytrain, xvalid, yvalid)
    scores.append(accuracy)
    predicted.append(nn.pred)

fig, ax = plt.subplots()
ax.plot(list(range(1, folds)), scores)
ax.set_xlabel("k")
ax.set_ylabel("Fraction of Correct Classifications")
ax.set_title("Correct Classifications compared to Knn")
plt.show()

for j in range(folds - 1):
    print(f'{j + 1}: Correct Classifications: {scores[j]}')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

1: Correct Classifications: 0.7741935483870968
2: Correct Classifications: 0.8064516129032258
3: Correct Classifications: 0.7741935483870968
4: Correct Classifications: 0.8064516129032258
5: Correct Classifications: 0.8064516129032258
6: Correct Classifications: 0.7741935483870968
7: Correct Classifications: 0.8064516129032258
8: Correct Classifications: 0.8064516129032258
9: Correct Classifications: 0.8387096774193549
10: Correct Classifications: 0.8387096774193549
11: Correct Classifications: 0.8709677419354839
12: Correct Classifications: 0.8709677419354839
13: Correct Classifications: 0.8709677419354839
14: Correct Classifications: 0.8709677419354839
15: Correct Classifications: 0.8387096774193549
16: Correct Classifications: 0.8387096774193549
17: Correct Classifications: 0.8064516129032258
18: Correct Classifications: 0.8387096774193549
19: Correct Classifications: 0.8387096774193549
20: Correct Classifications: 0.8387096774193549
21: Correct Classifications: 0.8387096774193549
2

In [5]:
# Test accuracy (fraction of correct classifications, not an RMSE):
# predict from train + validation combined and score on the held-out test set.
xtrain, xvalid, xtest, ytrain, yvalid, ytest = load_dataset('iris')

nn = knn_class(k=16, dist_type='l2')
x_fit = np.vstack([xtrain, xvalid])
y_fit = np.vstack([ytrain, yvalid])
nn.cl_validation(x_fit, y_fit, xtest, ytest)

1.0

In [6]:
# Inspect the shape of the iris test targets (one label row per test point).
ytest.shape

(15, 3)

# MNIST

In [7]:
# L2 (euclidean) distance on mnist_small: score every candidate k on the
# validation set, then plot and print the accuracy for each k.
xtrain, xvalid, xtest, ytrain, yvalid, ytest = load_dataset('mnist_small')

folds = list(range(1, 50))  # candidate k values
scores, predicted = [], []
for k in folds:
    nn = knn_class(k=k)
    accuracy = nn.cl_validation(xtrain, ytrain, xvalid, yvalid)
    scores.append(accuracy)
    predicted.append(nn.pred)

fig, ax = plt.subplots()
ax.plot(folds, scores)
ax.set_xlabel("k")
ax.set_ylabel("Fraction of Correct Classifications")
ax.set_title("Correct Classifications compared to Knn")
plt.show()

for j in range(len(folds)):
    print(f'{folds[j]}: Correct Classifications: {scores[j]}')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

1: Correct Classifications: 0.95
2: Correct Classifications: 0.945
3: Correct Classifications: 0.949
4: Correct Classifications: 0.947
5: Correct Classifications: 0.946
6: Correct Classifications: 0.943
7: Correct Classifications: 0.949
8: Correct Classifications: 0.946
9: Correct Classifications: 0.943
10: Correct Classifications: 0.937
11: Correct Classifications: 0.94
12: Correct Classifications: 0.937
13: Correct Classifications: 0.935
14: Correct Classifications: 0.937
15: Correct Classifications: 0.933
16: Correct Classifications: 0.931
17: Correct Classifications: 0.932
18: Correct Classifications: 0.927
19: Correct Classifications: 0.927
20: Correct Classifications: 0.925
21: Correct Classifications: 0.926
22: Correct Classifications: 0.922
23: Correct Classifications: 0.922
24: Correct Classifications: 0.921
25: Correct Classifications: 0.917
26: Correct Classifications: 0.916
27: Correct Classifications: 0.917
28: Correct Classifications: 0.916
29: Correct Classifications: 0.

In [8]:
# L1 (manhattan) distance on mnist_small: same validation sweep over k as the
# L2 cell, so the two metrics can be compared.
xtrain, xvalid, xtest, ytrain, yvalid, ytest = load_dataset('mnist_small')

folds = list(range(1, 50))  # candidate k values
scores, predicted = [], []
for k in folds:
    nn = knn_class(k=k, dist_type='l1')
    accuracy = nn.cl_validation(xtrain, ytrain, xvalid, yvalid)
    scores.append(accuracy)
    predicted.append(nn.pred)

fig, ax = plt.subplots()
ax.plot(folds, scores)
ax.set_xlabel("k")
ax.set_ylabel("Fraction of Correct Classifications")
ax.set_title("Correct Classifications compared to Knn")
plt.show()

for j in range(len(folds)):
    print(f'{folds[j]}: Correct Classifications: {scores[j]}')

KeyboardInterrupt: 

In [9]:
# Test accuracy (fraction of correct classifications, not an RMSE):
# predict from train + validation combined and score on the held-out test set.
xtrain, xvalid, xtest, ytrain, yvalid, ytest = load_dataset('mnist_small')

nn = knn_class(k=1, dist_type='l2')
x_fit = np.vstack([xtrain, xvalid])
y_fit = np.vstack([ytrain, yvalid])
nn.cl_validation(x_fit, y_fit, xtest, ytest)

0.959