# K-Nearest Neighbors

In [9]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [20]:
# Load the occupancy dataset; the target column and the two timestamp columns
# are excluded from the feature matrix.
df = pd.read_csv('./Occupancy_Estimation.csv')
feature_data = df.drop(columns=["Room_Occupancy_Count", "Date", "Time"])

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle (and therefore every metric computed below) reproducible on
# Restart & Run All; without it each run evaluates a different split.
X_train, X_test, Y_train, Y_test = train_test_split(
    feature_data,
    df["Room_Occupancy_Count"],
    test_size=0.2,
    random_state=42,
)

[0 3 0 ... 0 0 0]


In [22]:
class KNN:
    """
    K-nearest-neighbours classifier backed by a scikit-learn BallTree.

    The training features are indexed once in a BallTree. Each prediction looks
    up the K nearest training points and returns their majority label, breaking
    ties by discarding the furthest neighbour until one label wins.
    """
    def __init__(self, x_train, y_train, K=5):
        """
        Creates a kNN instance

        :param x_train: array-like with shape (n_rows, n_features) - e.g. [[1,2],[3,4]]
        :param y_train: array-like with shape (n_rows,) - e.g. [1,-1]
        :param K: The number of nearest points to consider in classification
        :raises ValueError: if x_train and y_train differ in length, or if K is
            not between 1 and n_rows
        """
        # Imported here (as in the original cell) so the class definition itself
        # only needs numpy and the standard library.
        from sklearn.neighbors import BallTree

        x_train = np.asarray(x_train)
        # Store labels as a positional ndarray: a pandas Series with a shuffled
        # integer index would otherwise be looked up by index label, not by the
        # row position the BallTree returns.
        y_train = np.asarray(y_train)

        if len(x_train) != len(y_train):
            raise ValueError(
                "x_train and y_train must have the same number of rows "
                f"(got {len(x_train)} and {len(y_train)})"
            )
        if not 1 <= K <= len(x_train):
            raise ValueError(
                f"K must be between 1 and the number of training rows ({len(x_train)}), got {K}"
            )

        # Build the BallTree on training features
        self.balltree = BallTree(x_train)

        # Cache training labels and parameter K
        self.y_train = y_train
        self.K = K

    def majority(self, neighbor_indices, neighbor_distances=None):
        """
        Given indices of nearest neighbors in training set, return the majority label.
        Break ties by considering 1 fewer neighbor until a clear winner is found.

        The inputs are copied, so the caller's sequences are never modified.

        :param neighbor_indices: The indices (into self.y_train) of the K nearest neighbors
        :param neighbor_distances: Corresponding distances from query point to K nearest
            neighbors. When omitted, neighbor_indices is assumed to be ordered nearest-first
            (the default ordering of BallTree.query) and ties drop the last entry.
        :return: the label that occurs strictly most often among the remaining neighbors
        :raises ValueError: if neighbor_indices is empty, or if neighbor_distances is given
            with a different length
        """
        indices = list(neighbor_indices)
        distances = None if neighbor_distances is None else list(neighbor_distances)

        if distances is not None and len(distances) != len(indices):
            raise ValueError("neighbor_indices and neighbor_distances must have the same length")

        # With a single neighbour left there is always a unique winner, so the
        # loop terminates for any non-empty input.
        while indices:
            label_count = Counter(self.y_train[i] for i in indices)
            ranked = label_count.most_common(2)

            # A clear winner exists when only one label is present or the top
            # count strictly exceeds the runner-up (compare counts, not labels).
            if len(ranked) == 1 or ranked[0][1] > ranked[1][1]:
                return ranked[0][0]

            # Tie: discard the furthest neighbour and vote again.
            if distances is None:
                indices.pop()
            else:
                furthest_neighbor = distances.index(max(distances))
                del indices[furthest_neighbor]
                del distances[furthest_neighbor]

        raise ValueError("majority() needs at least one neighbor index")

    def classify(self, x):
        """
        Given a query point, return the predicted label

        :param x: a single query point (array-like with n_features values)
        :return: the majority label among the K nearest training points
        """
        # BallTree.query expects a 2-D (n_queries, n_features) array and returns
        # one row of distances / indices per query; we send exactly one query.
        query_point = np.asarray(x).reshape(1, -1)
        neighbor_distances, neighbor_indices = self.balltree.query(query_point, k=self.K)
        return self.majority(neighbor_indices[0], neighbor_distances[0])

    def predict(self, X):
        """
        Given an array of query points, return yhat, a list of predictions

        :param X: an (m x p) array-like of points to predict labels for
        :return: a list of m predicted labels, in the same order as the rows of X
        """
        # np.asarray makes row iteration well-defined for DataFrames too
        # (iterating a DataFrame directly yields column names, not rows).
        return [self.classify(row) for row in np.asarray(X)]

In [29]:
# Fit a 3-nearest-neighbour model on the training split and label the held-out rows.
knn = KNN(X_train.values, Y_train.values, K=3)
val_yhat = knn.predict(X_test.values)

# The true labels are passed first and the predictions second.
c_matrix = sklearn.metrics.confusion_matrix(Y_test.values, val_yhat)

# For each class: diagonal entry (correct predictions) divided by that row's total.
acc = []
for i, class_row in enumerate(c_matrix):
    acc.append(class_row[i] / class_row.sum())

print(acc)
print(c_matrix)


[0.9993823347745522, 0.9893617021276596, 0.9743589743589743, 0.9936305732484076]
[[1618    1    0    0]
 [   0   93    1    0]
 [   0    1  152    3]
 [   0    0    1  156]]


The classes (occupancy counts) most often confused with one another are: