In [None]:
#KNN Regressor construction

"""A KDTree is used here because it stores data in a form that is easy to query by distance.
We will use the Euclidean distance metric."""

import random
import sys

import pandas as pd
import numpy as np
from scipy.spatial import KDTree
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

sys.setrecursionlimit(10000)  # raise the recursion limit; per the original note, KDTree recurses and errors out otherwise


In [None]:
class Regression(object):
    """
    Performs KNN regression.

    The training points are stored in a KDTree; a prediction queries the
    tree for the closest ``k`` points and aggregates their values with
    ``metric`` (the mean by default).
    """

    def __init__(self):
        # number of nearest neighbours consulted for each prediction
        self.k = 5
        # aggregation applied to the neighbours' values
        self.metric = np.mean
        self.kdtree = None
        self.houses = None
        self.values = None

    def set_data(self, houses, values):
        """
        sets houses and values data and builds the KDTree over the houses
        :param houses: pandas.Dataframe with houses parameters
        :param values: pandas.Series with houses values
        """
        # keep the caller's frame; the tree below is built from it
        self.houses = houses
        self.values = values
        self.kdtree = KDTree(self.houses)

    def regress(self, query_point):
        """
        calculates predicted value for house with particular parameters

        The KDTree is queried for the closest ``k`` houses and ``metric``
        (the mean by default) of their values is the regression value.
        :param query_point: pandas.Series with house parameters
        :return: house value
        :raises Exception: if the aggregated value is NaN
        """
        # KDTree.query pads missing neighbours with the out-of-range index n,
        # so never ask for more neighbours than the tree holds.
        neighbours = min(self.k, self.kdtree.n)
        _, indexes = self.kdtree.query(query_point, neighbours)
        # a single-neighbour query yields a scalar index; normalise to 1-D
        nearest_values = self.values.iloc[np.atleast_1d(indexes)]
        value = self.metric(nearest_values)
        if np.isnan(value):
            raise Exception('Unexpected result')
        return value
        
        

In [None]:
#KNN TESTING
"""We have built a reasonable KNN tool, but we don't know how well it performs. We want to determine this using
cross-validation, which includes the following:
: Take a training set and split it into two categories: testing and training
: Use the training data to train the model
: Use the testing data to determine how well the model performs"""

class RegressionTest(object):
    """
    Take in king county housing data, calculate and plot
    the kNN regression error rate.

    Methods: load_csv_file, tests, test_regression, plot_error_rates.
    """

    def __init__(self):
        self.houses = None
        self.values = None

    def load_csv_file(self, csv_file, limit=None):
        """
        Loads CSV file with houses data
        :param csv_file:CSV file name
        :param limit: number of rows of file to read
        """
        houses = pd.read_csv(csv_file, nrows=limit)
        self.values = houses['AppraisedValue']
        # Select the feature columns before normalising so that unrelated
        # (possibly non-numeric) columns in the file cannot break the maths.
        features = houses[['lat', 'long', 'SqFtLot']]
        spread = features.max() - features.min()
        # a constant column has zero spread; divide by 1 so it maps to 0, not NaN
        spread = spread.replace(0, 1)
        # mean normalisation: centre each column and scale it by its range
        self.houses = (features - features.mean()) / spread

    def tests(self, folds):
        """
        Calculates mean absolute errors for series of tests

        Runs ``folds`` independent random hold-out splits, each holding out
        a ``1 / folds`` share of the rows.
        :param folds: how many times split the data
        :return: list of error values
        :raises ValueError: if folds is smaller than 1
        """
        if folds < 1:
            raise ValueError('folds must be at least 1')
        holdout = 1 / float(folds)
        errors = []
        for _ in range(folds):
            values_regress, values_actual = self.test_regression(holdout)
            errors.append(mean_absolute_error(values_actual, values_regress))
        return errors

    def test_regression(self, holdout):
        """
        Trains on a random subset of the data and predicts the held-out rows
        :param holdout: share of the rows (0..1) held out for testing
        :return: tuple (predicted values, actual values) for the held-out rows
        """
        row_count = len(self.houses)
        test_size = int(round(row_count * holdout))
        # work with positions so a non-default DataFrame index cannot misalign rows
        test_positions = random.sample(range(row_count), test_size)
        held_out = set(test_positions)
        train_positions = [pos for pos in range(row_count) if pos not in held_out]

        regression = Regression()
        regression.set_data(houses=self.houses.iloc[train_positions],
                            values=self.values.iloc[train_positions])

        test_houses = self.houses.iloc[test_positions]
        values_regress = [regression.regress(row) for _, row in test_houses.iterrows()]
        values_actual = self.values.iloc[test_positions].tolist()
        return values_regress, values_actual

    def plot_error_rates(self):
        """
        plots MAE vs #folds
        """
        folds_range = range(2, 11)
        records = []
        for folds in folds_range:
            errors = self.tests(folds)
            records.append({'max': max(errors), 'min': min(errors)})
        # build the frame once from the collected rows, indexed by fold count
        errors_df = pd.DataFrame(records, index=list(folds_range))
        errors_df.plot(title='Mean Absolute Error of KNN over different folds_range')
        plt.xlabel('#folds_range')
        plt.ylabel('MAE')
        plt.show()
    

In [None]:
#Running
def main():
    """Load the first 100 rows of the housing CSV and plot the kNN error rates."""
    runner = RegressionTest()
    runner.load_csv_file('king_county_data_geocoded.csv', limit=100)
    runner.plot_error_rates()


# run the experiment when this cell/script is executed directly
if __name__ == '__main__':
    main()