In this notebook, you will implement k-nearest neighbors regression. You will:

- Find the k-nearest neighbors of a given query input

- Predict the output for the query input using the k-nearest neighbors
- Choose the best value of k using a validation set

In [14]:
import pandas as pd
import numpy as np

In [15]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [16]:
sales=pd.read_csv('kc_house_data_small.csv',dtype=dtype_dict)
train=pd.read_csv('kc_house_data_small_train.csv',dtype=dtype_dict)
test=pd.read_csv('kc_house_data_small_test.csv',dtype=dtype_dict)
valid=pd.read_csv('kc_house_data_validation.csv',dtype=dtype_dict)

In [17]:
def get_data(data_frame, features, output):
    '''args: 
    data_frame= array, dataframe alike
    features: list-like []
    output: str'''
    
    data_frame["constant"]=1
    features= ["constant"] + features
    
    features_frame= data_frame[features]
    feature_matrix= np.array(features_frame)
    
    output_array = np.array(data_frame[output])
    return(feature_matrix, output_array)

In [18]:
def normalize_features(feature_matrix):
    norms=np.linalg.norm(feature_matrix, axis=0)
    X_normalized = feature_matrix/norms
    return X_normalized, norms

In [19]:
def table_weights(features_list, weights):
    featuress=list(features_list)

    featuress.insert(0,'constant')
    df=pd.DataFrame(data={'Features':featuress, 'Weights':weights})
    return df

In [20]:
feature_list = ['bedrooms',  
                'bathrooms',  
                'sqft_living',  
                'sqft_lot',  
                'floors',
                'waterfront',  
                'view',  
                'condition',  
                'grade',  
                'sqft_above',  
                'sqft_basement',
                'yr_built',  
                'yr_renovated',  
                'lat',  
                'long',  
                'sqft_living15',  
                'sqft_lot15']
target = 'price'

In [21]:
X_train,y_train = get_data(train,feature_list,target)
X_test,y_test = get_data(test,feature_list,target)
X_valid,y_valid = get_data(valid,feature_list,target)

In [22]:
#features in test and valid must be divideb by norms from training set

X_train_norm, norms = normalize_features(X_train)


In [23]:
X_test_norm = X_test/norms
X_valid_norm = X_valid/norms

In [24]:
# compute single distance

print(X_test_norm[0])

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059
  0.          0.05102365  0.0116321   0.01564352  0.01362084  0.02481682
  0.01350306  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]


In [25]:
X_train_norm[9]

array([ 0.01345102,  0.01163464,  0.00602491,  0.0083488 ,  0.00050756,
        0.01279425,  0.        ,  0.        ,  0.01938684,  0.01390535,
        0.0096309 ,  0.        ,  0.01302544,  0.        ,  0.01346821,
       -0.01346251,  0.01195898,  0.00156612])

In [36]:
queryH=X_test_norm[0]
trainH=X_train_norm[9]

distance= np.sqrt( np.sum( (queryH - trainH)**2))
di = np.sqrt( np.sum( (trainH - queryH)**2))

In [37]:

round(distance,3)

0.06

In [38]:
# distance queryH to first 10 train houses

In [105]:
queryH=X_test_norm[0]
trainH=X_train_norm[0:10]

distance_list = []
for house in trainH:
    distance= np.sqrt( np.sum( (queryH - house)**2))
    distance_list.append(round(distance,3))
min((v,i) for i,v in enumerate(distance_list))
    

(0.052, 8)

In [67]:
distance_list

[0.06027470916295592,
 0.08546881147643746,
 0.06149946435279315,
 0.05340273979294363,
 0.05844484060170442,
 0.059879215098128345,
 0.05463140496775461,
 0.055431083236146074,
 0.052383627840220305,
 0.05972359371398078]

In [51]:
print(distance_list.index(min(distance_list)))

8


In [61]:
print(min(range(len(distance_list)), key=distance_list.__getitem__))

8


In [62]:
np.argmin(distance_list)

8

In [65]:
min((v,i) for i,v in enumerate(distance_list))

(0.052383627840220305, 8)

In [94]:
#vectorize for loop for mutliple points
diff= X_train_norm - queryH
distances=np.sqrt( np.sum(diff**2, axis=1))

In [100]:
distances[100]

0.023708232416678195

In [115]:
def compute_distances(features_instances, features_query, show_min_dist=False):
    diff=features_instances - features_query
    distances = np.sqrt( np.sum(diff**2, axis=1))
    
    if show_min_dist==True:
        min_dist=min((v,i) for i,v in enumerate(distances))
        print('The closest point for this query has index {} and the distance is: {:.3e}'.format(min_dist[1], min_dist[0]))
    return distances

In [117]:
third_hosue=compute_distances(X_train_norm, X_test_norm[2], True)

The closest point for this query has index 382 and the distance is: 2.860e-03


In [119]:
#what is predicted price for this house
y_train[382]


249000.0

In [120]:
# Perform k-nearest neighbor regression


In [121]:
def k_NN(k, feature_train, features_query):
    
    distances = compute_distances(feature_train, features_query, show_min_dist=False)
    NN=np.argsort(distances, axis=0)[:k]
    return NN

In [122]:
third_house_k=k_NN(4, X_train_norm, X_test_norm[2])
third_house_k

array([ 382, 1149, 4087, 3142], dtype=int64)

In [132]:
foo=k_NN(4, X_train_norm, X_test_norm)

ValueError: operands could not be broadcast together with shapes (5527,18) (1741,18) 

In [125]:
np.average(y_train[third_house_k])

413987.5

In [123]:
# function that predicts the value of given query

In [126]:
def predict_y_query(k, X_train, y_train, query):
    
    indices=k_NN(k, X_train, query)
    predict_avg = np.average(y_train[indices])
    return predict_avg

In [129]:
pred_3rd=predict_y_query(4, X_train_norm, y_train, X_test_norm[2] )
round(pred_3rd,0)

413988.0

In [130]:
# function to predict the value of each and every house in a query set

In [135]:
def predict_y(k, X_train, y_train, query_set):
    
    rows = query_set.shape[0]
    y_predicted = []
    
    for i in range(rows):
        avg_prediction = predict_y_query(k, X_train, y_train, query_set[i])
        y_predicted.append(avg_prediction)
    return y_predicted

In [144]:
t=predict_y(10, X_train_norm, y_train, X_test_norm[:10])

In [147]:
min_pred=min((v,i) for i,v in enumerate(t))
min_pred


(350032.0, 6)

In [142]:
def predict_vectorized(k, X_train, y_train, query_set):
    predict_y=predict_y_query(k,X_train,y_train, query_set)
    y_predicted=np.apply_along_axis(predict_y,0,query_set)
    return y_predicted

In [143]:
tt= predict_vectorized(10, X_train_norm, y_train, X_test_norm)

ValueError: operands could not be broadcast together with shapes (5527,18) (1741,18) 

In [148]:
# Choosing the best value of k using a validation set


In [155]:
k_val=np.arange(1,16)

rss=[]
for k in k_val:
    pred_validation=predict_y(k, X_train_norm, y_train,X_valid_norm)
    rss_= np.sum( (y_valid - pred_validation)**2 )
    rss.append(rss_)

In [156]:
min((v,i) for i,v in enumerate(rss))

(67361678735491.5, 7)

In [158]:
# use best k on test data and calc rss

In [159]:
best_k=min((v,i) for i,v in enumerate(rss))[1]

In [162]:
best_prediction = predict_y(best_k, X_train_norm, y_train, X_test_norm)
rss = np.sum((y_test - best_prediction)**2)
print('best rss is: {:.2e}'.format(rss))

best rss is: 1.32e+14
