In [1]:
# Explicit column -> Python type mapping handed to pd.read_csv(dtype=...),
# so every file is parsed with the same column types.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [2]:
import pandas as pd
import numpy as np

In [3]:
def get_numpy_data(data_frame, features:list, output:str):
    """Build a feature matrix (with an intercept column) and a target vector.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table containing the feature columns and the output column.
    features : list
        Names of the columns to use as features, in the desired column order.
    output : str
        Name of the target column.

    Returns
    -------
    feature_matrix : numpy.ndarray
        2-D array whose first column is the constant 1, followed by the
        columns named in ``features``.
    output_array : numpy.ndarray
        1-D array holding the values of the ``output`` column.

    Notes
    -----
    The caller's ``data_frame`` is not modified: the intercept column is added
    to a copy, so calling this function repeatedly (or re-running a cell)
    never leaves a stray 'constant' column behind.
    """
    columns = ['constant', *features]
    # assign() returns a new frame; the original frame stays untouched.
    with_intercept = data_frame.assign(constant=1)
    feature_matrix = with_intercept[columns].to_numpy()
    # Use a separate name instead of rebinding the `output` parameter.
    output_array = with_intercept[output].to_numpy()
    return feature_matrix, output_array

In [4]:
def normalize_features(features):
    """Scale every column of a feature matrix to unit Euclidean (L2) norm.

    Parameters
    ----------
    features : array-like, shape (n_rows, n_columns)
        Feature matrix to normalize column by column.

    Returns
    -------
    new_features : numpy.ndarray
        ``features`` with each column divided by its divisor from ``norms``.
    norms : numpy.ndarray
        Per-column divisors. This is the column's 2-norm, except that a column
        whose norm is 0 (all entries zero) reports 1.0, so that neither this
        division nor a later ``other_features / norms`` produces NaN or inf.
    """
    norms = np.linalg.norm(features, axis=0)
    # An all-zero column has norm 0; dividing by it would give 0/0 = NaN.
    # Dividing by 1 instead leaves such a column as all zeros.
    norms = np.where(norms == 0, 1.0, norms)
    new_features = features / norms
    return new_features, norms

In [5]:
# Columns used as model inputs; their order fixes the column order of the
# feature matrices (after the leading intercept column).
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]

# Read the three splits with identical column types.
training, validation, test = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'kc_house_data_small_train.csv',
        'kc_house_data_validation.csv',
        'kc_house_data_small_test.csv',
    )
)

# Convert each split into (feature matrix, price vector).
(
    (train_features, train_output),
    (valid_features, valid_output),
    (test_features, test_output),
) = (
    get_numpy_data(split, feature_list, 'price')
    for split in (training, validation, test)
)

# Normalize the training columns, then scale the other two splits with the
# norms computed on the training data.
train_features, norms = normalize_features(train_features)
valid_features = valid_features / norms
test_features = test_features / norms

# Euclidean distance

In [8]:
# Show the first test row and the tenth training row, then the Euclidean
# distance between them.
query_row = test_features[0]
train_row = train_features[9]
print(query_row)
print(train_row)
gap = query_row - train_row
print('distance=', np.sqrt(np.sum(gap**2)))

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059
  0.          0.05102365  0.0116321   0.01564352  0.01362084  0.02481682
  0.01350306  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]
distance= 0.05972359371398078


In [9]:
# Euclidean distance from the first test row to each of the first 10 training rows.
query = test_features[0]
for i in range(10):
    gap = train_features[i] - query
    print(i, np.sqrt(np.sum(gap**2)))

0 0.06027470916295592
1 0.08546881147643746
2 0.06149946435279315
3 0.05340273979294363
4 0.05844484060170442
5 0.059879215098128345
6 0.05463140496775461
7 0.055431083236146074
8 0.052383627840220305
9 0.05972359371398078


In [10]:
# Sanity check: subtracting the query from a slice of rows at once (broadcasting)
# must match subtracting it from each row individually, so every printed
# vector should be all zeros.
results = train_features[:3] - test_features[0]
for i in range(3):
    row_by_row = train_features[i] - test_features[0]
    print(results[i] - row_by_row)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


# 1-NN regression

In [11]:
# Difference between every training row and the first test row, in one shot.
diff = train_features - test_features[0]
last_row_total = diff[-1].sum()
print(last_row_total)

-0.09343399874654643


In [12]:
# Row-wise sum of squares over the whole matrix versus the same quantity
# computed for row 15 alone; the two printed numbers should agree up to
# floating-point rounding.
squared_row_sums = np.sum(diff ** 2, axis=1)
print(squared_row_sums[15])
print(np.sum(diff[15]**2))

0.0033070590284564457
0.0033070590284564453


In [13]:
def compute_distances(features, features_query):
    """Return the Euclidean distance from every row of ``features`` to ``features_query``.

    ``features_query`` is subtracted from ``features`` using NumPy
    broadcasting, and the squared differences are summed along axis 1, so the
    result has one distance per row of ``features``.
    """
    deltas = features - features_query
    squared_totals = np.square(deltas).sum(axis=1)
    return np.sqrt(squared_totals)

In [14]:
# 1-nearest-neighbour lookup for the third test row: find the training row
# with the smallest distance and report its price and its index.
dist = compute_distances(train_features, test_features[2])
nnindex, _closest = min(enumerate(dist), key=lambda pair: pair[1])
print(train_output[nnindex])
print(nnindex)

249000.0
382


# k-NN regression

In [15]:
def knn(k, train_feature_matrix, query_feature):
    """Return the indices of the ``k`` training rows closest to ``query_feature``.

    Parameters
    ----------
    k : int
        Number of neighbours to return.
    train_feature_matrix : numpy.ndarray
        Training feature matrix, one row per training example.
    query_feature : numpy.ndarray
        Feature vector of the query point.

    Returns
    -------
    list of int
        Row indices into ``train_feature_matrix``, ordered from nearest to
        farthest. Rows at equal distance keep their original relative order.
    """
    dist = compute_distances(train_feature_matrix, query_feature)
    # A stable argsort keeps ties in index order, like Python's sorted(),
    # while doing the sort inside NumPy instead of through a per-element
    # Python key function.
    order = np.argsort(dist, kind='stable')
    # tolist() yields plain Python ints, as before.
    return order[:k].tolist()

In [16]:
def predict_knn(k, train_feature_matrix, train_output, query_feature):
    """Predict the output for one query point by averaging its k nearest neighbours.

    Returns a pair ``(indices, prediction)``: the neighbour indices as given
    by ``knn`` and the mean of ``train_output`` at those indices.
    """
    neighbour_ids = knn(k, train_feature_matrix, query_feature)
    neighbour_targets = train_output[neighbour_ids]
    prediction = np.mean(neighbour_targets)
    return neighbour_ids, prediction

In [19]:
# 4-nearest-neighbour prediction for the third test row; the cell displays
# the (neighbour indices, averaged output) pair returned by predict_knn.
predict_knn(4, train_features, train_output, test_features[2])

([382, 1149, 4087, 3142], 413987.5)

In [20]:
# 10-NN predictions for the first 10 test rows, then the row with the lowest
# predicted value.
price_bar = [
    predict_knn(10, train_features, train_output, test_features[i])[1]
    for i in range(10)
]
idx, _lowest = min(enumerate(price_bar), key=lambda pair: pair[1])
print(idx, price_bar[idx])

6 350032.0


# choose optimal k value

In [21]:
# Validation RSS for every candidate k.
K = list(range(1,16))

# Rank the training rows once per validation row (for the largest k) and reuse
# prefixes of that ranking for the smaller k values. The k nearest neighbours
# are always the first k entries of the ranking, so the predictions are the
# same as calling predict_knn separately for every k, without recomputing
# distances and re-sorting for each k.
errors_by_k = {k: [] for k in K}
for i in range(valid_features.shape[0]):
    neighbours = knn(max(K), train_features, valid_features[i])
    for k in K:
        pr = np.mean(train_output[neighbours[:k]])
        errors_by_k[k].append(pr-valid_output[i])

rss = []
for k in K:
    pred_error = errors_by_k[k]
    rss.append( np.sum(np.array(pred_error)**2))
    print(k, '%.4e'%rss[-1])

1 1.0545e+14
2 8.3445e+13
3 7.2692e+13
4 7.1935e+13
5 6.9847e+13
6 6.8903e+13
7 6.8338e+13
8 6.7362e+13
9 6.8373e+13
10 6.9334e+13
11 6.9524e+13
12 6.9052e+13
13 7.0011e+13
14 7.0912e+13
15 7.1109e+13


In [22]:
# Pick the k with the smallest validation RSS. argmin works on the whole rss
# list, so this stays correct if the candidate list K changes length (and it
# returns the first minimum on ties, as the previous min() did).
best_k = K[int(np.argmin(rss))]
print(best_k)

8


In [23]:
# RSS on the test set using the k chosen on the validation set.
n_test = test_features.shape[0]
print(n_test)
pred_error = [
    predict_knn(best_k, train_features, train_output, test_features[i])[1] - test_output[i]
    for i in range(n_test)
]
rss = np.sum(np.array(pred_error)**2)
print(best_k, '%.4e'%rss)

1741
8 1.3309e+14
