In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Explicit column dtypes passed to pd.read_csv, grouped by target type.
dtype_dict = {
    # kept as text
    'id': str,
    'date': str,
    'zipcode': str,
    # floating point columns
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'floors': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # integer columns
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

In [12]:
# Read the four CSV files, forcing the column dtypes declared in dtype_dict.
data = pd.read_csv('./kc_house_data_small.csv', dtype=dtype_dict)
train = pd.read_csv('./kc_house_data_small_train.csv', dtype=dtype_dict)
validate = pd.read_csv('kc_house_data_validation.csv', dtype=dtype_dict)
test = pd.read_csv('kc_house_data_small_test.csv', dtype=dtype_dict)

# Columns used as model inputs (order fixes the column order of the feature matrix).
features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
    'lat', 'long',
    'sqft_living15', 'sqft_lot15',
]

# Column used as the prediction target.
output = 'price'

In [13]:
def get_numpy_data(data, features, output):
    """Extract a feature matrix (with intercept column) and a target vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in `features` and `output`.
    features : sequence of str
        Names of the input columns, in the order they should appear.
    output : str
        Name of the target column.

    Returns
    -------
    (data_np, output_np) : tuple of numpy.ndarray
        `data_np` has one row per row of `data`; its first column is the
        constant 1 and the remaining columns follow `features`.
        `output_np` holds the values of the `output` column.

    The input frame is left untouched: the constant column is added to a
    copy rather than written into `data`.
    """
    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    with_constant = data.assign(constant=1)
    columns = ['constant'] + list(features)
    # `.values` replaces DataFrame.as_matrix(), which was deprecated in
    # pandas 0.23 and removed in pandas 1.0.
    data_np = with_constant[columns].values
    output_np = data[output].values
    return (data_np, output_np)

In [14]:
def normalize_features(features):
    """Scale every column of `features` to unit Euclidean (L2) norm.

    Parameters
    ----------
    features : array-like, 2-D
        One row per observation, one column per feature.

    Returns
    -------
    (normalized_features, norms) : tuple of numpy.ndarray
        `normalized_features` is `features` divided column-wise by `norms`.
        `norms` holds the per-column L2 norms, except that a norm of 0 is
        reported as 1 so that dividing by `norms` (here and for any other
        data set scaled with the same norms) never produces NaN/inf.
    """
    features = np.asarray(features)
    norms = np.linalg.norm(features, axis=0)
    # An all-zero column has norm 0; dividing by 1 instead keeps it at zero.
    norms = np.where(norms == 0, 1.0, norms)
    normalized_features = features / norms
    return (normalized_features, norms)

In [26]:
# Convert each frame into a (feature matrix, target vector) pair.
train_np, train_out_np = get_numpy_data(train, features, output)
valid_np, valid_out_np = get_numpy_data(validate, features, output)
test_np, test_out_np = get_numpy_data(test, features, output)

# Normalise the training features, then scale the other two matrices with the
# norms computed on the training matrix.
train_np, norms = normalize_features(train_np)
valid_np = valid_np / norms
test_np = test_np / norms

In [27]:
# Show the first test row and the 10th training row.
# print(...) with a single argument prints the same under Python 2 and 3.
print(test_np[0])
print(train_np[9])

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


## Q1

In [28]:
# Euclidean distance between the first test row and the 10th training row.
print(str(np.sqrt(np.sum((train_np[9] - test_np[0])**2))))

0.059723593714


## Q2

In [29]:
# Euclidean distance from the first test row to each of the first
# `num_train_data` training rows; then report the index of the closest one.
num_train_data = 10
dist = [np.sqrt(np.sum((train_np[i] - test_np[0])**2))
        for i in range(num_train_data)]
print(str(dist.index(min(dist))))
print(dist)

8
[0.060274709162955922, 0.085468811476437465, 0.061499464352793153, 0.053402739792943632, 0.05844484060170442, 0.059879215098128345, 0.054631404967754607, 0.055431083236146074, 0.052383627840220305, 0.059723593713980783]


## Q3

In [30]:
# verify that vectorization works
results = train_np[0:3] - test_np[0]
# Each line should print all 0's: row i of the broadcast difference must
# equal train_np[i] - test_np[0].
for i in range(3):
    print(results[i] - (train_np[i] - test_np[0]))

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [31]:
# Difference between every training row and the first test row (broadcast).
diff = train_np - test_np[0]
# Sum of the entries in the last row of the difference matrix.
print(diff[-1].sum())

-0.0934339987465


In [37]:
# Row-wise Euclidean length of `diff`: square, sum across columns, square-root.
distances = np.sqrt(np.sum(diff**2, axis=1))

In [36]:
# The row-wise sum (axis=1) at index 15 should match the sum computed
# directly on row 15.
print(np.sum(diff**2, axis=1)[15])
print(np.sum(diff[15]**2))

0.00330705902846
0.00330705902846


In [38]:
# Distance between the query and training row 100.
print(distances[100])

0.0237082324167


In [39]:
def compute_distances(train_np, query):
    """Return the Euclidean distance from `query` to every row of `train_np`.

    `query` is subtracted from `train_np` by broadcasting; the squared
    differences are summed along axis 1 and the square root is taken, giving
    one distance per row of `train_np`.
    """
    offsets = train_np - query
    squared_lengths = np.square(offsets).sum(axis=1)
    return np.sqrt(squared_lengths)

In [40]:
# Distance from the first test row to training row 100, via compute_distances.
compute_distances(train_np, query=test_np[0])[100]

0.023708232416678195

In [44]:
# Index of the training row closest to the third test row.
distances = compute_distances(train_np, test_np[2])
index = np.argmin(distances)
print(index)

382


In [45]:
# Target value of the training row selected by `index`.
print(train_out_np[index])

249000.0


## Q4

In [50]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Return the indices of the `k` rows of `feature_train` nearest to the query.

    Distances come from compute_distances(); the indices are returned as a
    numpy array ordered from the smallest distance upwards.
    """
    ranked = np.argsort(compute_distances(feature_train, features_query))
    return ranked[:k]

In [54]:
query = test_np[2]
k = 4
# Run the neighbour search once and reuse the result for all three printouts.
neighbors = k_nearest_neighbors(k, train_np, query)
print(neighbors)
print(type(neighbors))
print(neighbors.tolist())

[ 382 1149 4087 3142]
<type 'numpy.ndarray'>
[382, 1149, 4087, 3142]


In [57]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for one query as the mean over its k nearest neighbours.

    The neighbour indices come from k_nearest_neighbors(); the prediction is
    the mean of `output_train` at those indices.
    """
    nearest = k_nearest_neighbors(k, features_train, features_query)
    neighbor_outputs = output_train[nearest.tolist()]
    return np.mean(neighbor_outputs)

In [58]:
# Mean output of the 4 nearest training rows to the third test row.
query = test_np[2]
k = 4
print(predict_output_of_query(k, train_np, train_out_np, query))

413987.5


## Q5

In [61]:
# Predict the output for each of the first 10 test rows using k = 10 neighbours.
query = test_np[0:10]
k = 10
# Iterate over the query rows themselves: the number of queries and the number
# of neighbours `k` are independent (the original loop used range(k)).
predictions = [predict_output_of_query(k, train_np, train_out_np, row)
               for row in query]

print(predictions.index(min(predictions)))
print(predictions)
print(min(predictions))

6
[881300.0, 431860.0, 460595.0, 430200.0, 766750.0, 667420.0, 350032.0, 512800.70000000001, 484000.0, 457235.0]
350032.0


## Q6

In [79]:
# Residual sum of squares on the validation rows for every k in k_set.
k_set = list(range(1, 16))

num_valid = valid_np.shape[0]
rss = [0.] * len(k_set)
max_k = max(k_set)

for i in range(num_valid):
    # The k nearest neighbours are the first k of the max_k nearest, so the
    # distance sort is done once per validation row rather than once per
    # (k, row) pair. Each prediction is still the mean output of the k nearest
    # training rows, as in predict_output_of_query.
    nearest = k_nearest_neighbors(max_k, train_np, valid_np[i])
    for pos, k in enumerate(k_set):
        # Named `residual` so the global `diff` matrix used by earlier cells
        # is not overwritten with a scalar.
        residual = valid_out_np[i] - np.mean(train_out_np[nearest[:k]])
        rss[pos] = rss[pos] + residual**2

In [81]:
print(k_set)
print(rss)
# k with the smallest validation RSS (first one if several tie).
best_k = k_set[rss.index(min(rss))]
print(best_k)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[105453830251561.0, 83445073504025.5, 72692096019202.797, 71946721652091.688, 69846517419718.586, 68899544353181.094, 68341973450051.055, 67361678735491.5, 68372727958976.336, 69335048668556.703, 69523855215598.875, 69049969587246.453, 70011254508263.625, 70908698869034.438, 71106928385945.359]
8


In [82]:
# Residual sum of squares on the test rows using the selected best_k.
num_test = test_np.shape[0]
test_rss = 0.
for i in range(num_test):
    # Named `residual` so the global `diff` matrix used by earlier cells is
    # not overwritten with a scalar.
    residual = test_out_np[i] - predict_output_of_query(best_k, train_np, train_out_np, test_np[i])
    test_rss = test_rss + residual**2
print(test_rss)

1.33118823552e+14
