In [33]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
import math

In [34]:
# Complete data set; it is not referenced again in the cells shown here, which
# work on the pre-made splits rather than train_test_split(sales, test_size=0.2).
sales = pd.read_csv('kc_house_data_small.csv')

# Pre-made train / test / validation splits, read in that order.
train_data, test_data, cv_data = (
    pd.read_csv(csv_name)
    for csv_name in (
        'kc_house_data_small_train.csv',
        'kc_house_data_small_test.csv',
        'kc_house_data_validation.csv',
    )
)

In [35]:
def get_numpy_data(data_frame, features, output):
    """Build the design matrix and target vector from a pandas DataFrame.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table holding the feature columns and the output column. It is not
        modified.
    features : list of str
        Names of the feature columns to extract, in order.
    output : str
        Name of the target column.

    Returns
    -------
    (feature_matrix, output_array) : tuple of numpy.ndarray
        ``feature_matrix`` has a leading column of ones (the intercept term)
        followed by the requested features; ``output_array`` holds the values
        of the ``output`` column.
    """
    # Add the intercept column on a copy so the caller's frame is left
    # untouched and re-running a cell gives the same result.
    with_constant = data_frame.assign(constant=1)
    # Put 'constant' first so it becomes column 0 of the matrix.
    columns = ['constant'] + list(features)
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    feature_matrix = with_constant[columns].to_numpy()
    output_array = with_constant[output].to_numpy()
    return (feature_matrix, output_array)

In [36]:
def normalize_features(features):
    """Scale every column of ``features`` by its Euclidean (L2) norm.

    Returns a tuple ``(normalized_features, norms)`` where ``norms`` holds the
    per-column norms, so the same scaling can be applied to other data.
    """
    column_norms = np.linalg.norm(features, axis=0)
    return (features / column_norms, column_norms)

In [37]:
# Columns used as model inputs, in the order they appear in the feature matrix.
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]

# Feature matrix and price vector for each split: train, test, validation.
(features_train, output_train) = get_numpy_data(train_data, feature_list, 'price')
(features_test, output_test) = get_numpy_data(test_data, feature_list, 'price')
(features_valid, output_valid) = get_numpy_data(cv_data, feature_list, 'price')

In [38]:
# Normalise the training columns, then divide the other splits by the same
# training norms so all three sets share one scale.
features_train, norms = normalize_features(features_train)
features_test, features_valid = features_test / norms, features_valid / norms

In [39]:
# Sanity check: the first normalised test row and the tenth normalised training row.
print(features_test[0], features_train[9], sep='\n')

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


In [40]:
# Euclidean distance between the first test row and the tenth training row.
x = np.square(features_test[0] - features_train[9])
euc_dist = math.sqrt(x.sum())
print(euc_dist)

0.05972359371398078


In [41]:
# Distance from the first test row to each of the first ten training rows.
for i in range(10):
    x = np.square(features_test[0] - features_train[i])
    euc_dist = math.sqrt(x.sum())
    print(euc_dist)

0.06027470916295592
0.08546881147643746
0.06149946435279315
0.05340273979294363
0.05844484060170442
0.059879215098128345
0.05463140496775461
0.055431083236146074
0.052383627840220305
0.05972359371398078


In [42]:
# The same ten distances, computed in one vectorised step via broadcasting.
x = np.square(features_train[:10] - features_test[0])
euc_dist = np.sqrt(x.sum(axis=1))
print(euc_dist)

[ 0.06027471  0.08546881  0.06149946  0.05340274  0.05844484  0.05987922
  0.0546314   0.05543108  0.05238363  0.05972359]


In [43]:
def compute_distances(features_instances, features_query):
    """Return the Euclidean distance from ``features_query`` to every row of
    ``features_instances`` (one distance per row)."""
    squared_diffs = np.square(features_instances - features_query)
    return np.sqrt(squared_diffs.sum(axis=1))

In [44]:
# Nearest training row to the third test row (1-nearest-neighbour).
distances = compute_distances(features_train, features_test[2])
# np.argmin returns the first index of the smallest distance, exactly what the
# manual scan produced, without rebinding the built-in `min` at notebook scope.
min_index = int(np.argmin(distances))
min_distance = distances[min_index]
print(min_distance, min_index)

0.00286049555751 382


In [45]:
# Price of the nearest training row found above; use the computed index rather
# than a number copied from a previous output, which goes stale if the data or
# the query changes.
print(output_train[min_index])

249000


In [57]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Find the ``k`` training rows closest to a query point.

    Parameters
    ----------
    k : int
        Number of neighbours to return.
    feature_train : numpy.ndarray
        2-D array with one training example per row.
    features_query : numpy.ndarray
        Feature vector of the query point, broadcast against each row.

    Returns
    -------
    list of (int, float)
        ``(row_index, distance)`` pairs sorted by increasing Euclidean
        distance; at most ``k`` entries.
    """
    # Use the argument that was passed in, not the notebook-level
    # `features_train` global the body used to read by mistake.
    diffs = np.asarray(feature_train) - features_query
    distances = np.sqrt(np.sum(diffs ** 2, axis=1))
    # A stable sort keeps tied distances in row order, as list.sort() did.
    order = np.argsort(distances, kind='stable')[:k]
    return [(int(i), distances[i]) for i in order]

In [58]:
# Four closest training rows to the third test row, as (index, distance) pairs.
output = k_nearest_neighbors(4, features_train, features_test[2, :])
print(output)

[(382, 0.0028604955575117085), (1149, 0.0032258402701799303), (4087, 0.0035021563333717835), (3142, 0.0035931538334055863)]


In [59]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for one query point as the mean of the outputs of
    its ``k`` nearest training neighbours."""
    neighbors = k_nearest_neighbors(k, features_train, features_query)
    # Each neighbour is an (index, distance) pair; only the index is needed.
    total = sum((output_train[neighbors[pos][0]] for pos in range(k)), 0.0)
    return total / k

In [60]:
# 4-nearest-neighbour estimate for the third test row.
prediction = predict_output_of_query(4, features_train, output_train, features_test[2, :])
print(prediction)

413987.5


In [61]:
def predict_output(k, features_train, output_train, features_query):
    """Return a list with the k-nearest-neighbour prediction for every row of
    ``features_query``."""
    return [
        predict_output_of_query(k, features_train, output_train, features_query[row, :])
        for row in range(len(features_query))
    ]

In [62]:
# 10-nearest-neighbour estimates for the first ten test rows.
predictions = predict_output(10, features_train, output_train, features_test[:10])
print(predictions)

[881300.0, 431860.0, 460595.0, 430200.0, 766750.0, 667420.0, 350032.0, 512800.70000000001, 484000.0, 457235.0]


In [64]:
# Smallest of the predictions computed above.
np.asarray(predictions).min()

350032.0

In [65]:
# Residual sum of squares on the validation set for k = 1 .. 15.
for k in range(1, 16):
    predictions = predict_output(k, features_train, output_train, features_valid)
    RSS = sum((predictions - output_valid) ** 2)
    print(RSS)

1.05451197752e+14
8.3445073504e+13
7.26920960192e+13
7.19348033496e+13
6.98465174197e+13
6.8903104922e+13
6.83383144186e+13
6.73616787355e+13
6.8372727959e+13
6.93335579816e+13
6.95238552156e+13
6.90519486845e+13
7.00112545083e+13
7.09115306803e+13
7.11087974867e+13


In [68]:
# Residual sum of squares on the test set with k = 8.
test_predictions = predict_output(8, features_train, output_train, features_test)
RSS = sum(np.square(test_predictions - output_test))
print(RSS)

1.33091689367e+14
