In [1]:
import numpy as np
from math import sqrt, log
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import turicreate as tc
from sklearn import linear_model

In [2]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’
    features_sframe = data_sframe[features]
    # this will convert the features_sframe into a numpy matrix:
    features_matrix = features_sframe.to_numpy()
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’
    output_sarray = data_sframe[output]
    # this will convert the SArray into a numpy array:
    output_array = output_sarray.to_numpy()
    return(features_matrix, output_array)

def predict_output(feature_matrix, weights):
    predictions = np.matmul(feature_matrix, weights)
    return(predictions)

def feature_derivative(errors, feature):
    derivative = -2 * np.dot(feature, errors)
    return(derivative)

def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features/norms
    return (normalized_features, norms)

In [5]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = tc.SFrame('m_ef92e6258b8f7992.frame_idx')
(train_and_validation, test) = sales.random_split(.8, seed=1)
(train, validation) = train_and_validation.random_split(.8, seed=1)

feature_list = ['bedrooms',  
                'bathrooms',  
                'sqft_living',  
                'sqft_lot',  
                'floors',
                'waterfront',  
                'view',  
                'condition',  
                'grade',  
                'sqft_above',  
                'sqft_basement',
                'yr_built',  
                'yr_renovated',  
                'lat',  
                'long',  
                'sqft_living15',  
                'sqft_lot15']
features_train, output_train = get_numpy_data(train, feature_list, 'price')
features_test, output_test = get_numpy_data(test, feature_list, 'price')
features_valid, output_valid = get_numpy_data(validation, feature_list, 'price')

features_train, norms = normalize_features(features_train)
features_test = features_test / norms
features_valid = features_valid / norms

In [7]:
print(features_test[0])
print (features_train[9])

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059
  0.          0.05102365  0.0116321   0.01564352  0.01362084  0.02481682
  0.01350306  0.          0.01345386 -0.01346927  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346254  0.01195898  0.00156612]


In [10]:
euclidian_distance = np.sqrt(np.sum(np.square(features_test[9] - features_test[0])))
print(euclidian_distance)

0.058352853645149


In [12]:
dist_dict = {}
for i in range(0,10):
    dist_dict[i] = np.sqrt(np.sum((features_train[i] - features_test[0])**2))
    print (i, np.sqrt(np.sum((features_train[i] - features_test[0])**2)))
    
print (min(dist_dict.items(), key=lambda x: x[1])) 

0 0.0602747091729555
1 0.08546881148827083
2 0.06149946437120284
3 0.05340273978820058
4 0.05844484063938139
5 0.05987921510184001
6 0.05463140497261526
7 0.05543108324159792
8 0.05238362784097273
9 0.05972359371666126
(8, 0.05238362784097273)


In [18]:
for i in range(3):
    print (features_train[i]-features_test[0])

[ 0.00000000e+00 -3.87821276e-03 -1.20498190e-02 -1.05552733e-02
  2.08673616e-04 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -3.47633726e-03 -5.50336860e-03 -2.48168183e-02
 -1.63756198e-04  0.00000000e+00 -1.70072004e-05  1.30577772e-05
 -5.14364795e-03  6.69281453e-04]
[ 0.00000000e+00 -3.87821276e-03 -4.51868214e-03 -2.26610387e-03
  7.19763456e-04  0.00000000e+00  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -3.47633726e-03  1.30705004e-03 -1.45830788e-02
 -1.91048898e-04  6.65082271e-02  4.23240653e-05  6.22415897e-06
 -2.89330197e-03  1.47606982e-03]
[ 0.00000000e+00 -7.75642553e-03 -1.20498190e-02 -1.30002801e-02
  1.60518166e-03 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -5.21450589e-03 -8.32384500e-03 -2.48168183e-02
 -3.13866046e-04  0.00000000e+00  4.71047219e-05  1.56530415e-05
  3.72914476e-03  1.64764925e-03]


In [19]:
print (features_train[0:3] - features_test[0])

[[ 0.00000000e+00 -3.87821276e-03 -1.20498190e-02 -1.05552733e-02
   2.08673616e-04 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
   0.00000000e+00 -3.47633726e-03 -5.50336860e-03 -2.48168183e-02
  -1.63756198e-04  0.00000000e+00 -1.70072004e-05  1.30577772e-05
  -5.14364795e-03  6.69281453e-04]
 [ 0.00000000e+00 -3.87821276e-03 -4.51868214e-03 -2.26610387e-03
   7.19763456e-04  0.00000000e+00  0.00000000e+00 -5.10236549e-02
   0.00000000e+00 -3.47633726e-03  1.30705004e-03 -1.45830788e-02
  -1.91048898e-04  6.65082271e-02  4.23240653e-05  6.22415897e-06
  -2.89330197e-03  1.47606982e-03]
 [ 0.00000000e+00 -7.75642553e-03 -1.20498190e-02 -1.30002801e-02
   1.60518166e-03 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
   0.00000000e+00 -5.21450589e-03 -8.32384500e-03 -2.48168183e-02
  -3.13866046e-04  0.00000000e+00  4.71047219e-05  1.56530415e-05
   3.72914476e-03  1.64764925e-03]]


In [20]:
# verify that vectorization works
results = features_train[0:3] - features_test[0]
print (results[0] - (features_train[0]-features_test[0]))
# should print all 0's if results[0] == (features_train[0]-features_test[0])
print (results[1] - (features_train[1]-features_test[0]))
# should print all 0's if results[1] == (features_train[1]-features_test[0])
print (results[2] - (features_train[2]-features_test[0]))
# should print all 0's if results[2] == (features_train[2]-features_test[0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [21]:
diff = features_train - features_test[0]

In [22]:
print (diff[-1].sum())

-0.0934339605841801


In [23]:
print (np.sum(diff**2, axis=1)[15]) # take sum of squares across each row, and print the 16th sum
print (np.sum(diff[15]**2)) # print the sum of squares for the 16th row -- should be same as above

0.003307059028786791
0.0033070590287867904


In [24]:
distances = np.sqrt(np.sum(diff**2, axis=1))

In [25]:
distances[100]

0.023708232449603735

In [26]:
def compute_distances(train_matrix, query_vector):
    diff = train_matrix - query_vector
    distances = np.sqrt(np.sum(diff**2, axis=1))
    return distances

In [28]:
third_house_distance = compute_distances(features_train, features_test[2])
print (third_house_distance.argsort()[0], min(third_house_distance))
print (third_house_distance[382])

382 0.002860495267507927
0.002860495267507927


In [35]:
def k_nearest_neighbors(k, feature_train, features_query):
    distances = np.array(compute_distances(feature_train, features_query))
    sort_arg = np.argsort(distances, axis = 0)[:k]
    neighbors = sort_arg
    return neighbors

In [36]:
print(k_nearest_neighbors(4, features_train, features_test[2]))

[ 382 1149 4087 3142]


In [41]:
def predict_output_of_query(k, features_train, output_train, features_query):
    k_neighbors = k_nearest_neighbors(k, features_train, features_query)
    prediction = np.mean(output_train[k_neighbors])
    return prediction

In [42]:
print(predict_output_of_query(4, features_train, output_train, features_test[2]))

413987.5


In [49]:
def predict_output(k, features_train, output_train, features_query):
    num_of_rows = features_query.shape[0]
    predictions = []
    for i in range(num_of_rows):
        avg_value = predict_output_of_query(k, features_train, output_train, features_query[i])
        predictions.append(avg_value)
    
    return predictions

In [50]:
predicted_values = predict_output(10, features_train, output_train, features_test[0:10])
print (predicted_values)
print (predicted_values.index(min(predicted_values)))

[881300.0, 431860.0, 460595.0, 430200.0, 766750.0, 667420.0, 350032.0, 512800.7, 484000.0, 457235.0]
6


In [51]:
print (min(predicted_values))

350032.0


In [55]:
rss_all = []
for k in range(1,16):    
    predict_value = predict_output(k, features_train, output_train, features_valid)
    residual = (output_valid - predict_value)
    rss = np.sum(residual**2)
    rss_all.append(rss)

In [56]:
print (rss_all)

[105453830251561.0, 83445073504025.5, 72692096019202.56, 71946721652091.69, 69846517419718.6, 68899544353180.836, 68341973450051.09, 67361678735491.5, 68372727958976.09, 69335048668556.74, 69523855215598.83, 69049969587246.17, 70011254508263.69, 70908698869034.34, 71106928385945.16]


In [57]:
print (rss_all.index(min(rss_all)))

7


In [58]:
predict_value = predict_output(14, features_train, output_train, features_test)
residual = (output_test - predict_value)
rss = sum(residual**2)
print (rss)

133006256365677.28
