In [3]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
import math

In [4]:
# Full house-sales table; used by the worked examples further down.
sales = pd.read_csv('kc_house_data.csv')
# The train/test partitions are read from pre-split CSV files. The random split
# below is left disabled (note: train_test_split is not imported in this notebook).
#train_data , test_data = train_test_split(sales, test_size=0.2)
train_data = pd.read_csv('kc_house_train_data.csv')
test_data = pd.read_csv('kc_house_test_data.csv')

In [22]:
def get_numpy_data(data_frame, features, output):
    """Extract a feature matrix (with intercept column) and an output vector.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table holding both the feature columns and the output column.
    features : list of str
        Names of the feature columns to extract, in the desired order.
    output : str
        Name of the column to use as the regression target.

    Returns
    -------
    (feature_matrix, output_array) : tuple of numpy.ndarray
        ``feature_matrix`` has one row per row of ``data_frame``; its first
        column is the constant 1 (intercept term) followed by ``features``.
        ``output_array`` is the 1-D array of ``data_frame[output]``.

    Notes
    -----
    The caller's ``data_frame`` is NOT modified: the constant column is added
    to a copy of the selected feature columns only.
    """
    # Put 'constant' first so that weights[0] is the intercept.
    feature_columns = ['constant'] + list(features)
    # Selecting with a list returns a new frame, and .assign() returns another
    # new frame, so the input is left untouched.
    features_frame = data_frame[list(features)].assign(constant=1)[feature_columns]
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available in
    # both old and new pandas releases and yields the same ndarray.
    feature_matrix = features_frame.values
    output_array = data_frame[output].values
    return (feature_matrix, output_array)

In [23]:
# Single-feature example: the feature names must be passed as a list, hence the [].
example_features, example_output = get_numpy_data(sales, ['sqft_living'], 'price')
# First row of the feature matrix (the ':' selects all of its columns) ...
print(example_features[0, :])
# ... and the output value that belongs to that row.
print(example_output[0])

[   1 1180]
221900.0


In [26]:
# With all weights equal to 1 the prediction is simply the sum of the row's features.
my_weights = np.ones(2)
my_features = example_features[0]
predicted_value = my_weights.dot(my_features)
print(predicted_value)

1181.0


In [27]:
def predict_output(feature_matrix, weights):
    """Return the linear-model predictions for ``feature_matrix``.

    ``feature_matrix`` is expected to hold one feature per column and
    ``weights`` the matching weight per feature; the result is their
    dot product computed with ``np.dot``.
    """
    return np.dot(feature_matrix, weights)

In [29]:
test_predictions = predict_output(example_features, my_weights)
# The first two predictions should be 1181.0 and 2571.0, one per line.
print(*test_predictions[:2], sep='\n')

1181.0
2571.0


In [33]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    Both arguments are expected to be numpy arrays of equal length
    (one entry per data point).
    """
    inner_product = np.dot(errors, feature)
    return 2 * inner_product

In [34]:
example_features, example_output = get_numpy_data(sales, ['sqft_living'], 'price')
# All-zero weights make every prediction 0 ...
my_weights = np.zeros(2)
test_predictions = predict_output(example_features, my_weights)
# ... so the element-wise prediction errors reduce to -example_output.
errors = test_predictions - example_output
# Differentiate with respect to the 'constant' column (column 0, all rows).
feature = example_features[:, 0]
derivative = feature_derivative(errors, feature)
print(derivative)
# Should print the same value as the derivative above.
print(-2 * np.sum(example_output))

-23345850016.0
-23345850016.0


In [38]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, max_iterations=None):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Parameters
    ----------
    feature_matrix : array-like, shape (n_samples, n_weights)
        One column per weight (include a column of ones for the intercept).
    output : array-like, shape (n_samples,)
        Observed target values.
    initial_weights : array-like, shape (n_weights,)
        Starting weights. They are copied as floats, so the caller's array is
        never modified and integer inputs are not truncated on update.
    step_size : float
        Multiplier applied to the gradient in every update.
    tolerance : float
        Iteration stops once the gradient magnitude falls below this value.
    max_iterations : int or None, optional
        Upper bound on the number of weight updates. ``None`` (the default)
        iterates until the tolerance is met.

    Returns
    -------
    numpy.ndarray
        The weights after the last update.

    Raises
    ------
    FloatingPointError
        If the gradient becomes NaN or infinite (the step size is too large);
        without this check the loop would never terminate.
    """
    # Convert once up front instead of on every iteration.
    features = np.asarray(feature_matrix, dtype=float)
    targets = np.asarray(output, dtype=float)
    # dtype=float matters: an integer weight array would silently truncate
    # every update and could stall the descent forever.
    weights = np.array(initial_weights, dtype=float)

    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        # Prediction errors for the current weights.
        errors = np.dot(features, weights) - targets
        # Gradient of the RSS: entry i is 2 * dot(errors, features[:, i]).
        gradient = 2 * np.dot(features.T, errors)
        # All weights are updated from the same errors (simultaneous update).
        weights -= step_size * gradient
        iterations += 1

        gradient_magnitude = np.sqrt(np.dot(gradient, gradient))
        if not np.isfinite(gradient_magnitude):
            raise FloatingPointError(
                'gradient descent diverged (non-finite gradient); try a smaller step_size'
            )
        if gradient_magnitude < tolerance:
            break
    return weights

In [39]:
# Try the gradient descent on a one-feature model: price from sqft_living.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

# Starting point and stopping settings for this model.
initial_weights = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7

In [40]:
# Fit the one-feature model on the training data.
my_computed_weights = regression_gradient_descent(
    feature_matrix=simple_feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)
print(my_computed_weights)

[-46999.88716555    281.91211918]


In [43]:
# Build the test-set matrices for the one-feature model and predict with the fitted weights.
test_simple_feature_matrix, test_output = get_numpy_data(test_data, simple_features, my_output)
test_predictions = predict_output(test_simple_feature_matrix, my_computed_weights)
print(test_predictions)

[ 356134.443255    784640.86440132  435069.83662406 ...,  663418.65315598
  604217.10812919  240550.47439317]


In [46]:
# Residual sum of squares of the one-feature model on the test set.
test_residuals = test_output - test_predictions
test_RSS = np.dot(test_residuals, test_residuals)
print(test_RSS)

2.75400044902e+14


In [47]:
# Two-feature model. sqft_living15 is the average squarefeet for the nearest 15 neighbors.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)

# Starting point and stopping settings for this model.
initial_weights = np.array([-100000.0, 1.0, 1.0])
step_size = 4e-12
tolerance = 1e9

In [49]:
# Fit the two-feature model on the training data.
model_2_weights = regression_gradient_descent(
    feature_matrix=feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)
print(model_2_weights)

[ -9.99999688e+04   2.45072603e+02   6.52795267e+01]


In [50]:
# Build the test-set matrices for the two-feature model and predict with its fitted weights.
test_simple_feature_matrix_model_2, test_output_model_2 = get_numpy_data(
    test_data, model_features, my_output
)
test_predictions_model_2 = predict_output(test_simple_feature_matrix_model_2, model_2_weights)
print(test_predictions_model_2)

[ 366651.41162949  762662.39850726  386312.09557541 ...,  682087.39916306
  585579.27901327  216559.20391786]


In [51]:
# Residual sum of squares of the two-feature model on the test set.
test_residuals_model_2 = test_output_model_2 - test_predictions_model_2
test_RSS_model_2 = np.dot(test_residuals_model_2, test_residuals_model_2)
print(test_RSS_model_2)

2.7026344363e+14


In [53]:
# Show the actual test-set prices (pandas abbreviates the printout of a long Series).
print(test_data['price'])

0        310000
1        650000
2        233000
3        580500
4        535000
5        605000
6        775000
7        292500
8        289000
9        571000
10       349000
11       360000
12       243500
13       247500
14       470000
15       480000
16       770000
17       519950
18       527700
19       420000
20       890000
21       282950
22       255000
23       420000
24       807100
25       284000
26       917500
27       425000
28       260000
29       297000
         ...   
4199     436952
4200     435000
4201     349000
4202     450000
4203     337500
4204     850000
4205     579000
4206     890776
4207     810000
4208     600000
4209     406000
4210    1378000
4211     579950
4212    1450000
4213     670000
4214     459000
4215     589999
4216     388000
4217     305000
4218     337000
4219    1700000
4220     399950
4221     579000
4222     490000
4223     399950
4224    1088000
4225     350000
4226     610685
4227     400000
4228     402101
Name: price, dtype: floa