In [1]:
import pandas as pd
import numpy as np
from qgrid import show_grid as grid

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int,
              'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float,
              'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 
              'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

train=pd.read_csv("kc_house_train_data.csv",dtype=dtype_dict)
test=pd.read_csv("kc_house_test_data.csv",dtype=dtype_dict)

Next write a function that takes 
- a data set, 
- a list of features (e.g. [‘sqft_living’, ‘bedrooms’]), to be used as inputs, 
- and a name of the output (e.g. ‘price’). 

This function should return a 
- features_matrix (2D array) consisting of:
    - first a column of ones followed by 
    - columns containing the values of the input features in the data set in the same order as the input list. 
- It should also return an output_array 
    - array of the values of the output in the data set (e.g. ‘price’).

In [2]:
def get_data(data_frame, features, output):
    '''args: 
    data_frame= array, dataframe alike
    features: list-like []
    output: str'''
    
    import numpy as np
    
    data_frame["constant"]=1
    features= ["constant"] + features
    
    features_frame= data_frame[features]
    feature_matrix= np.array(features_frame)
    
    output_array = np.array(data_frame[output])
    return(feature_matrix, output_array)

In [3]:
# test 

(example_features, example_output) = get_data(train, ['sqft_living'], 'price') # the [] around 'sqft_living' makes it a list
print(example_features[0,:]) # this accesses the first row of the data the ':' indicates 'all columns'
print(example_output[0]) # and the corresponding output

[  1.00000000e+00   1.18000000e+03]
221900.0


 If the features matrix is stored as a 2D array (or matrix) and the regression weights are stored as a 1D array 
- then the predicted output is just the dot product between the features matrix and the weights (with the weights on the right). 

Write a function ‘predict_output’ 
-  accepts a 2D array ‘feature_matrix’ and a 1D array ‘weights’ and returns a 1D array ‘predictions’

In [4]:
def predict_output(feature_matrix, weights):
    
    predictions = np.dot(feature_matrix, weights)
    return predictions

In [5]:
#test
my_weights = np.array([1., 1.]) # the example weights
my_features = example_features[0,] # we'll use the first data point
test_predictions = predict_output(example_features, my_weights)
print(test_predictions[0]) # should be 1181.0
print(test_predictions[1]) # should be 2571.0

1181.0
2571.0


If values of a single input feature in an array:
- ‘feature’ and 
- the prediction ‘errors’ (predictions - output) 

-- > then the derivative of the regression cost function with respect to the weight of ‘feature’ is just twice the dot product between ‘feature’ and ‘errors’. 

Write a function that accepts a ‘feature’ array and ‘error’ array and returns the ‘derivative’ (a single number).

In [6]:
def feature_derivative(errors, feature):
    
    derivative = 2*np.dot(errors, feature)
    
    return(derivative)


In [7]:
## test

(example_features, example_output) = get_data(train, ['sqft_living'], 'price') 
my_weights = np.array([0., 0.]) # this makes all the predictions 0
test_predictions = predict_output(example_features, my_weights) 
# just like SFrames 2 numpy arrays can be elementwise subtracted with '-': 
errors = test_predictions - example_output # prediction errors in this case is just the -example_output
feature = example_features[:,0] # let's compute the derivative with respect to 'constant', the ":" indicates "all rows"
derivative = feature_derivative(errors, feature)
print(derivative)
print(-np.sum(example_output)*2)# should be the same as derivative
print(example_output)
print(errors)
print(feature)

-18752698920.0
-18752698920.0
[ 221900.  538000.  180000. ...,  360000.  400000.  325000.]
[-221900. -538000. -180000. ..., -360000. -400000. -325000.]
[ 1.  1.  1. ...,  1.  1.  1.]


 Now we will use our predict_output and feature_derivative to write a gradient descent function

Write a gradient descent function that does the following:

- Accepts:
    - a  feature_matrix 2D array, 
    - a 1D output array, 
    - an array of initial weights, 
    - a step size and a convergence tolerance.
- While not converged updates each feature weight by subtracting the step size times the derivative for that feature given the current weights
- At each step computes the magnitude/length of the gradient (square root of the sum of squared components)
- When the magnitude of the gradient is smaller than the input tolerance returns the final weight vector.

In [18]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    from math import sqrt
    converged = False
    weights= np.array(initial_weights,dtype=object)
    
    while not converged:
        predictions = predict_output(feature_matrix, weights)
        error = predictions - output
        
        gradient_sum_squares = 0.0 #init the gradient
        #while not converged update each weight
        for i in range(len(weights)):
            derivative = feature_derivative(error, feature_matrix[:,i])
            gradient_sum_squares += derivative**2
            weights[i] =weights[i]- (step_size * derivative)
        
        gradient_magnitude = sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return weights

In [19]:
# test
# params to use
simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix, output) = get_data(train, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size, tolerance)

In [20]:
simple_weights

array([-46999.88716554671, 281.91211917520917], dtype=object)

# Q1 What is the value of the weight for sqft_living from your gradient descent predicting house prices (model 1)? Round your answer to 1 decimal place.

In [21]:
ans1= simple_weights[1]
ans1

281.91211917520917

# Q2: What is the predicted price for the 1st house in the Test data set for model 1 (round to nearest dollar)?

In [12]:
(test_simple_feature_matrix, test_output)=get_data(test, simple_features, my_output)

In [13]:
test_predictions = predict_output(test_simple_feature_matrix, simple_weights)
test_predictions[0]

356134.4432550024

# Q3 What is the predicted price for the 1st house in the TEST data set for model 2 (round to nearest dollar)?



In [14]:
test_residuals = test_output - test_predictions
test_rss = (test_residuals**2).sum()
test_rss

275400044902128.78

In [25]:
model_features = ["sqft_living", "sqft_living15"]
my_output = "price"
initial_weights =np.array([-100000., 1., 1.]) #intercept, sqft_living, and sqft_living_15 respectively)
step_size = 4e-12
tolerance = 1e9

In [26]:
(feature_matrix, output) = get_data(train, model_features, my_output)

In [27]:
weights2= regression_gradient_descent(feature_matrix, output,initial_weights, step_size, tolerance)

In [28]:
weights2

array([-99999.96884887619, 245.0726034645802, 65.27952669888788], dtype=object)

In [34]:
(feat_matrix_test, test_output)=get_data(test, model_features, my_output)
predict_2=predict_output(feat_matrix_test, weights2)
predict_2[0]

366651.41162949393

# Which estimate was closer to the true price for the 1st house on the TEST data set, model 1 or model 2?

In [33]:
test.iloc[0]

id                    0114101516
date             20140528T000000
price                     310000
bedrooms                       3
bathrooms                      1
sqft_living                 1430
sqft_lot                   19901
floors                       1.5
waterfront                     0
view                           0
condition                      4
grade                          7
sqft_above                  1430
sqft_basement                  0
yr_built                    1927
yr_renovated                   0
zipcode                    98028
lat                      47.7558
long                    -122.229
sqft_living15               1780
sqft_lot15                 12697
constant                       1
Name: 0, dtype: object

# Which model (1 or 2) has lowest RSS on all of the TEST data?



In [35]:
test_rss_2=test_output -  predict_2
test_rss_2 = (test_rss_2**2).sum()
test_rss_2

270263443629803.4