In [1]:
import pandas as pd
import numpy as np


In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 
              'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 
              'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 
              'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [3]:
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [4]:
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [5]:
sales.insert(0,'constant',1,True)
train_data.insert(0,'constant',1,True)
test_data.insert(0,'constant',1,True)
def get_numpy_data(data,features,output):
    
    
    features=['constant']+features
    
    
    
    features_matrix=data[features].to_numpy()
    
    
    output_array=data[output].to_numpy()
    
    return(features_matrix,output_array)

In [6]:

(example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price') 
print (example_features[0,:])
print (example_output[0]) 

[1.00e+00 1.18e+03]
221900.0


In [7]:
def predict_output(feature_matrix, weights):
    
    predictions = np.dot(feature_matrix, weights)

    return(predictions)

In [8]:

my_weights = np.array([1., 1.]) 
my_features = example_features[0,] 
predicted_value = np.dot(my_features, my_weights)
print (predicted_value)

1181.0


In [9]:
test_predictions = predict_output(example_features, my_weights)
print (test_predictions[0]) 
print (test_predictions[1]) 

1181.0
2571.0


In [10]:
def feature_derivative(errors, feature):
    
    derivative = 2*np.dot(errors, feature)
    return(derivative)

In [11]:
(example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price') 
my_weights = np.array([0., 0.])
test_predictions = predict_output(example_features, my_weights) 
errors = test_predictions - example_output

feature = example_features[:,0]
derivative = feature_derivative(errors, feature)
print (derivative)
print (-np.sum(example_output)*2)

-23345850016.0
-23345850016.0


In [12]:

print (example_output)
print (errors)
print (feature)

[221900. 538000. 180000. ... 402101. 400000. 325000.]
[-221900. -538000. -180000. ... -402101. -400000. -325000.]
[1. 1. 1. ... 1. 1. 1.]


In [13]:
from math import sqrt

In [14]:
def regression_gradient_descent(features_matrix,output,intial_weights,step_size,tolerance):
    converged=False
    weights=np.array(intial_weights)
    while not converged:
        predictions=predict_output(features_matrix,weights)
        
        errors=predictions-output
        gradient_sum_squares=0
        
        for i in range(len(weights)):
            derivative=feature_derivative(errors,features_matrix[:,i])
            gradient_sum_squares+=(derivative**2)
            
            weights[i]-=(step_size*derivative)
            
            gradient_magnitude=sqrt(gradient_sum_squares)
            if gradient_magnitude<tolerance:
                converged=True
        return weights

In [15]:
sales.head()

Unnamed: 0,constant,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,1,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,1,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,1,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [19]:
(simple_feature_matrix, output) = get_numpy_data(train_data, ['sqft_living'], 'price')
#print(feature_matrix[0,:])
simple_features = ['sqft_living']
my_output = 'price'
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [20]:
test_weight=regression_gradient_descent(simple_feature_matrix,output,initial_weights,step_size,tolerance)
print(test_weight)

[-46999.85779866    354.86068692]


In [21]:
(test_simple_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)

In [22]:

test_predictions = predict_output(test_simple_feature_matrix, test_weight)
print (test_predictions)

[460450.92450055 999839.16862279 559811.91683885 ... 847249.0732461
 772728.32899237 314958.04286231]


In [23]:

print (test_predictions[0])

460450.92450054735


In [24]:

test_residuals = test_output - test_predictions
test_RSS = (test_residuals * test_residuals).sum()
print (test_RSS)

389725347125246.4


In [25]:
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [28]:
weight_2 = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)
print (weight_2)

[-99999.91164747    217.89658257    196.92903734]


In [29]:
(test_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)

test_predictions_2 = predict_output(test_feature_matrix, weight_2)
print (test_predictions_2)

[562125.88789642 964223.14684286 475440.15300729 ... 945360.65052958
 763721.33242302 323122.22066228]


In [30]:
print (test_predictions_2[0])

562125.8878964245


In [31]:
print (test_data['price'][0])

310000.0


In [32]:
test_residuals_2 = test_output - test_predictions_2
test_RSS_2 = (test_residuals_2**2).sum()
print (test_RSS_2)

463885088954683.7
