In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'c_kc_house_train_data.csv/kc_house_train_data.csv',
        'c_kc_house_test_data.csv/kc_house_test_data.csv',
    )
)

In [3]:
# Display the column names of the training data (candidate features and the target).
train_data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

# Define the gradient descent helper functions

In [4]:
def get_numpy_data(data, features, output_column):
    """Build the design matrix and target vector for a linear regression.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is NOT modified: the intercept column is added to a
        copy, so calling this function repeatedly (or re-running the cell)
        leaves the caller's frame untouched.
    features : iterable of str
        Names of the columns of ``data`` to use as predictors.
    output_column : str
        Name of the column of ``data`` to use as the target.

    Returns
    -------
    x_matrix : numpy.ndarray
        One row per row of ``data``; the first column is the constant 1
        (intercept term), followed by the requested features in order.
    y_vector : numpy.ndarray
        Values of ``output_column``.
    """
    feature_names = list(features)
    # assign() returns a new frame, so the 'constant' column never leaks into
    # the DataFrame owned by the caller.
    design = data[feature_names].assign(constant=1)
    x_matrix = design[['constant', *feature_names]].to_numpy()
    y_vector = data[output_column].to_numpy()
    return x_matrix, y_vector

In [5]:
def get_prediction(x_matrix, weights):
    """Return the predictions of a linear model.

    The result is the matrix product of ``x_matrix`` and ``weights``.
    """
    predictions = x_matrix @ weights
    return predictions

In [6]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    When ``errors`` is ``y_true - prediction`` (as reg_gd computes it), this
    value is the negative of the RSS partial derivative for that feature's
    weight, which is why reg_gd adds it to the weight rather than subtracting.
    """
    doubled_errors = 2 * errors
    return doubled_errors @ feature

In [7]:
def reg_gd(x_matrix, y_true, initial_weights, step_size=0.01, tolerance=1e-5,
           max_iterations=None):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Parameters
    ----------
    x_matrix : numpy.ndarray
        Design matrix, one column per weight.
    y_true : numpy.ndarray
        Target values, one per row of ``x_matrix``.
    initial_weights : sequence of numbers
        Starting weights; copied into a float array, the input is not modified.
    step_size : float
        Multiplier applied to each partial derivative in the update.
    tolerance : float
        Stop once the gradient norm is less than or equal to this value.
    max_iterations : int or None
        Optional cap on the number of update steps. ``None`` (the default)
        keeps the original behaviour of iterating until convergence.

    Returns
    -------
    weights : numpy.ndarray
        Weights after the last update.
    num_steps : int
        Number of update steps performed. If ``max_iterations`` was given and
        ``num_steps == max_iterations``, the tolerance may not have been met.

    Raises
    ------
    FloatingPointError
        If the gradient norm becomes inf or NaN (typically because
        ``step_size`` is too large). Without this check the loop would never
        terminate, since a comparison against NaN is always False.
    """
    weights = np.array(initial_weights, dtype=float)
    squared_tolerance = tolerance ** 2
    num_steps = 0
    while True:
        pred = get_prediction(x_matrix, weights)
        # errors is computed once per step, so updating weights[i] inside the
        # loop below is still a simultaneous update of all weights.
        errors = y_true - pred
        gradient_square_norm = 0
        for i in range(len(weights)):
            # feature_derivative(y_true - pred, ...) is the NEGATIVE of the RSS
            # partial derivative, hence "+=" moves downhill.
            partiali = feature_derivative(errors, x_matrix[:, i])
            weights[i] += step_size * partiali
            gradient_square_norm += partiali ** 2
        num_steps += 1
        if not np.isfinite(gradient_square_norm):
            raise FloatingPointError(
                'gradient descent diverged after %d steps; try a smaller step_size'
                % num_steps
            )
        if gradient_square_norm <= squared_tolerance:
            break
        if max_iterations is not None and num_steps >= max_iterations:
            break
    return weights, num_steps

# model1: fit simple regression model

In [13]:
# Model 1: regress price on sqft_living only (plus the intercept column).
feature_matrix, output = get_numpy_data(
    data=train_data,
    features=['sqft_living'],
    output_column='price',
)
w, steps = reg_gd(
    x_matrix=feature_matrix,
    y_true=output,
    initial_weights=[-47000, 1],
    step_size=7e-12,
    tolerance=2.5e7,
)
print(w)

[-46999.88716555    281.91211918]


# Model 1: predict on the test data and compute the RSS

In [9]:
# Build the test design matrix with the same single feature used for model 1.
test_feature, test_output = get_numpy_data(
    data=test_data,
    features=['sqft_living'],
    output_column='price',
)
test_pred = get_prediction(x_matrix=test_feature, weights=w)
print(test_pred[0])
# Residual sum of squares on the test set: squared Euclidean norm of the residuals.
rss1 = np.linalg.norm(test_pred - test_output) ** 2

356134.4432550024


# Model 2: fit a multiple regression model

In [10]:
# Model 2: regress price on sqft_living and sqft_living15 (plus the intercept column).
# The target is bound to `output` here so that this cell does not silently
# depend on the `output` variable left over from the model 1 cell.
feature_matrix, output = get_numpy_data(train_data, ['sqft_living', 'sqft_living15'], 'price')
ini_weight = [-100000, 1, 1]
w2, steps = reg_gd(feature_matrix, output, initial_weights=ini_weight, step_size=4e-12, tolerance=1e9)

In [11]:
# Build the test design matrix with the two features used for model 2.
# The target is bound to `test_output` rather than `_`: assigning to `_`
# shadows IPython's "last output" variable, and binding the target here keeps
# this cell independent of the model 1 prediction cell.
test_feature, test_output = get_numpy_data(test_data, ['sqft_living', 'sqft_living15'], 'price')
test_pred2 = get_prediction(test_feature, w2)
print(test_pred2[0], test_output[0])
# Residual sum of squares on the test set: squared Euclidean norm of the residuals.
rss2 = np.linalg.norm(test_pred2 - test_output) ** 2

366651.4116294939 310000.0


In [12]:
# Print the test-set RSS of model 1 and model 2 side by side in scientific notation.
print('%.2e %.2e'%(rss1, rss2))

2.75e+14 2.70e+14
