In [4]:
library(magrittr)
library(dplyr)

In [5]:
train_set = read.csv("../data/kc_house_train_data.csv")
test_set = read.csv("../data/kc_house_test_data.csv")

In [58]:
make_feature_matrix = function(data, ...){
    cbind(intercept=rep(1, nrow(data)), as.matrix(select(data, ...)))
}

In [48]:
predict_outcome = function(feature_matrix, weights){
    feature_matrix %*% weights
}

In [52]:
regression_gradient_descent = function(
        feature_matrix, output, initial_weights=rep(0, ncol(feature_matrix)),
        step_size=0.000000001, tolerance=10){
    weights <- initial_weights
    counter <- 0
    repeat {
        rss_gradient = -2*t(feature_matrix) %*% (output - feature_matrix %*% weights)
        # cat(weights, "\n")
        # cat(norm(rss_gradient), "\n\n")
        if( norm(rss_gradient) < tolerance ) break
        weights <- weights - step_size * rss_gradient
        counter <- counter + 1
    }
    cat("converged after", counter, "iterations")
    weights
}

In [69]:
rss = function(feature_matrix, output, weights){
    c(t(output - feature_matrix %*% weights) %*% (output - feature_matrix %*% weights))
}

In [81]:
output = train_set$price
feature_matrix = make_feature_matrix(train_set, sqft_living)
initial_weights=c(-47000, 1)

mod1_weights = regression_gradient_descent(feature_matrix, output, initial_weights,
                                           step_size=7e-12, tolerance=2.5e7)
mod1_weights %>% round(., 1)

converged after 11 iterations

0,1
intercept,-46999.9
sqft_living,281.9


In [96]:
test_output = test_set$price
test_feature_matrix = make_feature_matrix(test_set, sqft_living)
rss(test_feature_matrix, test_output, mod1_weights) %>% signif(., 2) %>% format

In [97]:
feature_matrix = make_feature_matrix(train_set, sqft_living, sqft_living15)
initial_weights = c(-100000, 1, 1)

mod2_weights = regression_gradient_descent(feature_matrix, output, initial_weights,
                                           step_size=4e-12, tolerance=1e9)
mod2_weights %>% round(., 1)

converged after 288 iterations

0,1
intercept,-100000.0
sqft_living,245.1
sqft_living15,65.2


In [98]:
test_feature_matrix2 = make_feature_matrix(test_set, sqft_living, sqft_living15)
rss(test_feature_matrix2, test_output, mod2_weights) %>% as.numeric %>% signif(., 2) %>% format

In [104]:
predictions = cbind(
    predict_outcome(test_feature_matrix, mod1_weights), 
    predict_outcome(test_feature_matrix2, mod2_weights), 
    test_output)
colnames(predictions) = c("mod1_prediction", "mod2_prediction", "actual_vals")
predictions %>% round %>% head

mod1_prediction,mod2_prediction,actual_vals
356135,366635,310000
784641,762682,650000
435070,386331,233000
607037,636974,580500
260284,269599,535000
691610,713115,605000
