In [78]:
setwd(paste0(Sys.getenv('ROOT'), '/R'))
getwd()

In [79]:
library(Metrics)
source('main.R')

data <- kaggle.house$loadData()

df.combined <- kaggle.house$getCombinedDataset(data$train, data$test) %>% (kaggle.house$na$fixAll)

df.training <- 
    df.combined %>% 
    filter(dataSource == "train") %>% 
    mutate(sale_price_log = log(SalePrice)) %>%
    select(-dataSource)

In [88]:
partition <- caret::createDataPartition(y=df.training$sale_price_log, p=.5, list=F, times=1)

trainset <- df.training[partition,] %>% select(-SalePrice)

testset <- df.training[-partition,] %>% select(-sale_price_log)

setdiff(trainset %>% colnames, testset %>% colnames)
setdiff(testset %>% colnames, trainset %>% colnames)

In [89]:
tran.res <- kaggle.house$trans$doItAll(trainset, testset %>% select(-SalePrice))
trainset.ready <- tran.res$df.training
testset.ready <- tran.res$df.testing

# Should be only numeric columns
stopifnot(0 == trainset.ready %>% purrr::map(function (col) { !is.numeric(col) }) %>% unlist %>% sum)
stopifnot(0 == testset.ready %>% purrr::map(function (col) { !is.numeric(col) }) %>% unlist %>% sum)

c(trainset.ready %>% nrow, testset.ready %>% nrow)

c('1. ', setdiff(trainset.ready %>% colnames, testset.ready %>% colnames))
c('2. ', setdiff(testset.ready %>% colnames, trainset.ready %>% colnames))

In [90]:
# sanity check

testset %>% group_by(BldgType) %>% summarise(median(log(SalePrice)))

cbind(testset.ready %>% select(BldgType.new), testset %>% select(BldgType)) %>% 
    group_by(BldgType) %>% summarise(min(BldgType.new), max(BldgType.new))


BldgType,median(log(SalePrice))
1Fam,12.02575
2fmCon,11.71587
Duplex,11.82591
Twnhs,11.89613
TwnhsE,12.05641


BldgType,min(BldgType.new),max(BldgType.new)
1Fam,12.03913,12.03913
2fmCon,11.81226,11.81226
Duplex,11.73407,11.73407
Twnhs,11.77529,11.77529
TwnhsE,12.05597,12.05597


In [92]:
model.lm <- lm(sale_price_log ~ ., data=trainset.ready)

y_predicted <- predict(model.lm, testset.ready) %>% as.vector
y_actual <- testset %>% mutate(sale_price_log = log(SalePrice)) %>% select(sale_price_log) %>% `[[`(1)
rmse(y_predicted, y_actual)

sp_predicted <- y_predicted %>% exp
sp_actual <- testset %>% select(SalePrice) %>% `[[`(1)
mean(100 * abs(sp_actual - sp_predicted) / sp_actual)

glance(model.lm)
tidy(model.lm) %>% arrange(p.value) %>% filter(p.value < 0.1)

“prediction from a rank-deficient fit may be misleading”

r.squared,adj.r.squared,sigma,statistic,p.value,df,logLik,AIC,BIC,deviance,df.residual
0.9370307,0.9297132,0.1058743,128.0528,0,77,644.9006,-1133.801,-775.4369,7.330931,654


term,estimate,std.error,statistic,p.value
X2ndFlrSF,0.0002534787,2.346005e-05,10.804699,3.757998e-25
OverallQual,0.05464718,0.006297153,8.678078,3.186243e-17
X1stFlrSF,0.0002397066,2.854763e-05,8.396724,2.846766e-16
OverallCond,0.04067998,0.005254481,7.74196,3.732257e-14
MSZoning.new,0.2519823,0.03738822,6.739616,3.489388e-11
BsmtFinSF1,0.0001664376,2.483797e-05,6.700932,4.472988e-11
SaleCondition.new,0.4055894,0.06653294,6.096069,1.858089e-09
Neighborhood.new,0.1263385,0.02544617,4.96493,8.778951e-07
is_full_functional,0.0788425,0.01805382,4.367081,1.463685e-05
ScreenPorch,0.0003090737,7.238177e-05,4.270049,2.244068e-05


In [93]:
model.lm <- lm(sale_price_log ~ ., data=trainset.ready %>% select(-has_pool))

y_predicted <- predict(model.lm, testset.ready) %>% as.vector
y_actual <- testset %>% mutate(sale_price_log = log(SalePrice)) %>% select(sale_price_log) %>% `[[`(1)
rmse(y_predicted, y_actual)

sp_predicted <- y_predicted %>% exp
sp_actual <- testset %>% select(SalePrice) %>% `[[`(1)
mean(100 * abs(sp_actual - sp_predicted) / sp_actual)

glance(model.lm)
tidy(model.lm) %>% arrange(p.value) %>% filter(p.value < 0.1)

“prediction from a rank-deficient fit may be misleading”

r.squared,adj.r.squared,sigma,statistic,p.value,df,logLik,AIC,BIC,deviance,df.residual
0.9370247,0.9298138,0.1057985,129.9454,0,76,644.8659,-1135.732,-781.962,7.331626,655


term,estimate,std.error,statistic,p.value
X2ndFlrSF,0.000253478,2.344324e-05,10.81241,3.47662e-25
OverallQual,0.05446424,0.00624964,8.714781,2.3775680000000003e-17
X1stFlrSF,0.000240052,2.849345e-05,8.424816,2.287399e-16
OverallCond,0.04068455,0.005250685,7.748428,3.555334e-14
MSZoning.new,0.2521952,0.03735166,6.751915,3.219963e-11
BsmtFinSF1,0.0001660724,2.477685e-05,6.702724,4.416976e-11
SaleCondition.new,0.4060677,0.06645755,6.110181,1.707772e-09
Neighborhood.new,0.126132,0.02541443,4.963006,8.859959e-07
is_full_functional,0.078986,0.01803169,4.380399,1.379104e-05
ScreenPorch,0.000308622,7.230719e-05,4.268206,2.261704e-05
