In [1]:
# Move into the `R` directory located under the path held in the ROOT
# environment variable, so that relative paths used later resolve from there.
# Sys.getenv() returns "" for an unset variable; without this guard the target
# would silently become "/R".
if (!nzchar(Sys.getenv('ROOT'))) {
    stop("Environment variable ROOT is not set", call. = FALSE)
}
setwd(file.path(Sys.getenv('ROOT'), 'R'))
getwd()

In [3]:
library(Metrics)
source('main.R')

# Raw data as returned by the project loader; `train` and `test` are used below.
data <- kaggle.house$loadData()

# Stack train and test into one frame, then run it through the project's NA fixer.
df.combined <- kaggle.house$na$fixAll(
    kaggle.house$getCombinedDataset(data$train, data$test)
)

# Keep only the rows that came from the training file, drop the source marker
# and add the log-transformed target used for modelling.
df.training <- df.combined %>%
    filter(dataSource == "train") %>%
    select(-dataSource) %>%
    mutate(sale_price_log = log(SalePrice))

In [4]:
# Draw one 50/50 split of the labelled rows, partitioned on the log price.
# NOTE(review): no seed is set in this cell, so the split (and every number
# derived from it below) differs between runs unless one is set elsewhere.
partition <- caret::createDataPartition(
    y = df.training$sale_price_log,
    p = .5,
    list = FALSE,  # `FALSE`, not `F`: `F` is an ordinary variable and can be reassigned
    times = 1
)

# Fitting half: keeps the log target, drops the raw price.
trainset <- df.training[partition, ] %>% select(-SalePrice)

# Holdout half: keeps the raw price, drops the log target.
testset <- df.training[-partition, ] %>% select(-sale_price_log)

# Columns present in one half but not in the other (each direction).
setdiff(colnames(trainset), colnames(testset))
setdiff(colnames(testset), colnames(trainset))

In [5]:
# Run the project's transformation step on both halves; the holdout half is
# passed without its raw price. The result exposes the transformed frames as
# `df.training` and `df.testing`.
tran.res <- kaggle.house$trans$doItAll(trainset, testset %>% select(-SalePrice))
trainset.ready <- tran.res$df.training
testset.ready <- tran.res$df.testing

# Should be only numeric columns
# vapply() is type-stable, and the failure message now names the actual check.
stopifnot(all(vapply(trainset.ready, is.numeric, logical(1))))
stopifnot(all(vapply(testset.ready, is.numeric, logical(1))))

# Row counts of the two transformed halves.
c(nrow(trainset.ready), nrow(testset.ready))

# Columns that exist in only one of the transformed frames (each direction).
c('1. ', setdiff(colnames(trainset.ready), colnames(testset.ready)))
c('2. ', setdiff(colnames(testset.ready), colnames(trainset.ready)))

In [6]:
# sanity check
# Median log price per building type, computed on the holdout half ...
testset %>%
    group_by(BldgType) %>%
    summarise(median(log(SalePrice)))

# ... next to the range of the transformed `BldgType.new` column within each
# original building type. min == max per group means the transformed value is
# constant within that building type.
cbind(select(testset.ready, BldgType.new), select(testset, BldgType)) %>%
    group_by(BldgType) %>%
    summarise(min(BldgType.new), max(BldgType.new))


BldgType,median(log(SalePrice))
1Fam,12.02874
2fmCon,11.75587
Duplex,11.81303
Twnhs,11.65235
TwnhsE,12.05467


BldgType,min(BldgType.new),max(BldgType.new)
1Fam,12.03469,12.03469
2fmCon,11.75182,11.75182
Duplex,11.83822,11.83822
Twnhs,11.90834,11.90834
TwnhsE,12.05815,12.05815


In [7]:
# Baseline: regress the log price on every column of the transformed training frame.
model.lm <- lm(sale_price_log ~ ., data = trainset.ready)

# Holdout error on the log scale.
y_predicted <- as.vector(predict(model.lm, testset.ready))
y_actual <- log(testset[["SalePrice"]])
rmse(y_predicted, y_actual)

# Mean absolute percentage error after mapping predictions back to prices.
sp_predicted <- exp(y_predicted)
sp_actual <- testset[["SalePrice"]]
mean(100 * abs(sp_actual - sp_predicted) / sp_actual)

# Model-level summary, then the coefficients with p < 0.1, most significant first.
glance(model.lm)
tidy(model.lm) %>%
    filter(p.value < 0.1) %>%
    arrange(p.value)

“prediction from a rank-deficient fit may be misleading”

r.squared,adj.r.squared,sigma,statistic,p.value,df,logLik,AIC,BIC,deviance,df.residual
0.9189375,0.9095174,0.118725,97.55062,4.2610130000000003e-308,77,561.1596,-966.3192,-607.9549,9.218534,654


term,estimate,std.error,statistic,p.value
OverallQual,0.06447665,0.007105403,9.074312,1.3282700000000001e-18
has_pool,-3.241782,0.4364825,-7.42706,3.47802e-13
PoolArea,0.005220754,0.0007184423,7.266769,1.052473e-12
X1stFlrSF,0.0002534552,3.512151e-05,7.216523,1.483164e-12
OverallCond,0.03752664,0.005630715,6.66463,5.640727e-11
X2ndFlrSF,0.0001622815,2.764535e-05,5.870119,6.925416e-09
Neighborhood.new,0.1682946,0.02884632,5.834179,8.50437e-09
MSZoning.new,0.2130467,0.04085844,5.214264,2.478842e-07
BsmtFullBath,0.06721038,0.01325704,5.069789,5.191475e-07
standard_roof_material,0.1883262,0.04158228,4.529002,7.043557e-06


In [8]:
# Number of independent 50/50 resamples to draw.
N <- 20

# createDataPartition() is asked for a matrix (`list = FALSE`) with N resamples;
# it is then converted to a list of index vectors, one element per resample.
partitions <- caret::createDataPartition(
    y = df.training$sale_price_log,
    p = .5,
    list = FALSE,  # `FALSE`, not `F`: `F` is an ordinary variable and can be reassigned
    times = N
) %>%
    as.data.frame %>%
    as.list

#' Fit a linear model on one train/holdout split and score it on the holdout.
#'
#' @param partition Integer row indices of `data` used for fitting; all other
#'   rows form the holdout set.
#' @param data Data frame containing both `SalePrice` and `sale_price_log`.
#'   Defaults to the notebook-level `df.training`, so existing one-argument
#'   calls (e.g. `map(buildModel)`) behave exactly as before.
#' @return A list with `model` (the fitted `lm`) and `rmse` (root mean squared
#'   error between predicted and actual log prices on the holdout rows).
buildModel <- function (partition, data = df.training) {
    # Fitting rows keep only the log target; holdout rows keep only the raw price.
    trainset <- data[partition, ] %>% select(-SalePrice)
    testset <- data[-partition, ] %>% select(-sale_price_log)

    # The project's transformation step receives the holdout rows without the price.
    tran.res <- kaggle.house$trans$doItAll(trainset, testset %>% select(-SalePrice))
    trainset.ready <- tran.res$df.training
    testset.ready <- tran.res$df.testing

    model <- lm(sale_price_log ~ ., data = trainset.ready)

    # predict() can warn "prediction from a rank-deficient fit may be misleading"
    # here; the warning is left visible on purpose rather than suppressed.
    y_predicted <- as.vector(predict(model, testset.ready))
    y_actual <- log(testset[["SalePrice"]])

    # Metrics::rmse() is declared as (actual, predicted); the value is symmetric.
    list(model = model, rmse = rmse(y_actual, y_predicted))
}

# `partitions` is already a list with one index vector per resample, so it can
# be mapped over directly: one fitted model + holdout RMSE per resample.
fits <- map(partitions, buildModel)

“prediction from a rank-deficient fit may be misleading”

In [9]:
# Holdout RMSE of every resample as a plain (unnamed) numeric vector.
rmse.sample <- as.vector(unlist(lapply(fits, function(fit) fit[["rmse"]])))
rmse.sample

In [10]:
# R-squared of every resample's fit as a plain (unnamed) numeric vector.
r.squared.sample <- as.vector(unlist(lapply(fits, function(fit) {
    glance(fit[["model"]])[["r.squared"]]
})))
r.squared.sample

In [36]:
# Coefficient tables of all resampled fits stacked into one long data frame.
coeff.sample <- bind_rows(map(fits, function(fit) tidy(fit$model)))

# Terms that have a row in all N fits and reach p < 1e-8 in at least one row,
# joined with '+' so the result can be pasted into a model formula.
# NOTE(review): `n` is counted BEFORE the p-value filter, so `n == N` tests
# "present in every fit", not "significant in every fit" -- confirm the intent.
strong.terms <- coeff.sample %>%
    arrange(term, estimate) %>%
    group_by(term) %>%
    mutate(n = n()) %>%
    filter(p.value < 0.00000001, n == N) %>%
    distinct(term)
paste(strong.terms[[1]], collapse = '+')

In [60]:
# Reduced model: neighbourhood encoding, overall quality and log floor areas
# (+1 inside the log keeps a zero area finite).
model.lm <- lm(
    sale_price_log ~ Neighborhood.new + OverallQual + log(X1stFlrSF + 1) + log(X2ndFlrSF + 1),
    data = trainset.ready
)
glance(model.lm)
tidy(model.lm)

# Holdout error on the log scale.
y_predicted <- as.vector(predict(model.lm, testset.ready))
y_actual <- log(testset[["SalePrice"]])
rmse(y_predicted, y_actual)

# Mean absolute percentage error after mapping predictions back to prices.
sp_predicted <- exp(y_predicted)
sp_actual <- testset[["SalePrice"]]
mean(100 * abs(sp_actual - sp_predicted) / sp_actual)

r.squared,adj.r.squared,sigma,statistic,p.value,df,logLik,AIC,BIC,deviance,df.residual
0.8309533,0.8300219,0.1627256,892.1679,1.7673939999999998e-278,5,292.5338,-573.0677,-545.5012,19.22421,726


term,estimate,std.error,statistic,p.value
(Intercept),2.92591496,0.33062749,8.849582,6.637788e-18
Neighborhood.new,0.43420367,0.028021325,15.495472,5.33637e-47
OverallQual,0.11097429,0.006850603,16.199199,1.30266e-50
log(X1stFlrSF + 1),0.44541666,0.024945126,17.855859,2.157191e-59
log(X2ndFlrSF + 1),0.02412738,0.002145638,11.244853,3.7164130000000005e-27
