In [1]:
# Work from the project's R/ directory, located through the ROOT environment
# variable, and echo the resulting working directory.
setwd(file.path(Sys.getenv("ROOT"), "R"))
getwd()

In [3]:
library(Metrics)
source('main.R')

# Load the raw tables through the project helper; the result exposes `$train`
# and `$test`.
data <- kaggle.house$loadData()

# Stack both tables into one data frame and hand it to the project's NA fixer.
df.combined <- kaggle.house$na$fixAll(
    kaggle.house$getCombinedDataset(data$train, data$test)
)

# Keep only the rows tagged as coming from the training file, add the log of
# the sale price as the modelling target and drop the bookkeeping column.
df.training <- select(
    mutate(
        filter(df.combined, dataSource == "train"),
        sale_price_log = log(SalePrice)
    ),
    -dataSource
)

In [4]:
# Pick half of the labelled rows (p = 0.5, one resample) with caret, using the
# log sale price as the outcome to partition on.
# NOTE(review): no set.seed() call is visible before this point, so the split
# presumably differs between runs unless main.R seeds the RNG -- confirm.
partition <- caret::createDataPartition(
    y = df.training$sale_price_log,
    p = 0.5,
    list = FALSE,  # spelled out: `F` is an ordinary, reassignable variable
    times = 1
)

# The fitting half keeps only the log target, the held-out half keeps only the
# raw SalePrice, so each side carries exactly one form of the target.
trainset <- select(df.training[partition, ], -SalePrice)

testset <- select(df.training[-partition, ], -sale_price_log)

# Column names present on one side but missing on the other, both directions.
setdiff(colnames(trainset), colnames(testset))
setdiff(colnames(testset), colnames(trainset))

In [5]:
# Run the project's transformation step on both halves. SalePrice is removed
# from the held-out half before the call, so the transformer is not given it.
tran.res <- kaggle.house$trans$doItAll(trainset, select(testset, -SalePrice))
trainset.ready <- tran.res$df.training
testset.ready <- tran.res$df.testing

# Should be only numeric columns.
# vapply() guarantees one logical per column, so the check cannot silently
# change type the way map() + unlist() + sum() can.
stopifnot(all(vapply(trainset.ready, is.numeric, logical(1))))
stopifnot(all(vapply(testset.ready, is.numeric, logical(1))))

# Row counts of the two transformed halves.
c(nrow(trainset.ready), nrow(testset.ready))

# Columns that exist in only one of the transformed halves ("1." = train only,
# "2." = test only).
c("1. ", setdiff(colnames(trainset.ready), colnames(testset.ready)))
c("2. ", setdiff(colnames(testset.ready), colnames(trainset.ready)))

In [6]:
# Sanity check: for every building type, show the median log sale price seen
# in the held-out rows next to the smallest and largest value of the
# transformed BldgType.new column for that type (min and max coincide when the
# transformed value is constant within a type).

summarise(group_by(testset, BldgType), median(log(SalePrice)))

summarise(
    group_by(
        cbind(select(testset.ready, BldgType.new), select(testset, BldgType)),
        BldgType
    ),
    min(BldgType.new), max(BldgType.new)
)


BldgType,median(log(SalePrice))
1Fam,12.03112
2fmCon,11.72597
Duplex,11.75587
Twnhs,11.73318
TwnhsE,12.02719


BldgType,min(BldgType.new),max(BldgType.new)
1Fam,12.02874,12.02874
2fmCon,11.81967,11.81967
Duplex,11.82704,11.82704
Twnhs,11.90834,11.90834
TwnhsE,12.06681,12.06681


In [14]:
# Baseline: ordinary least squares of the log price on every transformed
# predictor.
model.lm <- lm(sale_price_log ~ ., data = trainset.ready)

# Predictions for the held-out half and the matching true log prices.
y_predicted <- as.vector(predict(model.lm, testset.ready))
y_actual <- log(testset[["SalePrice"]])

paste0(
    "Testset RMSE of sale_price_log: ",
    rmse(y_predicted, y_actual)
)

# Same comparison on the original price scale, as a mean absolute error in
# percent of the true price.
sp_predicted <- exp(y_predicted)
sp_actual <- testset[["SalePrice"]]

paste0(
    "Testset SalePrice mean prediction error in %: ",
    mean(100 * abs(sp_actual - sp_predicted) / sp_actual)
)

# Model-level summary, then the coefficients with p-value below 0.1 ordered
# from most to least significant.
glance(model.lm)
filter(arrange(tidy(model.lm), p.value), p.value < 0.1)

“prediction from a rank-deficient fit may be misleading”

r.squared,adj.r.squared,sigma,statistic,p.value,df,logLik,AIC,BIC,deviance,df.residual
0.9198736,0.9105622,0.1183822,98.79078,9.916579e-310,77,563.2732,-970.5464,-612.1822,9.165379,654


term,estimate,std.error,statistic,p.value
has_pool,-3.915731,0.4461804,-8.776115,1.466269e-17
PoolArea,0.006295255,0.0007462382,8.435986,2.104228e-16
OverallQual,0.0537289,0.00674084,7.970654,7.043781e-15
X2ndFlrSF,0.0001993465,2.587912e-05,7.702988,4.939619e-14
X1stFlrSF,0.0002666292,3.507409e-05,7.601886,1.016644e-13
OverallCond,0.04046574,0.005699261,7.100173,3.257721e-12
Neighborhood.new,0.1982228,0.02952735,6.713195,4.134913e-11
Condition1.new,0.240228,0.04573909,5.252138,2.036153e-07
GarageCars,0.06607362,0.01483146,4.454963,9.869009e-06
MSZoning.new,0.1688297,0.03964042,4.259028,2.354441e-05


In [30]:
# Draw a random sample holding 75% of the labelled rows and show its size.
sample.data <- sample_n(df.training, round(0.75 * nrow(df.training)))
nrow(sample.data)

# Number of repeated partitions to draw from the sample.
N <- 3

# createDataPartition() returns one column of row indices per resample when
# list = FALSE; turn those columns into a list of index vectors.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
partitions <- caret::createDataPartition(
    y = sample.data$sale_price_log,
    p = 0.6,
    list = FALSE,
    times = N
) %>%
    as.data.frame %>%
    as.list

# Number of rows selected by the first partition.
length(partitions[[1]])

In [162]:
# Number of repeated 50/50 splits of the labelled data.
N <- 3

# One column of row indices per resample (list = FALSE), converted to a list
# with one index vector per split.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
partitions <- caret::createDataPartition(
    y = df.training$sale_price_log,
    p = 0.5,
    list = FALSE,
    times = N
) %>%
    as.data.frame %>%
    as.list

# Fit and score one linear model for a single train/held-out split.
#
# Args:
#   partition: row indices of `data` used for fitting; all remaining rows are
#              held out for scoring.
#   data:      data frame containing `SalePrice` and `sale_price_log`.
#              Defaults to the notebook-level `df.training`; the default is
#              looked up when the function is called, so existing
#              one-argument calls behave exactly as before.
#
# Returns:
#   list(model = <fitted lm>, rmse = <held-out RMSE of the log sale price>)
buildModel <- function(partition, data = df.training) {
    trainset <- data[partition, ] %>% select(-SalePrice)
    heldout <- data[-partition, ]

    # The transformer is given the held-out rows without either target column.
    tran.res <- kaggle.house$trans$doItAll(
        trainset,
        heldout %>% select(-sale_price_log, -SalePrice)
    )
    trainset.ready <- tran.res$df.training
    testset.ready <- tran.res$df.testing

    model <- lm(sale_price_log ~ ., data = trainset.ready)

    y_predicted <- as.vector(predict(model, testset.ready))
    y_actual <- log(heldout[["SalePrice"]])

    # Arguments in the (actual, predicted) order; the value is the same either
    # way because the squared error is symmetric.
    list(model = model, rmse = rmse(y_actual, y_predicted))
}

# Fit one model per partition. `partitions` is already a list of index vectors,
# so the former as.data.frame/as.list round trip was a no-op and is dropped.
fits <- partitions %>% map(buildModel)

“prediction from a rank-deficient fit may be misleading”

In [9]:
# Held-out RMSE of every fit, flattened to a plain unnamed vector.
rmse.sample <- as.vector(unlist(map(fits, function(fit) fit$rmse)))
rmse.sample

In [10]:
# R-squared of every fit as reported by glance(), flattened to a plain unnamed
# vector.
r.squared.sample <- as.vector(unlist(
    map(fits, function(fit) glance(fit$model)[["r.squared"]])
))
r.squared.sample

In [36]:
# Stack the coefficient tables of all fits into a single data frame.
coeff.sample <- bind_rows(map(fits, function(fit) tidy(fit$model)))

# Build a "+"-separated string of term names: `n` counts how many stacked rows
# each term has, rows are kept when their own p-value is below 1e-8 and the
# term has exactly N rows, and each surviving term is listed once.
# NOTE(review): a term is listed as soon as one of its rows passes the p-value
# threshold, not only when all N do -- confirm that this is the intent.
coeff.sample %>%
    arrange(term, estimate) %>%
    group_by(term) %>%
    mutate(n = n()) %>%
    filter(p.value < 1e-8, n == N) %>%
    distinct(term) %>%
    `[[`(1) %>%
    paste(collapse = "+")

In [160]:
# Add log-scale copies of the area columns to a transformed data frame.
# One helper is shared by the training and held-out frames so the two feature
# sets cannot drift apart.
# X2ndFlrSF is shifted by 1 before the log, presumably because it can be 0
# (verify against the data).
addLogFeatures <- function(df) {
    df %>%
        mutate(
            GrLivArea.log = log(GrLivArea),
            X1stFlrSF.log = log(X1stFlrSF),
            X2ndFlrSF.log = log(X2ndFlrSF + 1),
            LotArea.log = log(LotArea)
        )
}

df.tr <- addLogFeatures(trainset.ready)

df.te <- addLogFeatures(testset.ready)

# Reduced model on a hand-picked set of predictors.
model.lm <- lm(
    sale_price_log ~ OverallQual + Neighborhood.new + GrLivArea.log +
        X1stFlrSF.log + BsmtFullBath + LotArea.log + has_central_air +
        YearRemodAdd + GarageCars,
    data = df.tr
)

glance(model.lm)
tidy(model.lm)

# Held-out RMSE on the log-price scale.
y_predicted <- predict(model.lm, df.te) %>% as.vector
y_actual <- log(testset[["SalePrice"]])
rmse(y_predicted, y_actual)

# Mean absolute error in percent of the true price, on the original scale.
sp_predicted <- y_predicted %>% exp
sp_actual <- testset[["SalePrice"]]
mean(100 * abs(sp_actual - sp_predicted) / sp_actual)

r.squared,adj.r.squared,sigma,statistic,p.value,df,logLik,AIC,BIC,deviance,df.residual
0.8668367,0.8651745,0.1483862,521.4895,1.039939e-308,10,362.4922,-702.9843,-652.4458,15.87532,721


term,estimate,std.error,statistic,p.value
(Intercept),0.51335179,0.6846550006,0.7497963,0.453622
OverallQual,0.08052613,0.006915687,11.6439812,7.804972e-29
Neighborhood.new,0.2758793,0.0272502951,10.1239013,1.2923170000000001e-22
GrLivArea.log,0.31553726,0.0235967954,13.3720388,1.3657059999999999e-36
X1stFlrSF.log,0.09243856,0.0232050584,3.9835522,7.478674e-05
BsmtFullBath,0.0639037,0.0113162861,5.6470563,2.348617e-08
LotArea.log,0.0822236,0.0128285174,6.4094389,2.638188e-10
has_central_air,0.16017446,0.0221639189,7.2268112,1.262187e-12
YearRemodAdd,0.00187511,0.0003446257,5.4410047,7.264625e-08
GarageCars,0.06376398,0.0100036715,6.3740579,3.28415e-10


In [None]:
# Repeat the split/fit/score cycle on 20 independent 50/50 partitions.
N <- 20

# One column of row indices per resample (list = FALSE), converted to a list
# with one index vector per split.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
partitions <- caret::createDataPartition(
    y = df.training$sale_price_log,
    p = 0.5,
    list = FALSE,
    times = N
) %>%
    as.data.frame %>%
    as.list

# `partitions` is already a list of index vectors, so it is mapped directly;
# the former as.data.frame/as.list round trip was a no-op.
fits <- partitions %>% map(buildModel)