In [35]:
# Switch to the project's R directory so that relative paths used by later
# cells (e.g. source('main.R')) resolve.
# Sys.getenv() returns "" for an unset variable, which would silently turn the
# target into "/R"; fail early with an explicit message instead.
project.root <- Sys.getenv("ROOT")
if (!nzchar(project.root)) {
    stop("Environment variable ROOT is not set; cannot locate the project's R directory.",
         call. = FALSE)
}
setwd(file.path(project.root, "R"))

# Echo the resulting working directory as the cell output.
getwd()

In [52]:
# Load the project code; this cell relies on it to provide `kaggle.house`
# (and presumably to attach dplyr/magrittr for the verbs below -- confirm in main.R).
source("main.R")

data <- kaggle.house$loadData()

# Combine the train and test frames, then pass the result through the
# project's `na$fixAll` step (presumably missing-value repair -- confirm in main.R).
df.combined <- kaggle.house$na$fixAll(
    kaggle.house$getCombinedDataset(data$train, data$test)
)

# Rows that came from the training file, without the bookkeeping column and
# with the log of the sale price appended as the modelling target.
df.training <- df.combined %>%
    filter(dataSource == "train") %>%
    select(-dataSource) %>%
    mutate(sale_price_log = log(SalePrice))

In [37]:
# Hold-out evaluation of a linear model of sale_price_log on all predictors.
# Fixed seed so the 50/50 split below is reproducible.
set.seed(12345)

# Row indices of the training half (matrix form, hence list = FALSE).
partition <- caret::createDataPartition(
    y = df.training$sale_price_log, p = .5, list = FALSE, times = 1
)
# The training half keeps only the log target, the test half only the raw price.
trainset <- df.training[partition, ] %>% select(-SalePrice)
testset <- df.training[-partition, ] %>% select(-sale_price_log)

# The halves must differ in exactly the one target column each of them keeps.
# identical() is used rather than `==`: if setdiff() returns character(0), the
# `==` comparison yields logical(0), which stopifnot() treats as success.
stopifnot(identical(setdiff(colnames(trainset), colnames(testset)), 'sale_price_log'))
stopifnot(identical(setdiff(colnames(testset), colnames(trainset)), 'SalePrice'))

# Project transformation step; the test half is passed without its target.
tran.res <- kaggle.house$trans$doItAll(trainset, testset %>% select(-SalePrice))
trainset.ready <- tran.res$trainset
testset.ready <- tran.res$testset

# Should be only numeric columns
stopifnot(all(vapply(trainset.ready, is.numeric, logical(1))))
stopifnot(all(vapply(testset.ready, is.numeric, logical(1))))

# After the transformation the train frame may only add the target column,
# and the test frame may not carry any column the train frame lacks.
stopifnot(identical(setdiff(colnames(trainset.ready), colnames(testset.ready)), 'sale_price_log'))
stopifnot(length(setdiff(colnames(testset.ready), colnames(trainset.ready))) == 0)
stopifnot( !('SalePrice' %in% colnames(testset.ready)) )

model.lm <- lm(sale_price_log ~ ., data = trainset.ready)

# Only predict() is wrapped: its warnings are deliberately silenced.
# NOTE(review): this presumably hides the rank-deficient-fit warning -- confirm
# that the affected predictions are acceptable.
y_test_predicted <- suppressWarnings(as.vector(predict(model.lm, testset.ready)))
y_test_actual <- log(testset[['SalePrice']])

paste0("Testset RMSE of sale_price_log: ", 
       rmse(y_test_predicted, y_test_actual))

# Back on the price scale: mean absolute percentage error.
sp_test_predicted <- exp(y_test_predicted)
sp_test_actual <- testset[['SalePrice']]

paste0("Testset SalePrice mean prediction error in %: ", 
       mean(100 * abs(sp_test_actual - sp_test_predicted) / sp_test_actual))

# Model-level fit statistics, then coefficients with p < 0.1, most significant first.
glance(model.lm)
tidy(model.lm) %>% arrange(p.value) %>% filter(p.value < 0.1)

r.squared,adj.r.squared,sigma,statistic,p.value,df,logLik,AIC,BIC,deviance,df.residual
0.9399326,0.9328496,0.1002553,132.7028,0,78,685.3234,-1212.647,-849.6882,6.563385,653


term,estimate,std.error,statistic,p.value
X2ndFlrSF,0.0002377235,2.408838e-05,9.868802,1.6621919999999999e-21
X1stFlrSF,0.0002630662,3.076407e-05,8.551084,8.645254e-17
OverallCond,0.04172092,0.005119886,8.148797,1.875653e-15
OverallQual,0.04258193,0.005682838,7.493076,2.195593e-13
BsmtFinSF1,0.0001601635,2.793732e-05,5.732957,1.508772e-08
Neighborhood.new,0.1408396,0.02487221,5.662529,2.235988e-08
MSZoning.new,0.1856737,0.03563519,5.210403,2.529998e-07
LotArea,2.389362e-06,4.939044e-07,4.837701,1.640212e-06
is_full_functional,0.07391372,0.01817254,4.06733,5.336618e-05
BsmtUnfSF,9.636478e-05,2.537551e-05,3.79755,0.0001597592


In [59]:
# Repeated validation of the all-predictors linear model.
# The seed is reset and main.R re-sourced so the cell can be re-run on its own.
set.seed(12345)
source("main.R")

# Every warning raised while trainMany() runs is silenced.
suppressWarnings(
    runs <- kaggle.house$validate$trainMany(
        dataset = df.training,
        N = 50,
        sample.share = 0.8,
        trainset.share = 0.5,
        # Model constructor handed to trainMany(); presumably invoked once per
        # run with that run's training frame -- confirm in main.R.
        modelFactory = function (df) lm(sale_price_log ~ ., data = df)
    )
)

In [49]:
# Summaries over the validation runs stored in `runs`.
# map_dbl() is used instead of map() %>% unlist: it fails loudly if a run does
# not yield exactly one number, instead of silently changing the vector length.
# unname() keeps the results as bare numeric vectors.

# Per-run RMSE between actual and predicted test targets.
rmse.sample <- unname(map_dbl(runs, function (item) {
    rmse(item$y_test_actual, item$y_test_predicted)
}))

# Per-run mean absolute percentage error after exponentiating both vectors.
SalePrice.predictionError.sample <- unname(map_dbl(runs, function (item) {
    actual <- exp(item$y_test_actual)
    predicted <- exp(item$y_test_predicted)
    mean(100 * abs(actual - predicted) / actual)
}))

# Per-run R^2 of the fitted model.
r2.sample <- unname(map_dbl(runs, function (item) {
    glance(item$model)[['r.squared']]
}))

# One row per (run, term) with the coefficient estimate and its p-value.
coeff.sample <- 
    runs %>% 
    map(function (item) { tidy(item$model) %>% select(term, estimate, p.value) }) %>%
    bind_rows %>% arrange(term)

summary(rmse.sample)
summary(SalePrice.predictionError.sample)
summary(r2.sample)

# Spread of selected coefficient estimates across the runs.
coeff.sample %>% filter(term == 'OverallQual') %>% select(estimate) %>% summary
coeff.sample %>% filter(term == 'LotArea') %>% select(estimate) %>% summary
coeff.sample %>% filter(term == 'has_central_air') %>% select(estimate) %>% summary

# Terms that reached p < 1e-5 in at least one run, in alphabetical order.
# The per-term count that used to be computed here was discarded by
# distinct(term), so it is no longer calculated; the result is now ungrouped.
coeff.sample %>% 
    filter(p.value < 1e-5) %>%
    distinct(term) %>%
    arrange(term)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.1161  0.1418  0.1517  0.1663  0.1714  0.4347 

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
   8.483    9.806   10.077   33.675   11.106 1151.956 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.8890  0.9176  0.9287  0.9268  0.9366  0.9482 

    estimate      
 Min.   :0.03378  
 1st Qu.:0.04666  
 Median :0.05142  
 Mean   :0.05152  
 3rd Qu.:0.05751  
 Max.   :0.06369  

    estimate        
 Min.   :2.135e-07  
 1st Qu.:1.239e-06  
 Median :1.559e-06  
 Mean   :1.625e-06  
 3rd Qu.:1.890e-06  
 Max.   :3.228e-06  

    estimate        
 Min.   :-0.002698  
 1st Qu.: 0.044455  
 Median : 0.060826  
 Mean   : 0.060903  
 3rd Qu.: 0.073960  
 Max.   : 0.135755  

term
BsmtFinSF1
BsmtFinSF2
BsmtFullBath
BsmtUnfSF
Condition1.new
Condition2.new
ExterCond.new
Exterior1st.new
Exterior2nd.new
GarageCars


In [160]:
# Adds log-transformed copies of the area predictors to a data frame.
# A single helper is applied to both the train and the test frame so the two
# can never receive different feature engineering.
#   df: data frame with columns GrLivArea, X1stFlrSF, X2ndFlrSF and LotArea.
#   Returns `df` with four additional `*.log` columns.
add.log.features <- function (df) {
    df %>% 
        mutate(
            GrLivArea.log = log(GrLivArea),
            X1stFlrSF.log = log(X1stFlrSF),
            # log(1 + x); presumably because X2ndFlrSF can be zero -- confirm.
            X2ndFlrSF.log = log1p(X2ndFlrSF),
            LotArea.log = log(LotArea)
        )
}

df.tr <- add.log.features(trainset.ready)
df.te <- add.log.features(testset.ready)

# Reduced model on a hand-picked set of predictors (replaces the earlier model.lm).
model.lm <- lm(
    sale_price_log ~ OverallQual+Neighborhood.new+GrLivArea.log+X1stFlrSF.log+BsmtFullBath+
                     LotArea.log+has_central_air+YearRemodAdd+GarageCars, 
    data=df.tr)


glance(model.lm)
tidy(model.lm)

# Test-set RMSE between predicted and actual log(SalePrice).
y_predicted <- as.vector(predict(model.lm, df.te))
y_actual <- log(testset[['SalePrice']])
rmse(y_predicted, y_actual)

# Mean absolute percentage error on the price scale.
sp_predicted <- exp(y_predicted)
sp_actual <- testset[['SalePrice']]
mean(100 * abs(sp_actual - sp_predicted) / sp_actual)

r.squared,adj.r.squared,sigma,statistic,p.value,df,logLik,AIC,BIC,deviance,df.residual
0.8668367,0.8651745,0.1483862,521.4895,1.039939e-308,10,362.4922,-702.9843,-652.4458,15.87532,721


term,estimate,std.error,statistic,p.value
(Intercept),0.51335179,0.6846550006,0.7497963,0.453622
OverallQual,0.08052613,0.006915687,11.6439812,7.804972e-29
Neighborhood.new,0.2758793,0.0272502951,10.1239013,1.2923170000000001e-22
GrLivArea.log,0.31553726,0.0235967954,13.3720388,1.3657059999999999e-36
X1stFlrSF.log,0.09243856,0.0232050584,3.9835522,7.478674e-05
BsmtFullBath,0.0639037,0.0113162861,5.6470563,2.348617e-08
LotArea.log,0.0822236,0.0128285174,6.4094389,2.638188e-10
has_central_air,0.16017446,0.0221639189,7.2268112,1.262187e-12
YearRemodAdd,0.00187511,0.0003446257,5.4410047,7.264625e-08
GarageCars,0.06376398,0.0100036715,6.3740579,3.28415e-10
