In [2]:
# Switch to the "R" directory under the path held in the ROOT environment
# variable, then echo the resulting working directory.
setwd(file.path(Sys.getenv("ROOT"), "R"))
getwd()

In [4]:
library(Metrics)
source('main.R')

# Load the raw tables through the project helper (defined outside this file).
data <- kaggle.house$loadData()

# Pass both tables to getCombinedDataset, then run the result through the
# project's na$fixAll helper.
df.combined <- kaggle.house$na$fixAll(
    kaggle.house$getCombinedDataset(data$train, data$test)
)

# Keep only the rows tagged "train", add the log of SalePrice as the modelling
# target, and drop the source tag.
df.training <- df.combined %>%
    filter(dataSource == "train") %>%
    mutate(sale_price_log = log(SalePrice)) %>%
    select(-dataSource)

In [5]:
# Draw one 50/50 split of the labelled rows with caret, using the log price as
# the outcome the split is based on. `list = FALSE` is spelled out because `F`
# is an ordinary variable that can be reassigned.
# NOTE(review): no seed is set in this cell, so the split differs between runs
# unless the RNG is seeded elsewhere (e.g. in main.R) — confirm.
partition <- caret::createDataPartition(
    y = df.training$sale_price_log, p = .5, list = FALSE, times = 1
)

# Training half: keeps sale_price_log, drops the raw SalePrice.
trainset <- df.training[partition, ] %>% select(-SalePrice)

# Hold-out half: keeps the raw SalePrice, drops sale_price_log.
testset <- df.training[-partition, ] %>% select(-sale_price_log)

# Show which column names differ between the two halves, in each direction.
setdiff(trainset %>% colnames, testset %>% colnames)
setdiff(testset %>% colnames, trainset %>% colnames)

In [6]:
# Run the project's transformation step on the training half and on the
# hold-out half with its SalePrice column removed.
tran.res <- kaggle.house$trans$doItAll(trainset, select(testset, -SalePrice))
trainset.ready <- tran.res$df.training
testset.ready <- tran.res$df.testing

# Should be only numeric columns
stopifnot(all(vapply(trainset.ready, is.numeric, logical(1))))
stopifnot(all(vapply(testset.ready, is.numeric, logical(1))))

# Row counts of the two transformed frames.
c(nrow(trainset.ready), nrow(testset.ready))

# Column names present in one transformed frame but not the other.
c('1. ', setdiff(colnames(trainset.ready), colnames(testset.ready)))
c('2. ', setdiff(colnames(testset.ready), colnames(trainset.ready)))

In [7]:
# sanity check: median log price per BldgType in the hold-out half, followed
# by the min/max of the transformed BldgType.new column for the same rows.

summarise(group_by(testset, BldgType), median(log(SalePrice)))

cbind(select(testset.ready, BldgType.new), select(testset, BldgType)) %>%
    group_by(BldgType) %>%
    summarise(min(BldgType.new), max(BldgType.new))


BldgType,median(log(SalePrice))
1Fam,12.01854
2fmCon,11.72597
Duplex,11.82026
Twnhs,11.77529
TwnhsE,12.06248


BldgType,min(BldgType.new),max(BldgType.new)
1Fam,12.04649,12.04649
2fmCon,11.80485,11.80485
Duplex,11.81855,11.81855
Twnhs,11.86137,11.86137
TwnhsE,12.02457,12.02457


In [8]:
# Baseline: least-squares fit of the log price on every transformed column.
model.lm <- lm(sale_price_log ~ ., data = trainset.ready)

# Hold-out error on the log scale.
y_predicted <- as.vector(predict(model.lm, testset.ready))
y_actual <- log(testset[["SalePrice"]])
rmse(y_predicted, y_actual)

# Mean absolute percentage error after mapping predictions back to prices.
sp_predicted <- exp(y_predicted)
sp_actual <- testset[["SalePrice"]]
mean(100 * abs(sp_actual - sp_predicted) / sp_actual)

# One-row fit summary, then the coefficients with p < 0.1 ordered by p-value.
glance(model.lm)
tidy(model.lm) %>%
    arrange(p.value) %>%
    filter(p.value < 0.1)

“prediction from a rank-deficient fit may be misleading”

r.squared,adj.r.squared,sigma,statistic,p.value,df,logLik,AIC,BIC,deviance,df.residual
0.9137315,0.903559,0.1254985,89.82326,0,78,521.1599,-884.3199,-521.3612,10.28467,653


term,estimate,std.error,statistic,p.value
has_pool,-4.028008,0.4698745,-8.57252,7.314185e-17
PoolArea,0.006069919,0.0007571164,8.017154,5.005725e-15
OverallQual,0.05733988,0.007266367,7.891135,1.265933e-14
OverallCond,0.04797452,0.006316445,7.595176,1.06816e-13
Neighborhood.new,0.2174003,0.03023997,7.189171,1.788831e-12
X2ndFlrSF,0.0001470715,2.775301e-05,5.299298,1.591791e-07
ScreenPorch,0.0004622124,9.290397e-05,4.975163,8.346758e-07
Condition1.new,0.2798898,0.06056245,4.621507,4.59181e-06
standard_electrical,-0.08618088,0.02046677,-4.210771,2.902254e-05
X1stFlrSF,0.0001487414,3.685698e-05,4.035636,6.09053e-05


In [8]:
# Number of independent 50/50 resamples to draw.
N <- 20

# With list = FALSE and times = N, caret returns the row indices of all
# resamples in one object; it is converted to a data frame and then to a list
# so that each element holds the indices of one resample.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
partitions <- caret::createDataPartition(
    y = df.training$sale_price_log, p = .5, list = FALSE, times = N
) %>%
    as.data.frame %>%
    as.list

# Fit and score one linear model for a single train/hold-out split.
#
# partition: row indices of `df.all` used for training; every other row goes
#            to the hold-out half.
# df.all:    frame containing both SalePrice and sale_price_log. Defaults to
#            the notebook-level df.training, so existing one-argument calls
#            (e.g. map(partitions, buildModel)) behave exactly as before.
#
# Returns list(model = <lm fit>, rmse = <hold-out RMSE on the log scale>).
buildModel <- function (partition, df.all = df.training) {
    trainset <- df.all[partition, ] %>% select(-SalePrice)
    testset <- df.all[-partition, ] %>% select(-sale_price_log)

    # The hold-out half is transformed without its SalePrice column.
    tran.res <- kaggle.house$trans$doItAll(trainset, testset %>% select(-SalePrice))
    trainset.ready <- tran.res$df.training
    testset.ready <- tran.res$df.testing

    model <- lm(sale_price_log ~ ., data = trainset.ready)

    y_predicted <- predict(model, testset.ready) %>% as.vector
    y_actual <- log(testset[["SalePrice"]])

    # Metrics::rmse is declared as rmse(actual, predicted); naming the
    # arguments removes the swapped positional order (the value is the same
    # because the squared error is symmetric).
    list(
        model = model,
        rmse = Metrics::rmse(actual = y_actual, predicted = y_predicted)
    )
}

# Fit one model per resample; `partitions` is already a list of index vectors.
fits <- map(partitions, buildModel)

“prediction from a rank-deficient fit may be misleading”

In [9]:
# Hold-out RMSE of every resampled fit, as a plain unnamed numeric vector.
rmse.sample <- vapply(fits, function(fit) fit$rmse, numeric(1), USE.NAMES = FALSE)
rmse.sample

In [10]:
# R-squared of every resampled fit, as a plain unnamed numeric vector.
r.squared.sample <- vapply(
    fits,
    function(fit) glance(fit$model)[["r.squared"]],
    numeric(1),
    USE.NAMES = FALSE
)
r.squared.sample

In [36]:
# Stack the coefficient tables of all resampled fits into one frame.
coeff.sample <- bind_rows(map(fits, function(fit) tidy(fit$model)))

# Build a "+"-joined string of terms that occur in all N fits and have at
# least one row with p-value below 1e-8 (the p-value filter is applied per
# row, not per term).
coeff.sample %>%
    arrange(term, estimate) %>%
    group_by(term) %>%
    mutate(n = n()) %>%
    filter(p.value < 0.00000001, n == N) %>%
    distinct(term) %>%
    `[[`(1) %>%
    paste(collapse = '+')

In [73]:
# Add log-scaled area columns to the transformed training frame. The +1 on
# X2ndFlrSF keeps the log finite when that column is 0.
# NOTE(review): GrLivArea and X1stFlrSF are logged without an offset, which
# assumes they are strictly positive — confirm.
df.tr <- mutate(
    trainset.ready,
    GrLivArea.log = log(GrLivArea),
    X1stFlrSF.log = log(X1stFlrSF),
    X2ndFlrSF.log = log(X2ndFlrSF + 1)
)

# Same derived columns for the transformed hold-out frame.
df.te <- mutate(
    testset.ready,
    GrLivArea.log = log(GrLivArea),
    X1stFlrSF.log = log(X1stFlrSF),
    X2ndFlrSF.log = log(X2ndFlrSF + 1)
)

# Reduced model on four predictors; this overwrites the earlier model.lm.
# X1stFlrSF.log exists in df.tr but is not part of this formula.
model.lm <- lm(
    sale_price_log ~ OverallQual + Neighborhood.new + GrLivArea.log + X2ndFlrSF.log,
    data = df.tr
)


# One-row fit summary and the full coefficient table.
glance(model.lm)
tidy(model.lm)

# Hold-out error on the log scale.
y_predicted <- as.vector(predict(model.lm, df.te))
y_actual <- log(testset[["SalePrice"]])
rmse(y_predicted, y_actual)

# Mean absolute percentage error after mapping predictions back to prices.
sp_predicted <- exp(y_predicted)
sp_actual <- testset[["SalePrice"]]
mean(100 * abs(sp_actual - sp_predicted) / sp_actual)

r.squared,adj.r.squared,sigma,statistic,p.value,df,logLik,AIC,BIC,deviance,df.residual
0.821786,0.8208041,0.1710691,836.9385,3.698344e-270,5,255.9821,-499.9643,-472.3978,21.24613,726


term,estimate,std.error,statistic,p.value
(Intercept),2.68547163,0.356883505,7.524785,1.565977e-13
OverallQual,0.10007796,0.007335438,13.643079,6.806402e-38
Neighborhood.new,0.40180848,0.029131816,13.792771,1.3212099999999999e-38
GrLivArea.log,0.54423481,0.029830676,18.244133,1.685223e-61
X2ndFlrSF.log,-0.02067125,0.002435807,-8.486408,1.193861e-16
