In [1]:
# Move into the project's R directory, located through the ROOT environment
# variable, and echo the resulting working directory.
root.dir <- Sys.getenv('ROOT')
if (!nzchar(root.dir)) {
    # Without this check an unset ROOT would silently turn into setwd('/R').
    stop("Environment variable ROOT is not set; cannot locate the project's R directory.",
         call. = FALSE)
}
setwd(paste0(root.dir, '/R'))
getwd()

In [33]:
# Load the project code; presumably this defines `kaggle.house` and attaches
# the pipe / dplyr verbs used below (verify against main.R).
source('main.R')

data <- kaggle.house$loadData()

# Combine the train and test sets, then run the project's NA fixes over the
# combined frame.
df.combined <- kaggle.house$na$fixAll(
    kaggle.house$getCombinedDataset(data$train, data$test)
)

# Training rows: add the log of SalePrice as the response, then drop the raw
# price and the source tag.
df.training.raw <- df.combined %>%
    filter(dataSource == "train") %>%
    mutate(sale_price_log = log(SalePrice)) %>%
    select(-dataSource, -SalePrice)

# Test rows: drop the source tag and the SalePrice column.
df.testing.raw <- df.combined %>%
    filter(dataSource == "test") %>%
    select(-dataSource, -SalePrice)

# Run the project's transformation step on both frames and unpack the result.
tran.res <- kaggle.house$trans$doItAll(df.training.raw, df.testing.raw)
df.training <- tran.res$df.training
df.testing <- tran.res$df.testing

# Sanity check: after the transformations every column of both frames must be
# numeric (the lm() fits below use all columns as predictors).
stopifnot(all(vapply(df.training, is.numeric, logical(1))))
stopifnot(all(vapply(df.testing, is.numeric, logical(1))))

# Columns that get a log(1 + x) transform in the "2" variants of the data.
# Kept in one place so the training and testing frames cannot drift apart.
log1p.columns <- c(
    "BsmtFinSF2",
    "MiscVal",
    "LotArea",
    "LowQualFinSF",
    "OpenPorchSF",
    "EnclosedPorch"
)

# Replace each of `columns` in `df` with log1p() of its values.
#
# df:      data frame containing every column named in `columns`.
# columns: character vector of column names to transform.
# Returns `df` with the same columns in the same order; only the named
# columns are changed. Errors if a named column is missing.
applyLog1p <- function(df, columns) {
    # log1p(x) computes log(1 + x) without the precision loss of forming 1 + x.
    df[columns] <- lapply(df[columns], log1p)
    df
}

df.training2 <- applyLog1p(df.training, log1p.columns)
df.testing2 <- applyLog1p(df.testing, log1p.columns)

In [91]:
# Draw 20 random 50% partitions of the training rows. With list = FALSE the
# result is a matrix with one column of row indices per resample.
# NOTE(review): no set.seed() is visible in this cell, so the partitions (and
# everything derived from them) presumably change from run to run.
partitions <- caret::createDataPartition(
    y = df.training$sale_price_log, p = .5, list = FALSE, times = 20
)

# One data-frame column per resample, so purrr::map() iterates over resamples
# and the resulting lists keep the resample names.
partition.samples <- as.data.frame(partitions)

# Models on the untransformed predictors: `trains` is fit on the sampled rows,
# `tests` is fit on the complementary rows (it holds models, not predictions).
trains <- purrr::map(partition.samples,
                     function (sample) { lm(sale_price_log ~ ., data=df.training[sample,]) })
tests <- purrr::map(partition.samples,
                    function (sample) { lm(sale_price_log ~ ., data=df.training[-sample,]) })

# Same fits on the frame with the log-transformed predictors.
trains2 <- purrr::map(partition.samples,
                      function (sample) { lm(sale_price_log ~ ., data=df.training2[sample,]) })
tests2 <- purrr::map(partition.samples,
                     function (sample) { lm(sale_price_log ~ ., data=df.training2[-sample,]) })

In [92]:
# Average R^2 over the resampled fits: first for the untransformed predictors,
# then for the log-transformed ones.
mean(purrr::map_dbl(trains, function (model) { summary(model)$r.squared }))
mean(purrr::map_dbl(trains2, function (model) { summary(model)$r.squared }))

In [99]:
# Interactive help lookup for `union`.
help("union")

In [98]:
# For every resampled model, list the terms whose p-value is below 0.2.
purrr::map(trains, function (model) {
    weakly.significant <- filter(tidy(model), p.value < 0.2)
    select(weakly.significant, term)
})

term
LotArea
OverallQual
OverallCond
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF
X2ndFlrSF
BsmtFullBath
FullBath

term
(Intercept)
LotFrontage
LotArea
OverallQual
OverallCond
YearBuilt
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF

term
LotArea
OverallQual
OverallCond
YearRemodAdd
X1stFlrSF
X2ndFlrSF
LowQualFinSF
BsmtFullBath
BsmtHalfBath
HalfBath

term
LotFrontage
LotArea
OverallQual
OverallCond
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF
X2ndFlrSF
LowQualFinSF

term
(Intercept)
LotArea
OverallQual
OverallCond
BsmtFinSF1
BsmtFinSF2
X1stFlrSF
X2ndFlrSF
LowQualFinSF
BsmtFullBath

term
LotArea
OverallQual
OverallCond
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF
X2ndFlrSF
BsmtFullBath
FullBath

term
LotArea
OverallQual
OverallCond
YearBuilt
YearRemodAdd
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF
X2ndFlrSF

term
LotArea
OverallQual
OverallCond
YearRemodAdd
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF
X2ndFlrSF
BsmtFullBath

term
LotArea
OverallQual
OverallCond
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF
X2ndFlrSF
FullBath
HalfBath

term
LotFrontage
LotArea
OverallQual
OverallCond
YearRemodAdd
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF
X2ndFlrSF

term
(Intercept)
LotArea
OverallQual
OverallCond
YearRemodAdd
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF
X2ndFlrSF

term
(Intercept)
LotFrontage
LotArea
OverallQual
OverallCond
YearRemodAdd
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF

term
(Intercept)
LotArea
OverallQual
OverallCond
YearBuilt
YearRemodAdd
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF

term
LotFrontage
LotArea
OverallQual
OverallCond
YearBuilt
YearRemodAdd
X1stFlrSF
X2ndFlrSF
LowQualFinSF
BsmtFullBath

term
(Intercept)
LotArea
OverallQual
OverallCond
BsmtFinSF2
X1stFlrSF
X2ndFlrSF
LowQualFinSF
BsmtFullBath
HalfBath

term
OverallQual
OverallCond
YearRemodAdd
BsmtFinSF1
BsmtFinSF2
X1stFlrSF
X2ndFlrSF
BsmtFullBath
FullBath
HalfBath

term
LotArea
OverallQual
OverallCond
YearBuilt
BsmtFinSF1
BsmtFinSF2
X1stFlrSF
X2ndFlrSF
HalfBath
BedroomAbvGr

term
LotFrontage
LotArea
OverallQual
OverallCond
YearBuilt
YearRemodAdd
BsmtFinSF1
BsmtFinSF2
X1stFlrSF
X2ndFlrSF

term
LotFrontage
LotArea
OverallQual
OverallCond
YearRemodAdd
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF
X2ndFlrSF

term
LotArea
OverallQual
OverallCond
YearRemodAdd
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF
X2ndFlrSF
BsmtFullBath


In [88]:
# Coefficient table of the first resampled model, then only the rows whose
# p-value is below 0.2.
x <- trains[[1]] %>% tidy
filter(x, p.value < 0.2)

term,estimate,std.error,statistic,p.value
LotArea,1.137148e-06,8.13701e-07,1.397501,0.1627377
OverallQual,0.05936847,0.007585392,7.826685,2.025456e-14
OverallCond,0.03650917,0.006190921,5.897212,5.932018e-09
X1stFlrSF,0.0002753192,3.505241e-05,7.854499,1.654241e-14
X2ndFlrSF,0.0002258774,3.048919e-05,7.408444,3.965819e-13
LowQualFinSF,0.0001889051,0.0001240109,1.523293,0.1281694
BsmtFullBath,0.05675407,0.01465853,3.871745,0.0001189279
HalfBath,0.02105185,0.01471698,1.430447,0.1530672
GarageCars,0.05788703,0.01631102,3.548952,0.0004145159
WoodDeckSF,7.293203e-05,4.520412e-05,1.613394,0.1071422


In [36]:
# Single 50% partition of the training rows. FALSE is spelled out because `F`
# is an ordinary variable that can be reassigned.
# NOTE(review): no set.seed() is visible here, so the split presumably differs
# between runs.
partition <- caret::createDataPartition(y = df.training$sale_price_log, p = .5, list = FALSE)

# Rows selected by the partition vs. the remaining rows.
df.training.train <- df.training[partition,]
df.training.test <- df.training[-partition,]

# Linear model of log sale price on all other columns, fit on the selected rows.
model_1 <- lm(sale_price_log ~ ., data=df.training.train)
summary(model_1)


Call:
lm(formula = sale_price_log ~ ., data = df.training.train)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.75363 -0.05871  0.00750  0.06838  0.44689 

Coefficients: (3 not defined because of singularities)
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)            -2.953e+00  8.267e+00  -0.357 0.721072    
LotFrontage             6.121e-05  1.561e-04   0.392 0.695181    
LotArea                 2.327e-06  8.798e-07   2.646 0.008352 ** 
OverallQual             5.658e-02  7.322e-03   7.727 4.15e-14 ***
OverallCond             3.682e-02  6.270e-03   5.873 6.83e-09 ***
YearBuilt              -6.675e-05  4.484e-04  -0.149 0.881707    
YearRemodAdd            7.230e-04  3.893e-04   1.857 0.063764 .  
MasVnrArea             -5.209e-06  3.565e-05  -0.146 0.883863    
BsmtFinSF1              4.606e-05  2.787e-05   1.652 0.098924 .  
BsmtFinSF2              4.480e-05  4.082e-05   1.098 0.272716    
BsmtUnfSF               1.815e-05  2.696e-05   0

In [37]:
# Apply the same `partition` row indices to the frame with log-transformed
# predictors, so model_2 is fit on the same rows as the model built from
# df.training.
df.training2.train <- df.training2[partition,]
df.training2.test <- df.training2[-partition,]

# Linear model of log sale price on all other columns of the transformed frame.
# Uses `<-` for assignment, matching the rest of the file.
model_2 <- lm(sale_price_log ~ ., data=df.training2.train)
summary(model_2)


Call:
lm(formula = sale_price_log ~ ., data = df.training2.train)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.74502 -0.05587  0.00842  0.06595  0.46684 

Coefficients: (1 not defined because of singularities)
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)            -2.610e+00  8.193e+00  -0.319 0.750204    
LotFrontage            -3.977e-05  1.574e-04  -0.253 0.800583    
LotArea                 5.362e-02  1.365e-02   3.929 9.44e-05 ***
OverallQual             5.952e-02  7.287e-03   8.168 1.63e-15 ***
OverallCond             3.603e-02  6.174e-03   5.835 8.47e-09 ***
YearBuilt              -4.086e-05  4.453e-04  -0.092 0.926909    
YearRemodAdd            7.995e-04  3.851e-04   2.076 0.038300 *  
MasVnrArea             -6.683e-07  3.546e-05  -0.019 0.984969    
BsmtFinSF1             -9.541e-06  6.949e-05  -0.137 0.890831    
BsmtFinSF2             -1.197e-03  6.016e-03  -0.199 0.842323    
BsmtUnfSF              -4.282e-05  6.992e-05  -