In [1]:
# Switch into the project's R directory; later cells load 'main.R' by a
# relative path. Sys.getenv() returns "" for an unset variable, which would
# otherwise turn the target into "/R", so fail early with a clear message.
if (!nzchar(Sys.getenv('ROOT'))) {
    stop("Environment variable ROOT is not set", call. = FALSE)
}
setwd(file.path(Sys.getenv('ROOT'), 'R'))
# Echo the resulting directory as the cell output.
getwd()

In [9]:
library(Metrics)
source('main.R')

# Raw data as returned by the project loader; the `train` and `test`
# elements are the ones used below.
data <- kaggle.house$loadData()

# Combine the train and test parts into one frame and run the project's
# NA-fixing routine over the result.
df.combined <- kaggle.house$na$fixAll(
    kaggle.house$getCombinedDataset(data$train, data$test)
)

# Keep only the rows flagged as training data, add the log of the sale price
# as `sale_price_log`, and drop the source flag and the raw price.
df.training <- df.combined %>%
    filter(dataSource == "train") %>%
    mutate(sale_price_log = log(SalePrice)) %>%
    select(-dataSource, -SalePrice)

In [10]:
# main.R is sourced again here (presumably to pick up edits made to it
# between cell runs -- confirm before removing).
source('main.R')
trainset <- kaggle.house$trans$doItAll(df.training)

# The regressions below need every column to be numeric. Collect the names of
# any columns that are not, so a failure says exactly which ones to fix.
non.numeric <- names(trainset)[!vapply(trainset, is.numeric, logical(1))]
if (length(non.numeric) > 0) {
    stop(
        "Non-numeric columns in trainset: ",
        paste(non.numeric, collapse = ", "),
        call. = FALSE
    )
}

In [19]:
# Rank predictors by how much of `sale_price_log` each one explains alone.
# Every column except the target is reshaped to long form (name/value), then
# two one-variable linear models are fitted per predictor: one on the raw
# value and one on log(value + 1). Predictors whose better R^2 exceeds 0.2
# are listed, best first.
trainset %>%
    gather(name, value, -sale_price_log) %>%
    mutate(value.log = log(value + 1)) %>%
    group_by(name) %>%
    nest() %>%
    mutate(
        # R^2 of the fit on the untransformed predictor.
        r2 = map_dbl(data, function(d) {
            broom::glance(lm(sale_price_log ~ value, data = d))$r.squared
        }),
        # R^2 of the fit on the log-transformed predictor.
        r2.log = map_dbl(data, function(d) {
            broom::glance(lm(sale_price_log ~ value.log, data = d))$r.squared
        }),
        # Element-wise maximum of the two scores.
        best.r2 = pmax(r2, r2.log)
    ) %>%
    select(name, r2, r2.log, best.r2) %>%
    arrange(desc(best.r2)) %>%
    filter(best.r2 > 0.2)
    

name,r2,r2.log,best.r2
OverallQual,0.6677904,0.6433122,0.6677904
Neighborhood.new,0.5640165,0.5640765,0.5640765
GrLivArea,0.4912982,0.5332704,0.5332704
GarageCars,0.4632501,0.4116007,0.4632501
ExterQual.new,0.4603784,0.4606815,0.4606815
BsmtQual.new,0.4512077,0.4516368,0.4516368
KitchenQual.new,0.443775,0.4444692,0.4444692
GarageArea,0.4236546,0.2069329,0.4236546
GarageFinish.new,0.3807905,0.3807381,0.3807905
TotalBsmtSF,0.374708,0.1390081,0.374708
