In [2]:
# Switch the working directory to the `R` folder under the project root named
# by the ROOT environment variable; the relative `source('main.R')` call in the
# following cell is resolved against this directory.
if (!nzchar(Sys.getenv("ROOT"))) {
    # Without this guard an unset/empty ROOT would resolve to the absolute
    # path "/R" and either fail obscurely or enter an unrelated directory.
    stop(
        "Environment variable ROOT is not set; cannot locate the project's R directory.",
        call. = FALSE
    )
}
setwd(file.path(Sys.getenv("ROOT"), "R"))
getwd()

In [4]:
source("main.R")

# Raw train/test tables as returned by the project loader.
data <- kaggle.house$loadData()

# Stack both tables into one frame and pass it through the project's
# `na$fixAll` step (presumably missing-value repairs; it is defined outside
# this view -- confirm there).
df.combined <- kaggle.house$na$fixAll(
    kaggle.house$getCombinedDataset(data$train, data$test)
)

# Training rows: add the log of the sale price as the modelling target, then
# drop the source marker and the raw price.
df.training.raw <-
    df.combined %>%
    filter(dataSource == "train") %>%
    mutate(sale_price_log = log(SalePrice)) %>%
    select(-c(dataSource, SalePrice))

# Testing rows: same column removal, no target column is added.
df.testing.raw <-
    df.combined %>%
    filter(dataSource == "test") %>%
    select(-c(dataSource, SalePrice))

In [5]:
# Drop every column whose name appears in the project's type-1 or type-2
# transformation containers (presumably the non-numeric attributes that are
# handled by separate transformations -- confirm against their definition).
df.training.numeric <-
    df.training.raw %>%
    select(-dplyr::one_of(names(kaggle.house$trans$type1TransContainer))) %>%
    select(-dplyr::one_of(names(kaggle.house$trans$type2TransContainer)))

# Everything that is left must be numeric. On failure, name the offending
# columns so the problem can be located without re-inspecting the frame.
if (!all(purrr::map_lgl(df.training.numeric, is.numeric))) {
    stop(
        "Non-numeric columns remain in df.training.numeric: ",
        paste(names(Filter(Negate(is.numeric), df.training.numeric)), collapse = ", "),
        call. = FALSE
    )
}

# Show the surviving column names in alphabetical order.
sort(colnames(df.training.numeric))

In [49]:
# For each attribute of df.training.numeric, fit three single-predictor linear
# models of sale_price_log: on the raw value, on log(value + 1) and on
# sqrt(value). The R-squared of every fit is kept next to the model objects.
x <-
    df.training.numeric %>%
    gather(name, value, -sale_price_log) %>%
    mutate(
        value.log = log(value + 1),
        value.sqrt = sqrt(value)
    ) %>%
    group_by(name) %>%
    nest() %>%
    # Baseline: untransformed predictor.
    mutate(
        mod = map(data, ~ lm(sale_price_log ~ value, data = .)),
        glance = map(mod, broom::glance),
        r.squared = map_dbl(glance, "r.squared")
    ) %>%
    # Log-transformed predictor.
    mutate(
        mod.log = map(data, ~ lm(sale_price_log ~ value.log, data = .)),
        glance.log = map(mod.log, broom::glance),
        r.squared.log = map_dbl(glance.log, "r.squared")
    ) %>%
    # Square-root-transformed predictor.
    mutate(
        mod.sqrt = map(data, ~ lm(sale_price_log ~ value.sqrt, data = .)),
        glance.sqrt = map(mod.sqrt, broom::glance),
        r.squared.sqrt = map_dbl(glance.sqrt, "r.squared")
    ) %>%
    select(-data)

# Report the attributes for which at least one transformation beats the
# baseline fit. `r2` is the baseline R-squared in percent; each gain is the
# ratio of the transformed R-squared to the baseline R-squared.
x %>%
    mutate(
        r2 = 100 * r.squared,
        log.gain = r.squared.log / r.squared,
        sqrt.gain = r.squared.sqrt / r.squared,
        max.gain = pmax(log.gain, sqrt.gain)
    ) %>%
    select(name, r2, log.gain, sqrt.gain, max.gain) %>%
    filter(max.gain > 1) %>%
    arrange(desc(r2), max.gain)

#      gather(fun, value, -name) %>%
#      arrange(name, desc(value)) %>%
#      group_by(name) %>%
#      filter(row_number() == 1) %>%
#      arrange(desc(value)) %>%
#      filter(value > 1.2) %>%
#      select(`Attribute Name` = name, `Transformation` = fun, `Transformation R.Squared Gain` = value)

name,r2,log.gain,sqrt.gain,max.gain
GrLivArea,49.129817225,1.0854313,1.063073,1.085431
X1stFlrSF,35.638636813,1.0405171,1.039713,1.040517
TotRmsAbvGrd,28.560709183,1.0209757,1.019056,1.020976
Fireplaces,23.956074218,1.0811631,1.099671,1.099671
MasVnrArea,18.213701315,0.9379289,1.065036,1.065036
WoodDeckSF,11.164624765,1.0540031,1.124479,1.124479
OpenPorchSF,10.307501084,2.0524076,1.774221,2.052408
HalfBath,9.858485326,1.044712,1.06054,1.06054
LotArea,6.6213527,2.4154951,2.038663,2.415495
BsmtFullBath,5.580182959,1.0231674,1.027112,1.027112


In [35]:
# For each attribute of df.training.numeric, compare the baseline linear fit
# of sale_price_log on the raw value with two fits that add one extra term:
# value squared, or value raised to the power 1.5.
x <-
    df.training.numeric %>%
    gather(name, value, -sale_price_log) %>%
    mutate(
        value.x2 = value * value,
        value.x15 = value^1.5
    ) %>%
    group_by(name) %>%
    nest() %>%
    # Baseline: linear term only.
    mutate(
        mod = map(data, ~ lm(sale_price_log ~ value, data = .)),
        glance = map(mod, broom::glance),
        r.squared = map_dbl(glance, "r.squared")
    ) %>%
    # Linear term plus the squared term.
    mutate(
        mod.x2 = map(data, ~ lm(sale_price_log ~ value + value.x2, data = .)),
        glance.x2 = map(mod.x2, broom::glance),
        r.squared.x2 = map_dbl(glance.x2, "r.squared")
    ) %>%
    # Linear term plus the power-1.5 term.
    mutate(
        mod.x15 = map(data, ~ lm(sale_price_log ~ value + value.x15, data = .)),
        glance.x15 = map(mod.x15, broom::glance),
        r.squared.x15 = map_dbl(glance.x15, "r.squared")
    ) %>%
    # Drop the nested data and every column whose name contains "mod".
    select(-data, -contains("mod"))

# Keep the attributes where either extended fit raises R-squared by more than
# 20% relative to the baseline; `x2` and `x15` are those R-squared ratios.
x %>%
    mutate(
        x2 = r.squared.x2 / r.squared,
        x15 = r.squared.x15 / r.squared
    ) %>%
    select(name, x2, x15) %>%
    filter(x2 > 1.2 | x15 > 1.2)

name,x2,x15
LotFrontage,1.30184,1.716061
LotArea,2.036836,2.261263
OverallCond,22.680661,26.521123
BsmtFinSF2,364.816081,361.47433
BsmtUnfSF,1.303291,1.196365
X2ndFlrSF,1.850655,1.95189
LowQualFinSF,4.086344,4.030085
BsmtHalfBath,1.601609,1.601609
GarageYrBlt,2.933359,2.928227
OpenPorchSF,1.731154,1.911578
