In [2]:
# Work from the project directory so the relative source() below resolves.
setwd(file.path(Sys.getenv("R_SOURCES"), "house_prices"))
source("main.R")
house_prices$helpers$import_libs()

# Load the raw data, then apply outlier removal and missing-value fixes,
# and finally drop the Id column.
combined_dataset <-
  house_prices$helpers$load_data() %>%
  house_prices$outliers$remove_outliers() %>%
  house_prices$missing$fix_all() %>%
  select(-Id)

test_that("should be no NA values except SalePrice column", {
  # SalePrice is excluded from the check; every other column must be complete.
  predictors <- select(combined_dataset, -SalePrice)
  expect_equal(sum(is.na(predictors)), 0)
})

# Dimensions after cleaning vs. dimensions of the raw data.
dim(combined_dataset)
dim(house_prices$helpers$load_data())

### Passing named list to mutate (and probably other dplyr verbs)
https://community.rstudio.com/t/passing-named-list-to-mutate-and-probably-other-dplyr-verbs/2553/6

In [6]:
# Candidate transformations: a short name paired with the function that
# applies it to a numeric vector.
# NOTE: despite its name, "invcube" computes the cube root, x^(1/3).
Trans <- tribble(
  ~tran_name, ~tran_fn,
  "log",      function(x) log(x + 1),
  "sqrt",     function(x) sqrt(x),
  "invcube",  function(x) x^(1 / 3)
)

# Smoke-check every transformation on a single value.
# seq_len() is used instead of 1:nrow(Trans) so that an empty table yields
# zero iterations rather than iterating over c(1, 0).
for (row in seq_len(nrow(Trans))) {
  print(Trans[[row, "tran_fn"]](4))
}

Trans

[1] 1.609438
[1] 2
[1] 1.587401


tran_name,tran_fn
log,"function (x) , log(x + 1)"
sqrt,"function (x) , sqrt(x)"
invcube,"function (x) , x^(1/3)"


In [22]:
# (Re)load the module; `$value` is the value of the file's last expression.
mod <- source("transform_numeric_vars.R", local = TRUE)$value

# Step 1 is run on the numeric columns only, with SalePrice excluded.
df1 <- mod$calc_tran_config_step1(
  dataset = select_if(select(combined_dataset, -SalePrice), is.numeric),
  trans = Trans
)

head(df1)

var,x,log,sqrt,invcube
LotFrontage,65,4.189655,8.062258,4.020726
LotFrontage,80,4.394449,8.944272,4.308869
LotFrontage,68,4.234107,8.246211,4.081655
LotFrontage,60,4.110874,7.745967,3.914868
LotFrontage,84,4.442651,9.165151,4.379519
LotFrontage,85,4.454347,9.219544,4.39683


In [24]:
# (Re)load the module; `$value` is the value of the file's last expression.
mod <- source("transform_numeric_vars.R", local = TRUE)$value

df2 <- mod$calc_tran_config_step2(df1)

# Peek at the first two rows for LotArea. The two filters are kept as
# separate steps so row_number() is evaluated after the `var` filter.
# NOTE(review): if df2 is grouped, row_number() counts within each group --
# confirm against calc_tran_config_step2.
df2 %>%
  filter(var == "LotArea") %>%
  filter(row_number() <= 2)

var,tran,value,value_normed
LotArea,x,8450.0,-0.21639955
LotArea,x,9600.0,-0.06909653
LotArea,log,9.04204,-0.10174374
LotArea,log,9.169623,0.14940974
LotArea,sqrt,91.923882,-0.21081495
LotArea,sqrt,97.97959,0.02444502
LotArea,invcube,20.368181,-0.18204458
LotArea,invcube,21.253171,0.06700854


In [29]:
# (Re)load the module; `$value` is the value of the file's last expression.
mod <- source("transform_numeric_vars.R", local = TRUE)$value

df3 <- mod$calc_tran_config_step3(df2)

# Inspect the step-3 rows for a single variable.
filter(df3, var == "YrSold")

var,tran,value_normed,k
YrSold,invcube,-1.3633378,619
YrSold,invcube,-0.6027227,691
YrSold,invcube,0.1576398,621
YrSold,invcube,0.9177499,647
YrSold,invcube,1.6776078,339
YrSold,log,-1.363451,619
YrSold,log,-0.602662,691
YrSold,log,0.1577483,621
YrSold,log,0.9177802,647
YrSold,log,1.677434,339


In [32]:
# (Re)load the module; `$value` is the value of the file's last expression.
mod <- source("transform_numeric_vars.R", local = TRUE)$value

df4 <- mod$calc_tran_config_step4(df3)

# Inspect the step-4 rows for a single variable.
filter(df4, var == "LotArea")

var,tran,L2_distance
LotArea,invcube,10.78862
LotArea,log,13.31376
LotArea,sqrt,13.39006
LotArea,x,50.03278


In [40]:
# (Re)load the module; `$value` is the value of the file's last expression.
mod <- source("transform_numeric_vars.R", local = TRUE)$value

df5 <- mod$calc_tran_config_step5(df4)

# Inspect the step-5 rows for a handful of variables.
vars_of_interest <- c("LotArea", "GrLivArea", "OverallQual")
filter(df5, var %in% vars_of_interest)

var,best_tran
GrLivArea,"log , 91.1579415746179"
LotArea,"invcube , 78.436890821096"
OverallQual,"log , 29.9443565176069"


In [51]:
# (Re)load the module; `$value` is the value of the file's last expression.
mod <- source("transform_numeric_vars.R", local = TRUE)$value

# Config built from the last intermediate result of the step-by-step run.
tranConfig1 <- mod$calc_tran_config_step6(df5)

# Config built in a single call from the same inputs that were given to step 1.
tranConfig2 <- mod$get_transformation_config(
  dataset = select_if(select(combined_dataset, -SalePrice), is.numeric),
  trans = Trans
)

test_that("should be equal", {
  # The tran_fn column is dropped from both sides before comparing.
  expect_equal(
    select(tranConfig1, -tran_fn),
    select(tranConfig2, -tran_fn)
  )
})

tranConfig2

var,tran_name,progress_score,tran_fn
GrLivArea,log,91.15794157,"function (x) , log(x + 1)"
X1stFlrSF,log,90.77165686,"function (x) , log(x + 1)"
BsmtUnfSF,sqrt,79.9760307,"function (x) , sqrt(x)"
LotArea,invcube,78.43689082,"function (x) , x^(1/3)"
TotRmsAbvGrd,log,30.68501506,"function (x) , log(x + 1)"
OverallQual,log,29.94435652,"function (x) , log(x + 1)"
GarageCars,log,24.95676474,"function (x) , log(x + 1)"
BedroomAbvGr,log,22.37508601,"function (x) , log(x + 1)"
OverallCond,log,16.24723935,"function (x) , log(x + 1)"
FullBath,invcube,8.79733251,"function (x) , x^(1/3)"


In [144]:
# (Re)load the module; `$value` is the value of the file's last expression.
mod <- source("transform_numeric_vars.R", local = TRUE)$value

# Time both implementations on the same inputs.
# NOTE: df1 and df2 are reassigned here, replacing the step-1 / step-2
# intermediates created in earlier cells.
system.time(df1 <- mod$apply_transform(combined_dataset, tranConfig1))

system.time(df2 <- mod$apply_transform2(combined_dataset, tranConfig1))

test_that("should be equal", {
  expect_equal(object = df1, expected = df2)
})

   user  system elapsed 
  0.004   0.000   0.004 

   user  system elapsed 
   0.02    0.00    0.02 