In [6]:
# Setup: move into the project directory, load the project's module list and
# build the cleaned dataset that the cells below work on.

# Fail fast when R_SOURCES is not configured. Without this check the path
# collapses to "/house_prices" and setwd() reports an unrelated-looking error.
if (!nzchar(Sys.getenv("R_SOURCES"))) {
    stop("Environment variable R_SOURCES is not set", call. = FALSE)
}
setwd(file.path(Sys.getenv("R_SOURCES"), "house_prices"))

# source() returns list(value, visible); `value` is the result of the last
# expression in main.R, which is used below as a nested list of helpers.
house_prices <- source("main.R", local = TRUE)$value

# Load -> drop outliers -> repair missing values -> drop the Id column.
# The parentheses around the `$` expressions make magrittr evaluate them to
# functions before piping into them.
combined_dataset <-
    house_prices$helpers$load_data() %>%
    (house_prices$outliers$remove_outliers) %>%
    (house_prices$missing$fix_all) %>%
    select(-Id)

# SalePrice is the only column allowed to contain NA after the fix-up step.
test_that("should be no NA values except SalePrice column", {
    expect_equal(sum(is.na(combined_dataset %>% select(-SalePrice))), 0)
})

dim(combined_dataset)

### Passing named list to mutate (and probably other dplyr verbs)
https://community.rstudio.com/t/passing-named-list-to-mutate-and-probably-other-dplyr-verbs/2553/6

In [2]:
# Candidate transformations for numeric variables: one row per transform,
# holding a short name and the function itself (stored in a list column).
# Note: 'inv3' is the cube root, x^(1/3); 'log' is shifted by 1 so that
# zero values stay finite.
Trans <- tribble(
    ~tran_name, ~tran_fn,
    'log',      function(x) log(x + 1),
    'sqrt',     function(x) sqrt(x),
    'inv3',     function(x) x^(1 / 3)
)

# Smoke check: apply every transform to a single value and print the result.
# seq_len() iterates zero times on an empty table, whereas 1:nrow() would
# walk over c(1, 0).
for (row in seq_len(nrow(Trans))) {
    print(Trans[[row, "tran_fn"]](4))
}

Trans

[1] 1.609438
[1] 2
[1] 1.587401


tran_name,tran_fn
log,"function (x) , log(x + 1)"
sqrt,"function (x) , sqrt(x)"
inv3,"function (x) , x^(1/3)"


In [8]:
# Load the numeric-transformation helpers; `$value` is the result of the last
# expression evaluated in the sourced file.
TN <- source("transform_numeric_vars.R", local = TRUE)$value

# Step 1 receives only the numeric predictors (SalePrice excluded) together
# with the table of candidate transforms. The printed result has one row per
# (variable, observation) with the raw value `x` and one column per transform.
df1 <- TN$calc_tran_config_step1(
    dataset = select_if(select(combined_dataset, -SalePrice), is.numeric),
    trans = Trans
)

head(df1)

var,x,log,sqrt,inv3
LotFrontage,65,4.189655,8.062258,4.020726
LotFrontage,80,4.394449,8.944272,4.308869
LotFrontage,68,4.234107,8.246211,4.081655
LotFrontage,60,4.110874,7.745967,3.914868
LotFrontage,84,4.442651,9.165151,4.379519
LotFrontage,85,4.454347,9.219544,4.39683


In [9]:
# Step 2: the printed result is in long form with columns
# var, tran, value and value_normed.
df2 <- TN$calc_tran_config_step2(df1)

# Peek at LotArea. The two filters are kept separate on purpose: the output
# shows two rows per transform, which suggests df2 is grouped and that
# row_number() is evaluated within each group -- confirm in
# transform_numeric_vars.R before merging them.
df2 %>%
    filter(var == "LotArea") %>%
    filter(row_number() <= 2)

var,tran,value,value_normed
LotArea,x,8450.0,-0.21639955
LotArea,x,9600.0,-0.06909653
LotArea,log,9.04204,-0.10174374
LotArea,log,9.169623,0.14940974
LotArea,sqrt,91.923882,-0.21081495
LotArea,sqrt,97.97959,0.02444502
LotArea,inv3,20.368181,-0.18204458
LotArea,inv3,21.253171,0.06700854


In [10]:
# Step 3: the printed result has columns var, tran, value_normed and k.
# NOTE(review): k looks like the number of observations sharing each
# normalised value -- confirm in transform_numeric_vars.R.
df3 <- TN$calc_tran_config_step3(df2)

# Inspect a variable with only a handful of distinct values.
filter(df3, var == "YrSold")

var,tran,value_normed,k
YrSold,inv3,-1.3633378,619
YrSold,inv3,-0.6027227,691
YrSold,inv3,0.1576398,621
YrSold,inv3,0.9177499,647
YrSold,inv3,1.6776078,339
YrSold,log,-1.363451,619
YrSold,log,-0.602662,691
YrSold,log,0.1577483,621
YrSold,log,0.9177802,647
YrSold,log,1.677434,339


In [11]:
# Step 4: the printed result holds one L2_distance per (var, tran) pair,
# including the untransformed baseline `x`.
df4 <- TN$calc_tran_config_step4(df3)

# Compare the candidate transforms for a single variable.
filter(df4, var == "LotArea")

var,tran,L2_distance
LotArea,inv3,10.78862
LotArea,log,13.31376
LotArea,sqrt,13.39006
LotArea,x,50.03278


In [30]:
# Step 5: the printed result has one row per variable with a `best_tran`
# entry that pairs a transform name with a numeric score.
df5 <- TN$calc_tran_config_step5(df4)

# Spot-check a few well-known variables.
filter(df5, var %in% c("LotArea", "GrLivArea", "OverallQual"))

var,best_tran
GrLivArea,"log , 91.1579415746179"
LotArea,"inv3 , 78.436890821096"
OverallQual,"log , 29.9443565176069"


In [53]:
# Final step of the manual, step-by-step pipeline built in the cells above.
tranConfig1 <- TN$calc_tran_config_step6(df5, Trans)

# The same configuration obtained through the single entry point.
tranConfig2 <- TN$get_transformation_config(
    dataset = select_if(select(combined_dataset, -SalePrice), is.numeric),
    trans = Trans
)

# Both routes must agree. The function column `tran_fn` is dropped on both
# sides so that only the plain data columns are compared.
test_that("should be equal", {
    expect_equal(
        tranConfig1 %>% select(-tran_fn), 
        tranConfig2 %>% select(-tran_fn)
    )
})

tranConfig2

# Filter the configuration by R^2 against log(SalePrice); the printed result
# adds the columns r2_x and r2_tran.
TN$filter_tran_config_by_r2(
    tranConfig2,
    dataset = mutate(combined_dataset, price_log = log(SalePrice)),
    target_var = price_log
)

var,tran_name,progress_score,tran_fn
GrLivArea,log,91.15794157,"function (x) , log(x + 1)"
X1stFlrSF,log,90.77165686,"function (x) , log(x + 1)"
BsmtUnfSF,sqrt,79.9760307,"function (x) , sqrt(x)"
LotArea,inv3,78.43689082,"function (x) , x^(1/3)"
TotRmsAbvGrd,log,30.68501506,"function (x) , log(x + 1)"
OverallQual,log,29.94435652,"function (x) , log(x + 1)"
GarageCars,log,24.95676474,"function (x) , log(x + 1)"
BedroomAbvGr,log,22.37508601,"function (x) , log(x + 1)"
OverallCond,log,16.24723935,"function (x) , log(x + 1)"
FullBath,inv3,8.79733251,"function (x) , x^(1/3)"


var,tran_name,progress_score,tran_fn,r2_x,r2_tran
GrLivArea,log,91.157942,"function (x) , log(x + 1)",0.52593095,0.5438044
LotArea,inv3,78.436891,"function (x) , x^(1/3)",0.06788313,0.154396
TotRmsAbvGrd,log,30.685015,"function (x) , log(x + 1)",0.28912299,0.2938676
Fireplaces,inv3,3.739704,"function (x) , x^(1/3)",0.24206172,0.2655916


In [51]:
# Re-source both files so that the latest edits to them are picked up.
TN <- source("transform_numeric_vars.R", local = TRUE)$value
house_prices <- source("main.R", local = TRUE)$value

Trans

# Same configuration, this time through the project-level entry point.
# The target is log(SalePrice); the raw SalePrice column is removed from the
# data that is passed in.
tranConfig3 <- house_prices$trans$numeric$get_transformation_config(
    data = combined_dataset %>%
        mutate(price_log = log(SalePrice)) %>%
        select(-SalePrice),
    target_var = price_log,
    trans = Trans,
    threshold = 0
)

tranConfig3

tran_name,tran_fn
log,"function (x) , log(x + 1)"
sqrt,"function (x) , sqrt(x)"
inv3,"function (x) , x^(1/3)"


var,tran_name,progress_score,tran_fn,r2_x,r2_tran
GrLivArea,log,91.157942,"function (x) , log(x + 1)",0.52593095,0.5438044
LotArea,inv3,78.436891,"function (x) , x^(1/3)",0.06788313,0.154396
TotRmsAbvGrd,log,30.685015,"function (x) , log(x + 1)",0.28912299,0.2938676
Fireplaces,inv3,3.739704,"function (x) , x^(1/3)",0.24206172,0.2655916


In [48]:
# Time the construction of the transformation config ...
system.time(
    tranConfig <- TN$get_transformation_config(
        dataset = select_if(select(combined_dataset, -SalePrice), is.numeric),
        trans = Trans
    )
)

# ... and the two alternative implementations that apply it.
# Note: df1 and df2 are reassigned here and no longer hold the step 1 / step 2
# results from the earlier cells.
system.time(
    df1 <- TN$apply_transform(combined_dataset, tranConfig)
)

system.time(
    df2 <- TN$apply_transform2(combined_dataset, tranConfig)
)

# Both implementations must produce the same data.
test_that("should be equal", {
    expect_equal(df1, df2)
})

   user  system elapsed 
  0.642   0.000   0.642 

   user  system elapsed 
  0.003   0.000   0.003 

   user  system elapsed 
  0.019   0.000   0.019 

In [52]:
# Before/after view of the transformation on two columns.
# Uses `<-` for assignment, consistent with the rest of the notebook.
df <- combined_dataset %>% select(GrLivArea, LotArea)
head(df)

# Per the config printed earlier, GrLivArea gets log(x + 1) and LotArea gets
# x^(1/3).
head(TN$apply_transform(df, tranConfig))

GrLivArea,LotArea
1710,8450
1262,9600
1786,11250
1717,9550
2198,14260
1362,14115


GrLivArea,LotArea
7.444833,20.36818
7.141245,21.25317
7.488294,22.40702
7.448916,21.21621
7.695758,24.24971
7.217443,24.16723
