In [7]:
# Switch to the project directory first: main.R is sourced by a relative path.
setwd(paste0(Sys.getenv('R_SOURCES'), '/house_prices'))
source('main.R')
house_prices$helpers$import_libs()

# Build the working dataset step by step: load, apply the project's
# remove_outliers and fix_all steps (in that order), then drop the Id column.
combined_dataset <- house_prices$helpers$load_data()
combined_dataset <- house_prices$outliers$remove_outliers(combined_dataset)
combined_dataset <- house_prices$missing$fix_all(combined_dataset)
combined_dataset <- select(combined_dataset, -Id)

# Sanity check: SalePrice is the only column allowed to contain NA.
test_that("should be no NA values except SalePrice column", {
    predictors <- select(combined_dataset, -SalePrice)
    expect_equal(sum(is.na(predictors)), 0)
})

# Compare dimensions of the processed data with a fresh, unprocessed load.
dim(combined_dataset)
dim(house_prices$helpers$load_data())

### Passing named list to mutate (and probably other dplyr verbs)
https://community.rstudio.com/t/passing-named-list-to-mutate-and-probably-other-dplyr-verbs/2553/6

In [8]:
# Candidate transformations: one row per transform, pairing a short label
# (tran_name) with the function implementing it (tran_defin, a list column).
fn_df <- tribble(
    ~tran_name, ~tran_defin,
    'log',      function(x) log(x + 1),
    'sqrt',     function(x) sqrt(x),
    'invcube',  function(x) x^(1/3)
)

# Smoke check: evaluate every stored function at 1 and print the result.
for (tran_fn in fn_df$tran_defin) {
    print(tran_fn(1))
}

[1] 0.6931472
[1] 1
[1] 1


In [9]:
# Long table of the numeric predictors: one row per (variable name, value),
# with SalePrice excluded and missing values dropped.
df1 <- combined_dataset %>%
    select(-SalePrice) %>%
    select_if(is.numeric) %>%
    gather(var, x) %>%
    filter(!is.na(x))

df1 %>% head
df1 %>% nrow

# Add one column per candidate transform, named after the transform and
# holding that transform applied to the raw values in `x`.
# seq_len() keeps the loop a no-op for an empty fn_df (1:0 would run twice),
# and `[[` pulls the label / function out of the columns directly.
for (row in seq_len(nrow(fn_df))) {
    df1[[fn_df$tran_name[[row]]]] <- fn_df$tran_defin[[row]](df1$x)
}

df1 %>% head

var,x
LotFrontage,65
LotFrontage,80
LotFrontage,68
LotFrontage,60
LotFrontage,84
LotFrontage,85


var,x,log,sqrt,invcube
LotFrontage,65,4.189655,8.062258,4.020726
LotFrontage,80,4.394449,8.944272,4.308869
LotFrontage,68,4.234107,8.246211,4.081655
LotFrontage,60,4.110874,7.745967,3.914868
LotFrontage,84,4.442651,9.165151,4.379519
LotFrontage,85,4.454347,9.219544,4.39683


In [26]:
# Stack the raw column `x` and every transform column into (tran, value)
# pairs, then standardise each (var, tran) series: subtract its mean and
# divide by its standard deviation. The result stays grouped by (var, tran).
df2 <- df1 %>%
    gather(tran, value, -var) %>%
    group_by(var, tran) %>%
    mutate(value_normed = (value - mean(value)) / sd(value)) %>%
    select(var, tran, value_normed)

# Peek at the first two rows of each LotArea group.
df2 %>%
    filter(var == 'LotArea', row_number() <= 2)

var,tran,value_normed
LotArea,x,-0.21639955
LotArea,x,-0.06909653
LotArea,log,-0.10174374
LotArea,log,0.14940974
LotArea,sqrt,-0.21081495
LotArea,sqrt,0.02444502
LotArea,invcube,-0.18204458
LotArea,invcube,0.06700854


In [47]:
# Frequency table: k is the number of observations sharing each distinct
# standardised value within a (var, tran) series, sorted ascending by value.
df3 <- df2 %>%
    group_by(var, tran, value_normed) %>%
    summarise(k = n()) %>%
    arrange(var, tran, value_normed)

filter(df3, var == 'YrSold')

var,tran,value_normed,k
YrSold,invcube,-1.3633378,619
YrSold,invcube,-0.6027227,691
YrSold,invcube,0.1576398,621
YrSold,invcube,0.9177499,647
YrSold,invcube,1.6776078,339
YrSold,log,-1.363451,619
YrSold,log,-0.602662,691
YrSold,log,0.1577483,621
YrSold,log,0.9177802,647
YrSold,log,1.677434,339


In [51]:
# Compare each (var, tran) series against the standard normal distribution.
#   empirical   - cumulative share of observations up to this value
#   theoretical - standard normal CDF at the same standardised value
#   diff_L2     - squared gap between the two, weighted by the count k
# The cumulative sum is only meaningful per series and in ascending value
# order, so both are stated here explicitly instead of relying on whatever
# grouping and ordering df3 happens to carry over from earlier steps.
df4 <- df3 %>%
    arrange(var, tran, value_normed) %>%
    group_by(var, tran) %>%
    mutate(
        empirical = cumsum(k) / sum(k),
        theoretical = pnorm(value_normed),
        diff_L2 = k * (empirical - theoretical)^2
    )

df4 %>% filter(var == 'YrSold')

var,tran,value_normed,k,empirical,theoretical,diff_L2
YrSold,invcube,-1.3633378,619,0.2122043,0.08638803,9.7986077
YrSold,invcube,-0.6027227,691,0.4490915,0.27334658,21.3424242
YrSold,invcube,0.1576398,621,0.6619815,0.56262967,6.1297564
YrSold,invcube,0.9177499,647,0.8837847,0.82062508,2.5809725
YrSold,invcube,1.6776078,339,1.0,0.95328815,0.7396969
YrSold,log,-1.363451,619,0.2122043,0.08637021,9.8013845
YrSold,log,-0.602662,691,0.4490915,0.2733668,21.3375152
YrSold,log,0.1577483,621,0.6619815,0.56267243,6.1244811
YrSold,log,0.9177802,647,0.8837847,0.82063301,2.5803244
YrSold,log,1.677434,339,1.0,0.95327118,0.7402346


In [53]:
# Collapse to one score per (var, tran): the sum of the weighted squared
# gaps computed above.
df5 <- summarise(
    group_by(df4, var, tran),
    L2_distance = sum(diff_L2)
)

filter(df5, var == 'LotArea')

var,tran,L2_distance
LotArea,invcube,10.78862
LotArea,log,13.31376
LotArea,sqrt,13.39006
LotArea,x,50.03278


In [58]:
# For every variable pick the transform with the smallest L2_distance and
# express its gain over the untransformed series (tran == 'x') as a
# percentage. Each result is a one-row tibble stored in the list column
# best_tran.
df6 <- df5 %>%
    group_by(var) %>%
    nest() %>%
    mutate(
        best_tran = map(data, function(df) {
            # Row with the lowest distance wins.
            best <- df %>% arrange(L2_distance) %>% head(1)
            # Baseline: the raw, untransformed values.
            vanilla <- df %>% filter(tran == 'x')
            progress_score <-
                100 * (vanilla$L2_distance - best$L2_distance) / vanilla$L2_distance
            # tibble() replaces the deprecated data_frame() constructor.
            tibble(tran = best$tran, progress_score = progress_score)
        })
    ) %>%
    select(-data)

df6 %>% filter(var == 'LotArea')

var,best_tran
LotArea,"invcube , 78.436890821096"


In [61]:
# Final configuration: expand the per-variable winner, keep only variables
# whose best option is an actual transform (not the raw 'x'), order by
# progress_score descending, and attach the transform function from fn_df.
transformation_config <- df6 %>%
    unnest(best_tran) %>%
    filter(tran != 'x') %>%
    arrange(desc(progress_score)) %>%
    inner_join(fn_df, by = c(tran = "tran_name"))

transformation_config

var,tran,progress_score,tran_defin
GrLivArea,log,91.15794157,"function (x) , log(x + 1)"
X1stFlrSF,log,90.77165686,"function (x) , log(x + 1)"
BsmtUnfSF,sqrt,79.9760307,"function (x) , sqrt(x)"
LotArea,invcube,78.43689082,"function (x) , x^(1/3)"
TotRmsAbvGrd,log,30.68501506,"function (x) , log(x + 1)"
OverallQual,log,29.94435652,"function (x) , log(x + 1)"
GarageCars,log,24.95676474,"function (x) , log(x + 1)"
BedroomAbvGr,log,22.37508601,"function (x) , log(x + 1)"
OverallCond,log,16.24723935,"function (x) , log(x + 1)"
FullBath,invcube,8.79733251,"function (x) , x^(1/3)"


In [167]:
# Preview of applying the chosen transforms: reshape the configured columns
# to long form, attach each variable's transform function, keep only the
# first two joined rows, apply the function to each value, and list the
# original and transformed values under a `tran` label.
combined_dataset %>%
    select(one_of(transformation_config$var)) %>%
    gather(var, value) %>%
    filter(!is.na(value)) %>%
    inner_join(select(transformation_config, var, tran, tran_defin), by = 'var') %>%
    head(2) %>%
    mutate(value_transformed = map2_dbl(tran_defin, value, function(fn, val) fn(val))) %>%
    select(var, value, value_transformed) %>%
    gather(tran, value, -var) %>%
    mutate(tran = ifelse(tran == 'value', 'original', 'transformed'))

var,tran,value
GrLivArea,original,1710.0
GrLivArea,original,1262.0
GrLivArea,transformed,7.444833
GrLivArea,transformed,7.141245
