In [39]:
# Switch to the project directory located under $R_SOURCES so that relative
# paths used afterwards (e.g. source('main.R')) resolve.
# Sys.getenv() returns "" for an unset variable, which would silently point
# at "/house_prices"; fail with an explicit message instead.
if (!nzchar(Sys.getenv('R_SOURCES'))) {
    stop('Environment variable R_SOURCES is not set', call. = FALSE)
}
setwd(file.path(Sys.getenv('R_SOURCES'), 'house_prices'))
getwd()

In [59]:
# Load the project code; the `house_prices` object used below is expected to
# come from main.R.
source('main.R')
house_prices$helpers$import_libs()

combined_dataset <- house_prices$helpers$load_data()

# Apply the three missing-value fixes in sequence: categorical
# "most common" replacement, categorical "valid" fix, numeric zero fill.
combined_dataset_fixed <-
    house_prices$missing$categ$replace_with_most_common(combined_dataset)
combined_dataset_fixed <-
    house_prices$missing$categ$fix_valid(combined_dataset_fixed)
combined_dataset_fixed <-
    house_prices$missing$numeric$replace_with_zero(combined_dataset_fixed)

# Keep only the rows whose dataSource is 'train'.
training_dataset <- filter(combined_dataset_fixed, dataSource == 'train')


# Sanity check: training_dataset must not contain a single NA cell.
test_that("should be no NA values in training_dataset", {
    expect_equal(object = sum(is.na(training_dataset)), expected = 0)
})

In [61]:
# Character-typed predictors of the training rows plus the log of SalePrice.
categ_data <- training_dataset %>%
    # character columns and the target, without the dataSource marker
    select(
        house_prices$helpers$get_character_colnames(combined_dataset_fixed),
        SalePrice,
        -dataSource
    ) %>%
    # remove the categorical variables listed as discarded
    select(-one_of(house_prices$attributes_selection$discared_cat_vars)) %>%
    # add the log price, then drop the raw price (assigning NULL removes it)
    mutate(price_log = log(SalePrice), SalePrice = NULL)

sort(colnames(categ_data))

In [103]:
# Quartiles (25%, 50%, 75%) of price_log over all rows of categ_data,
# stored as a one-row table with columns q25, q50, q75.
global_quantiles <- summarise(
    select(categ_data, price_log),
    q25 = quantile(price_log, probs = 0.25),
    q50 = quantile(price_log, probs = 0.5),
    q75 = quantile(price_log, probs = 0.75)
)

global_quantiles

q25,q50,q75
11.7751,12.00151,12.27373


In [120]:
# Rate every (variable, level) pair on a 1..4 scale: the empirical CDF of the
# level's price_log is evaluated at the global quartiles, and the probability
# mass falling into each of the four quartile bands is weighted by the band
# number (1 = lowest band, 4 = highest).
xxx <- categ_data %>%
    gather(var, value, -price_log) %>%
    group_by(var, value) %>%
    nest() %>%
    mutate(
        rating = map_dbl(data, function (level_rows) {
            level_cdf <- ecdf(level_rows$price_log)
            at_q25 <- level_cdf(global_quantiles$q25)
            at_q50 <- level_cdf(global_quantiles$q50)
            at_q75 <- level_cdf(global_quantiles$q75)
            at_q25 * 1 + (at_q50 - at_q25) * 2 + (at_q75 - at_q50) * 3 + (1 - at_q75) * 4
        })
    ) %>%
    # expand the nested rows again so each price_log row carries its rating
    unnest()

In [131]:
# One row per (variable, level, rating): number of rows and the mean / median
# of price_log, ordered by rating within each variable.
yyy <- xxx %>%
    group_by(var, value, rating) %>%
    summarise(n = n(), mean = mean(price_log), median = median(price_log)) %>%
    arrange(var, rating)

head(yyy)

var,value,rating,n,mean,median
Alley,Grvl,1.5,50,11.67335,11.69106
Alley,_none_,2.531775,1369,12.03768,12.0137
Alley,Pave,2.536585,41,11.99681,12.05815
BldgType,2fmCon,1.580645,31,11.72535,11.75587
BldgType,Duplex,1.692308,52,11.78092,11.82026
BldgType,Twnhs,1.883721,43,11.77391,11.83138


In [143]:
# Lookup table (variable, level, rating) without the summary statistics;
# grouping left over from the summarise step is removed first.
config <- select(ungroup(yyy), var, value, rating)

head(config, 10)

var,value,rating
Alley,Grvl,1.5
Alley,_none_,2.531775
Alley,Pave,2.536585
BldgType,2fmCon,1.580645
BldgType,Duplex,1.692308
BldgType,Twnhs,1.883721
BldgType,1Fam,2.563934
BldgType,TwnhsE,2.622807
BsmtCond,Po,1.0
BsmtCond,_none_,1.216216


In [170]:
# Ratings of the Alley levels only.
tmp <- select(filter(config, var == 'Alley'), value, rating)

# Named list: level name -> rating.
mapping <- setNames(as.list(tmp$rating), tmp$value)

mapping
mapping[['Grvl']]

In [199]:
# Trial run of the level -> rating substitution on the Alley column only.
z <- categ_data[['Alley']]

# map_dbl() returns a double vector, so Alley is a plain numeric column;
# map() would have produced a list-column. A level absent from `mapping`
# yields NULL from `[[`, which map_dbl() rejects with an error instead of
# storing it silently.
df <- tibble(Alley = map_dbl(z, ~mapping[[.]]))

df %>% head

Alley
2.531775
2.531775
2.531775
2.531775
2.531775
2.531775


In [202]:
# Replace every categorical level in categ_data by its rating from `config`,
# one column at a time, then re-attach price_log and show the first rows.
categ_data %>% select(-price_log) %>% colnames %>%
map_dfc(function (col) {

    # ratings of the levels that belong to this variable
    tmp <-
        config %>%
        filter(var == col) %>%
        select(value, rating)

    # named list: level name -> rating
    mapping <- structure(as.list(tmp$rating), names = as.list(tmp$value))

    # map_dbl() yields a numeric column rather than the list-column that
    # map() would give; a level absent from `mapping` (NULL lookup) raises an
    # error instead of being stored silently. `[[col]]` extracts the column
    # as a vector for tibbles and base data frames alike.
    df <- tibble(
        rating = map_dbl(categ_data[[col]], ~mapping[[.]])
    )
    names(df) <- col
    df
}) %>%
mutate(
    price_log = categ_data$price_log
) %>%
head

Alley,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtQual,CentralAir,Electrical,Exterior1st,Exterior2nd,⋯,MiscFeature,MSSubClass,MSZoning,Neighborhood,PavedDrive,PoolQC,RoofStyle,SaleCondition,SaleType,price_log
2.531775,2.563934,2.540809,2.320042,3.258373,3.066343,2.578755,2.59176,3.046602,3.05754,⋯,2.512091,3.448161,2.665508,2.966667,2.584328,2.492085,2.418054,2.451586,2.421468,12.24769
2.531775,2.563934,2.540809,3.283582,2.231818,3.066343,2.578755,2.59176,1.968182,1.96729,⋯,2.512091,2.522388,2.665508,3.454545,2.584328,2.492085,2.418054,2.451586,2.421468,12.10901
2.531775,2.563934,2.540809,2.675439,3.258373,3.066343,2.578755,2.59176,3.046602,3.05754,⋯,2.512091,3.448161,2.665508,2.966667,2.584328,2.492085,2.418054,2.451586,2.421468,12.31717
2.531775,2.563934,3.046154,2.320042,2.231818,1.847458,2.578755,2.59176,1.980583,2.052632,⋯,2.512091,2.366667,2.665508,3.156863,2.584328,2.492085,2.418054,1.920792,2.421468,11.8494
2.531775,2.563934,2.540809,2.900452,3.258373,3.066343,2.578755,2.59176,3.046602,3.05754,⋯,2.512091,3.448161,2.665508,3.97561,2.584328,2.492085,2.418054,2.451586,2.421468,12.42922
2.531775,2.563934,2.540809,2.320042,3.258373,3.066343,2.578755,2.59176,3.046602,3.05754,⋯,2.061224,1.833333,2.665508,2.265306,2.584328,2.492085,2.418054,2.451586,2.421468,11.8706
