In [16]:
# Notebook setup: switch to the project directory, load the project code and
# build the datasets and aliases that the later cells rely on.
r_sources <- Sys.getenv('R_SOURCES')
if (!nzchar(r_sources)) {
    # An unset variable would otherwise resolve the path to '/house_prices'.
    stop("Environment variable R_SOURCES is not set", call. = FALSE)
}
setwd(file.path(r_sources, 'house_prices'))
source('main.R')
house_prices$helpers$import_libs()

# Loaded data piped through the project's remove_outliers and fix_all helpers;
# SalePrice is replaced by its natural log (price_log) and Id is dropped.
# The parentheses make magrittr evaluate each `$` lookup to a function first.
combined_dataset <-
    house_prices$helpers$load_data() %>%
    (house_prices$outliers$remove_outliers) %>%
    (house_prices$missing$fix_all) %>%
    mutate(
        price_log = log(SalePrice)
    ) %>%
    select(-SalePrice, -Id)

# Rows whose dataSource column equals 'train'.
training_dataset <- 
    combined_dataset %>% 
    filter(dataSource == 'train')

# price_log is left out of this check; presumably it is NA for rows that have
# no SalePrice (non-train rows) -- TODO confirm against load_data().
test_that("should be no NA values except price_log column", {
    expect_equal(sum(is.na(combined_dataset %>% select(-price_log))), 0)
})

test_that("should be no NA values in training_dataset", {
    expect_equal(sum(is.na(training_dataset)), 0)
})

# Short aliases used by the cells below.
get_character_colnames <- house_prices$helpers$get_character_colnames
tran_categ <- house_prices$trans$categ

In [19]:
#
# Quantile ratings calculation
#
# Checks that calc_rating_for_selected, when given every character column,
# returns the same ratings as calc_rating_for_all.
#
source('main.R')

# Re-bind the module alias after re-sourcing: the alias captured in the setup
# cell would otherwise keep pointing at the object that existed before
# source(), if main.R rebuilds house_prices -- NOTE(review): confirm.
tran_categ <- house_prices$trans$categ

calc_rating_for_all <- tran_categ$quantile_rating$calc_rating_for_all
calc_rating_for_selected <- tran_categ$quantile_rating$calc_rating_for_selected

rating_for_selected <- calc_rating_for_selected(
    df = combined_dataset,
    categ_vars_for_fix = get_character_colnames(combined_dataset),
    target_var = price_log
)

ratings_for_all <- calc_rating_for_all(combined_dataset, price_log)

test_that("should be equal ratings", {
    expect_equal(ratings_for_all, rating_for_selected)
})

# Display the first rows of the ratings table (columns: var, value, rating).
ratings_for_all %>% head

var,value,rating
Alley,_none_,2.531822
Alley,Grvl,1.5
Alley,Pave,2.536585
BldgType,1Fam,2.564039
BldgType,2fmCon,1.580645
BldgType,Duplex,1.692308


In [21]:
#
# Quantile ratings transformation
#
# Checks that rating_transform gives the same data frame as
# rating_transform_for_selected applied to every character column with
# ratings from calc_rating_for_all.
#
source('main.R')

# Re-bind the module alias after re-sourcing: the alias captured in the setup
# cell would otherwise keep pointing at the object that existed before
# source(), if main.R rebuilds house_prices -- NOTE(review): confirm.
tran_categ <- house_prices$trans$categ

calc_rating_for_all <- tran_categ$quantile_rating$calc_rating_for_all
rating_transform_for_selected <- tran_categ$methods$rating_transform_for_selected
rating_transform <- tran_categ$rating_transform

ratings <- calc_rating_for_all(combined_dataset, price_log)

# Explicit variant: caller supplies the columns and the precomputed ratings.
df1 <- rating_transform_for_selected(
    data = combined_dataset,
    columns = get_character_colnames(combined_dataset),
    ratings = ratings
)

# Convenience variant: takes only the data and the target column.
df2 <- rating_transform(
    combined_dataset,
    price_log
)

test_that("should be equal", {
    expect_equal(df1, df2)
})

In [24]:
#
# Timing different implementations
#
# Times the three rating_transform_for_selected variants on identical inputs
# and checks that variants 2 and 3 reproduce the result of the first one.
#

source('main.R')

# Re-bind the module alias after re-sourcing: the alias captured in the setup
# cell would otherwise keep pointing at the object that existed before
# source(), if main.R rebuilds house_prices -- NOTE(review): confirm.
tran_categ <- house_prices$trans$categ

# Bound here as well so the cell does not depend on an earlier cell having
# defined it.
calc_rating_for_all <- tran_categ$quantile_rating$calc_rating_for_all
rating_transform_for_selected <- tran_categ$methods$rating_transform_for_selected
rating_transform_for_selected2 <- tran_categ$methods$rating_transform_for_selected2
rating_transform_for_selected3 <- tran_categ$methods$rating_transform_for_selected3

ratings <- calc_rating_for_all(combined_dataset, price_log)

# Computed once, outside the timed regions, so each measurement covers only
# the implementation under test and all three receive the same columns.
character_columns <- get_character_colnames(combined_dataset)

system.time({
    df1 <- rating_transform_for_selected(
        data = combined_dataset,
        columns = character_columns,
        ratings = ratings
    )
})

system.time({
    df2 <- rating_transform_for_selected2(
        data = combined_dataset,
        columns = character_columns,
        ratings = ratings
    )
})

system.time({
    df3 <- rating_transform_for_selected3(
        data = combined_dataset,
        columns = character_columns,
        ratings = ratings
    )
})

# Distinct descriptions so a failure names the implementation that diverged.
test_that("rating_transform_for_selected2 should equal rating_transform_for_selected", {
    expect_equal(df1, df2)
})

test_that("rating_transform_for_selected3 should equal rating_transform_for_selected", {
    expect_equal(df1, df3)
})

   user  system elapsed 
  0.067   0.004   0.071 

   user  system elapsed 
   0.77    0.00    0.77 

   user  system elapsed 
  0.623   0.000   0.623 