In [51]:
# Switch to the project directory so the relative source() call below is
# resolved against it (main.R is presumably located there).
setwd(file.path(Sys.getenv('R_SOURCES'), 'house_prices'))
source('main.R')
house_prices$helpers$import_libs()

# Build the modelling dataset: load the data, run it through the project's
# remove_outliers and fix_all steps, then replace SalePrice with its natural
# log (price_log). Id is dropped together with the raw SalePrice column.
combined_dataset <-
    house_prices$helpers$load_data() %>%
    (house_prices$outliers$remove_outliers) %>%
    (house_prices$missing$fix_all) %>%
    mutate(price_log = log(SalePrice)) %>%
    select(-SalePrice, -Id)

# Only the rows whose dataSource is 'train'.
training_dataset <-
    combined_dataset %>%
    filter(dataSource == 'train')

# SalePrice no longer exists at this point; the column left out of the NA
# check is price_log, which is presumably NA for the non-training rows.
test_that("should be no NA values except price_log column", {
    predictors_only <- combined_dataset %>% select(-price_log)
    expect_equal(sum(is.na(predictors_only)), 0)
})

# The training subset must be complete in every column, price_log included.
test_that("should be no NA values in training_dataset", {
    expect_equal(sum(is.na(training_dataset)), 0)
})

In [52]:
# Short aliases for project functions used by the cells below.
# Note: these capture the functions as they exist right now; they are not
# refreshed by a later source('main.R').

# Rating-based transforms for categorical variables.
rating_transform <- house_prices$trans$categ$rating_transform
rating_transform_for_selected <- house_prices$trans$categ$rating_transform_for_selected
rating_transform_for_selected2 <- house_prices$trans$categ$rating_transform_for_selected2
rating_transform_for_selected3 <- house_prices$trans$categ$rating_transform_for_selected3

# Rating calculation.
calc_rating_for_selected <- house_prices$trans$categ$calc_rating_for_selected
calc_rating_for_all <- house_prices$trans$categ$calc_rating_for_all

# Column-name helper.
get_character_colnames <- house_prices$helpers$get_character_colnames

In [53]:
# Re-evaluate main.R before running the comparison.
source('main.R')

# Time the rating calculation when the columns are passed explicitly
# (the columns returned by get_character_colnames()).
system.time(
    rating_for_selected <- calc_rating_for_selected(
        df = combined_dataset,
        categ_vars_for_fix = get_character_colnames(combined_dataset),
        target_var = price_log
    )
)

# Time the variant that is given only the dataset and the target.
system.time(
    ratings_for_all <- calc_rating_for_all(combined_dataset, price_log)
)

# Both calls are expected to produce the same ratings.
test_that("should be equal ratings", {
    expect_equal(ratings_for_all, rating_for_selected)
})

# Show the first rows of the result.
head(ratings_for_all)

   user  system elapsed 
  0.227   0.000   0.227 

   user  system elapsed 
  0.231   0.000   0.231 

var,value,rating
Alley,_none_,2.531822
Alley,Grvl,1.5
Alley,Pave,2.536585
BldgType,1Fam,2.564039
BldgType,2fmCon,1.580645
BldgType,Duplex,1.692308


In [54]:
# Re-evaluate main.R before running the comparison.
source('main.R')

# Ratings are computed once up front so that the first timing below covers
# only the transformation itself.
ratings <- calc_rating_for_all(combined_dataset, price_log)

# Transform with explicitly supplied columns and precomputed ratings.
system.time(
    df1 <- rating_transform_for_selected(
        data = combined_dataset,
        columns = house_prices$helpers$get_character_colnames(combined_dataset),
        ratings = ratings
    )
)

# Transform via the entry point that takes only the dataset and the target.
system.time(
    df2 <- rating_transform(combined_dataset, price_log)
)

# The two routes are expected to yield the same data frame.
test_that("should be equal", {
    expect_equal(df1, df2)
})

   user  system elapsed 
  0.067   0.000   0.067 

   user  system elapsed 
  0.291   0.004   0.295 

In [55]:
# Re-evaluate main.R before running the comparison.
source('main.R')

# Shared input for all three variants below.
ratings <- calc_rating_for_all(combined_dataset, price_log)

# Variant 1: used as the reference result.
system.time(
    df1 <- rating_transform_for_selected(
        data = combined_dataset,
        columns = get_character_colnames(combined_dataset),
        ratings = ratings
    )
)

# Variant 2: same arguments, alternative implementation.
system.time(
    df2 <- rating_transform_for_selected2(
        data = combined_dataset,
        columns = get_character_colnames(combined_dataset),
        ratings = ratings
    )
)

# Variant 3: same arguments, alternative implementation.
system.time(
    df3 <- rating_transform_for_selected3(
        data = combined_dataset,
        columns = get_character_colnames(combined_dataset),
        ratings = ratings
    )
)

# Variant 2 must reproduce the reference result.
test_that("should be equal", {
    expect_equal(df1, df2)
})

# Variant 3 must reproduce the reference result.
test_that("should be equal", {
    expect_equal(df1, df3)
})

   user  system elapsed 
  0.061   0.008   0.069 

   user  system elapsed 
   0.89    0.00    0.89 

   user  system elapsed 
  0.617   0.000   0.617 

In [69]:
# Re-evaluate main.R, then inspect get_transformation_config.
source('main.R')
# Bare top-level expression: its value is auto-printed by the session
# (presumably a function, so its definition would be shown — confirm).
house_prices$trans$numeric$get_transformation_config