In [3]:
# Notebook setup: switch to the project directory, load the project code and
# its libraries, then build the cleaned dataset used by the cells below.
# NOTE(review): assumes the R_SOURCES environment variable points at the
# directory that contains the `house_prices` project -- confirm locally.
setwd(file.path(Sys.getenv('R_SOURCES'), 'house_prices'))
source('main.R')
house_prices$helpers$import_libs()

# Pipeline: load -> remove_outliers -> fix_all -> log-transform the target.
# The project functions are wrapped in parentheses because magrittr needs
# `(a$b)` to treat a `$`-extracted function as the right-hand side of a pipe.
# SalePrice is replaced by its natural log (price_log); Id is dropped as well.
combined_dataset <-
    house_prices$helpers$load_data() %>%
    (house_prices$outliers$remove_outliers) %>%
    (house_prices$missing$fix_all) %>%
    mutate(
        price_log = log(SalePrice)
    ) %>%
    select(-SalePrice, -Id)

combined_dataset %>% dim

# SalePrice no longer exists at this point, so the only column that is allowed
# to contain NA is its replacement, price_log; the description names it so a
# failure report points at a column that is actually present.
test_that("should be no NA values except price_log column", {
    expect_equal(sum(is.na(combined_dataset %>% select(-price_log))), 0)
})

# Short alias for the helper used by the following cells.
get_character_colnames <- house_prices$helpers$get_character_colnames

In [12]:
# ----------------------------------------------------------------------
# Quantile ratings calculation: explicit call vs. the short wrapper call
# ----------------------------------------------------------------------
source("main.R")

TranCateg <- house_prices$trans$categ
QuantileRating <- TranCateg$QuantileRating

# Explicit variant: the quantile cut points (quartiles of price_log) and the
# list of character columns are passed in by hand.
ratings1 <- QuantileRating$calc_ratings(
  df = combined_dataset,
  target_var = price_log,
  categ_vars = get_character_colnames(combined_dataset),
  rating_quantiles = QuantileRating$calc_quantiles(
    sample = combined_dataset$price_log,
    probs = c(0.25, 0.5, 0.75)
  )
)

# Wrapper variant: only the data and the target column are supplied; the test
# below checks that it yields the same ratings as the explicit variant.
ratings2 <- TranCateg$calc_ratings(combined_dataset, price_log)

test_that("should be equal ratings", {
  expect_equal(ratings1, ratings2)
})

# Show the first and last rows of the ratings table.
head(ratings1)
tail(ratings1)

var,value,rating
Alley,_none_,2.531822
Alley,Grvl,1.5
Alley,Pave,2.536585
BldgType,1Fam,2.564039
BldgType,2fmCon,1.580645
BldgType,Duplex,1.692308


var,value,rating
SaleType,Oth,1.333333
Street,Pave,2.499311
Street,Grvl,1.833333
Utilities,AllPub,2.496911
Utilities,NoSeWa,2.0
,,2.5


In [16]:
# ----------------------------------------------------------------------
# Quantile ratings transformation: explicit call vs. the short wrapper
# ----------------------------------------------------------------------
source("main.R")

TranCateg <- house_prices$trans$categ

ratings <- TranCateg$calc_ratings(combined_dataset, price_log)

# Explicit variant: precomputed ratings are applied to the character columns.
df1 <- TranCateg$Tran$rating_transform_for_selected(
  data = combined_dataset,
  ratings = ratings,
  columns = get_character_colnames(combined_dataset)
)

# Wrapper variant: takes only the data and the target column; the test below
# checks that both variants produce the same result.
df2 <- TranCateg$rating_transform(combined_dataset, price_log)

test_that("should be equal", {
  expect_equal(df1, df2)
})

In [19]:
# ----------------------------------------------------------------------
# Timing different implementations of rating_transform_for_selected
# ----------------------------------------------------------------------
source("main.R")

TranCateg <- house_prices$trans$categ

# Local aliases for the three implementations being compared.
rating_transform_for_selected <- TranCateg$Tran$rating_transform_for_selected
rating_transform_for_selected2 <- TranCateg$Tran$rating_transform_for_selected2
rating_transform_for_selected3 <- TranCateg$Tran$rating_transform_for_selected3

ratings <- TranCateg$calc_ratings(combined_dataset, price_log)

# Each implementation is timed once on identical inputs; the column lookup is
# deliberately kept inside every timed expression so all three runs measure
# the same amount of surrounding work.
system.time(
  df1 <- rating_transform_for_selected(
    data = combined_dataset,
    columns = get_character_colnames(combined_dataset),
    ratings = ratings
  )
)

system.time(
  df2 <- rating_transform_for_selected2(
    data = combined_dataset,
    columns = get_character_colnames(combined_dataset),
    ratings = ratings
  )
)

system.time(
  df3 <- rating_transform_for_selected3(
    data = combined_dataset,
    columns = get_character_colnames(combined_dataset),
    ratings = ratings
  )
)

# All three implementations must agree: 1 vs 2, then 2 vs 3.
test_that("should be equal", {
  expect_equal(df1, df2)
})

test_that("should be equal", {
  expect_equal(df2, df3)
})

   user  system elapsed 
  0.066   0.000   0.066 

   user  system elapsed 
  0.771   0.000   0.771 

   user  system elapsed 
  0.599   0.000   0.600 