In [6]:
# Environment setup: switch to the project directory named by the R_SOURCES
# environment variable, load the project definitions from main.R and call the
# project's import_libs helper (presumably attaches dplyr, testthat, etc. —
# confirm in main.R).
# NOTE(review): setwd() changes the working directory for the whole session;
# the relative source('main.R') calls in later cells depend on it.
setwd(file.path(Sys.getenv('R_SOURCES'), 'house_prices'))
source('main.R')
house_prices$helpers$import_libs()

# Build the working dataset: run the project's load_data, remove_outliers and
# fix_all steps in order, then replace SalePrice with its natural logarithm
# (price_log) and drop the Id column.
combined_dataset <-
    house_prices$helpers$load_data() %>%
    (house_prices$outliers$remove_outliers) %>%
    (house_prices$missing$fix_all) %>%
    mutate(
        price_log = log(SalePrice)
    ) %>%
    select(-SalePrice, -Id)

# Show the dimensions of the resulting dataset.
combined_dataset %>% dim

# price_log is the only column excluded from the NA check; SalePrice itself no
# longer exists at this point.
# NOTE(review): presumably price_log is NA for rows without a known sale price
# (e.g. test rows) — confirm against load_data.
test_that("should be no NA values except price_log column", {
    expect_equal(sum(is.na(combined_dataset %>% select(-price_log))), 0)
})

# Short alias for the project helper used by the cells below.
get_character_colnames <- house_prices$helpers$get_character_colnames

In [8]:
# ---- Global quantiles ----
# Reload the project definitions, then evaluate the project's calc_quantiles
# helper on the price_log column at probabilities 0.25, 0.5 and 0.75.
source("main.R")

QuantileRating <- house_prices$trans$categ$QuantileRating

QuantileRating$calc_quantiles(
    sample = combined_dataset$price_log,
    probs = c(0.25, 0.5, 0.75)
)

In [22]:
# ---- Quantile ratings calculation ----
# Compute the ratings two ways — the fully spelled-out
# QuantileRating$calc_ratings call and the shorter TranCateg$calc_ratings
# call — and assert that both produce the same result.
source("main.R")

TranCateg <- house_prices$trans$categ
QuantileRating <- TranCateg$QuantileRating

# Explicit form: quantiles of price_log and the character columns are passed in.
ratings1 <- QuantileRating$calc_ratings(
    df = combined_dataset,
    target_var = price_log,
    rating_quantiles = QuantileRating$calc_quantiles(
        sample = combined_dataset$price_log,
        probs = c(0.25, 0.5, 0.75)
    ),
    categ_vars = get_character_colnames(combined_dataset)
)

# Short form: only the dataset and the target column are supplied.
ratings2 <- TranCateg$calc_ratings(combined_dataset, price_log)

test_that("should be equal ratings", {
    expect_equal(ratings1, ratings2)
})

In [19]:
# First 9 rows of the ratings after sorting by var, then rating.
head(arrange(ratings1, var, rating), 9)

var,value,rating
Alley,Grvl,1.5
Alley,_none_,2.531822
Alley,Pave,2.536585
BldgType,2fmCon,1.580645
BldgType,Duplex,1.692308
BldgType,Twnhs,1.883721
BldgType,1Fam,2.564039
BldgType,TwnhsE,2.622807
BsmtCond,Po,1.0


In [21]:
# Last 9 rows of the ratings after sorting by var, then rating.
tail(arrange(ratings1, var, rating), 9)

var,value,rating
SaleType,WD,2.421468
SaleType,CWD,3.0
SaleType,New,3.583333
SaleType,Con,4.0
Street,Grvl,1.833333
Street,Pave,2.499311
Utilities,NoSeWa,2.0
Utilities,AllPub,2.496911
,,2.5


In [24]:
# ---- Quantile ratings transformation ----
# Transform the dataset two ways — rating_transform_for_selected with
# explicitly supplied columns and ratings, and the shorter rating_transform
# call — then assert that both give the same data frame.
source("main.R")

TranCateg <- house_prices$trans$categ

ratings <- TranCateg$calc_ratings(combined_dataset, price_log)

# Explicit form: character columns and precomputed ratings are passed in.
df1 <- TranCateg$Tran$rating_transform_for_selected(
    data = combined_dataset,
    columns = get_character_colnames(combined_dataset),
    ratings = ratings
)

# Short form: only the dataset and the target column are supplied.
df2 <- TranCateg$rating_transform(combined_dataset, price_log)

test_that("should be equal", {
    expect_equal(df1, df2)
})

# Preview the first 5 transformed rows.
head(df1, 5)

Alley,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,CentralAir,Condition1,Condition2,⋯,WoodDeckSF,OpenPorchSF,EnclosedPorch,X3SsnPorch,ScreenPorch,PoolArea,MiscVal,YrSold,dataSource,price_log
2.531822,2.564039,2.540871,2.320042,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,⋯,0,61,0,0,0,0,0,2008,train,12.24769
2.531822,2.564039,2.540871,3.295455,2.231818,2.553429,3.066343,2.57887,1.9875,2.50277,⋯,298,0,0,0,0,0,0,2007,train,12.10901
2.531822,2.564039,2.540871,2.675439,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,⋯,0,42,0,0,0,0,0,2008,train,12.31717
2.531822,2.564039,3.046154,2.320042,2.231818,2.553429,1.847458,2.57887,2.547619,2.50277,⋯,0,35,272,0,0,0,0,2006,train,11.8494
2.531822,2.564039,2.540871,2.900452,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,⋯,192,84,0,0,0,0,0,2008,train,12.42922


In [5]:
#
# Timing different implementations
#
# Times three implementations of rating_transform_for_selected on the same
# inputs and then checks that they all return equal results.
source('main.R')

TranCateg <- house_prices$trans$categ

rating_transform_for_selected <- TranCateg$Tran$rating_transform_for_selected
rating_transform_for_selected2 <- TranCateg$Tran$rating_transform_for_selected2
rating_transform_for_selected3 <- TranCateg$Tran$rating_transform_for_selected3

ratings <- TranCateg$calc_ratings(combined_dataset, price_log)

# Resolve the column list once, outside the timed regions, so that each
# system.time() measures only the transform call itself.
categ_columns <- get_character_colnames(combined_dataset)

system.time({
    df1 <- rating_transform_for_selected(
        data = combined_dataset,
        columns = categ_columns,
        ratings = ratings
    )
})

system.time({
    df2 <- rating_transform_for_selected2(
        data = combined_dataset,
        columns = categ_columns,
        ratings = ratings
    )
})

system.time({
    df3 <- rating_transform_for_selected3(
        data = combined_dataset,
        columns = categ_columns,
        ratings = ratings
    )
})

# Distinct descriptions so a failure identifies which pair of implementations
# disagrees.
test_that("implementation 2 should be equal to implementation 1", {
    expect_equal(df1, df2)
})

test_that("implementation 3 should be equal to implementation 2", {
    expect_equal(df2, df3)
})

   user  system elapsed 
  0.072   0.008   0.080 

   user  system elapsed 
  0.867   0.000   0.868 

   user  system elapsed 
  0.658   0.000   0.658 