### Transforming categorical variable to numeric

A custom "rating transformation" is applied. The algorithm is:

1. Calculate the global distribution of the target variable (`price_log`).
2. Split it into 4 ranges, from low to high, and assign a rating to each range: 1, 2, 3, 4.
3. For a given categorical attribute, find the marginal distribution of the target for each of its values.
4. For each value, calculate the mean rating with respect to its marginal distribution.

In [51]:
# Switch to the project directory so the relative path "main.R" below resolves.
setwd(file.path(Sys.getenv("R_SOURCES"), "house_prices"))

# The value of the last expression in main.R is kept as the project module.
house_prices <- source("main.R", local = TRUE)$value

# Load -> drop outliers -> fix missing values, then replace SalePrice with its
# natural log and drop the Id column.
combined_dataset <-
    house_prices$missing$fix_all(
        house_prices$outliers$remove_outliers(
            house_prices$helpers$load_data()
        )
    ) %>%
    mutate(price_log = log(SalePrice)) %>%
    select(-c(SalePrice, Id))

print(dim(combined_dataset))

# Only price_log is allowed to contain NA after the cleaning steps above.
test_that("should be no NA values except `price_log` column", {
    expect_equal(sum(is.na(select(combined_dataset, -price_log))), 0)
})

# Short aliases for helpers used by the following cells.
get_character_colnames <- house_prices$helpers$get_character_colnames
showtable <- house_prices$helpers$showtable

[1] 2917   81


In [52]:
#
# Global quantiles of the target (price_log) at 25%, 50% and 75%
#
QuantileRating <- house_prices$trans$categ$QuantileRating

print(
    QuantileRating$calc_quantiles(
        probs = c(0.25, 0.5, 0.75),
        sample = combined_dataset[["price_log"]]
    )
)

     25%      50%      75% 
11.77471 12.00151 12.27373 


In [54]:
#
# Quantile ratings calculation:
# explicit QuantileRating call vs. the TranCateg$calc_ratings shortcut
#
TranCateg <- house_prices$trans$categ
QuantileRating <- house_prices$trans$categ$QuantileRating

# Explicit form: every input is spelled out.
ratings1 <- QuantileRating$calc_ratings(
    df = combined_dataset,
    target_var = price_log,
    categ_vars = get_character_colnames(combined_dataset),
    rating_quantiles = QuantileRating$calc_quantiles(
        probs = c(0.25, 0.5, 0.75),
        sample = combined_dataset[["price_log"]]
    )
)

# Shortcut form: only the data and the target column are passed.
ratings2 <- TranCateg$calc_ratings(combined_dataset, price_log)

# Both forms are expected to give the same ratings table.
test_that("should be equal ratings", {
    expect_equal(ratings1, ratings2)
})

# Show the first 30 ratings, ordered by variable and then by rating.
showtable(
    head(arrange(ratings1, var, rating), n = 30),
    cols = 4
)

var,value,rating
Alley,Grvl,1.5
Alley,_none_,2.531822
Alley,Pave,2.536585
BldgType,2fmCon,1.580645
BldgType,Duplex,1.692308
BldgType,Twnhs,1.883721
BldgType,1Fam,2.564039
BldgType,TwnhsE,2.622807
BsmtCond,Po,1.0
BsmtCond,_none_,1.216216


In [10]:
#
# Quantile ratings transformation:
# explicit rating_transform_for_selected call vs. the rating_transform shortcut
#
TranCateg <- house_prices$trans$categ

ratings <- TranCateg$calc_ratings(combined_dataset, price_log)

# Explicit form: ratings and the character columns are passed in by hand.
df1 <- TranCateg$Tran$rating_transform_for_selected(
    ratings = ratings,
    columns = get_character_colnames(combined_dataset),
    data = combined_dataset
)

# Shortcut form: only the data and the target column are passed.
df2 <- TranCateg$rating_transform(combined_dataset, price_log)

# Both forms are expected to produce the same data frame.
test_that("should be equal", {
    expect_equal(df1, df2)
})

head(df1, n = 5)

Alley,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,CentralAir,Condition1,Condition2,⋯,WoodDeckSF,OpenPorchSF,EnclosedPorch,X3SsnPorch,ScreenPorch,PoolArea,MiscVal,YrSold,dataSource,price_log
2.531822,2.564039,2.540871,2.320042,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,⋯,0,61,0,0,0,0,0,2008,train,12.24769
2.531822,2.564039,2.540871,3.295455,2.231818,2.553429,3.066343,2.57887,1.9875,2.50277,⋯,298,0,0,0,0,0,0,2007,train,12.10901
2.531822,2.564039,2.540871,2.675439,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,⋯,0,42,0,0,0,0,0,2008,train,12.31717
2.531822,2.564039,3.046154,2.320042,2.231818,2.553429,1.847458,2.57887,2.547619,2.50277,⋯,0,35,272,0,0,0,0,2006,train,11.8494
2.531822,2.564039,2.540871,2.900452,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,⋯,192,84,0,0,0,0,0,2008,train,12.42922


In [11]:
# Time one calc_ratings run; the result itself is not used afterwards.
system.time(
    devnull <- TranCateg$calc_ratings(combined_dataset, price_log)
)

   user  system elapsed 
  0.149   0.000   0.148 

In [12]:
#
# Timing different implementations
#
# Reload the project and rebind `house_prices`, the same way the setup cell
# does. A bare `source('main.R')` throws away the value it returns, so the
# aliases below could be read from a stale `house_prices` object.
house_prices <- source('main.R', local = TRUE)$value

TranCateg <- house_prices$trans$categ

# The three implementations being compared.
rating_transform_for_selected <- TranCateg$Tran$rating_transform_for_selected
rating_transform_for_selected2 <- TranCateg$Tran$rating_transform_for_selected2
rating_transform_for_selected3 <- TranCateg$Tran$rating_transform_for_selected3

# Ratings are computed once, outside the timed regions, and shared by all runs.
ratings <- TranCateg$calc_ratings(combined_dataset, price_log)

# Implementation 1
system.time({
    df1 <- rating_transform_for_selected(
        data = combined_dataset,
        columns = get_character_colnames(combined_dataset),
        ratings = ratings
    )
})

# Implementation 2
system.time({
    df2 <- rating_transform_for_selected2(
        data = combined_dataset,
        columns = get_character_colnames(combined_dataset),
        ratings = ratings
    )
})

# Implementation 3
system.time({
    df3 <- rating_transform_for_selected3(
        data = combined_dataset,
        columns = get_character_colnames(combined_dataset),
        ratings = ratings
    )
})

# The timings are only comparable if all implementations agree on the result:
# df1 == df2 and df2 == df3.
test_that("should be equal", {
    expect_equal(df1, df2)
})

test_that("should be equal", {
    expect_equal(df2, df3)
})

   user  system elapsed 
  0.060   0.004   0.064 

   user  system elapsed 
  0.775   0.000   0.775 

   user  system elapsed 
  0.608   0.000   0.608 