In [2]:
# Work from the project directory so the relative path passed to source()
# resolves; R_SOURCES must point at the directory containing house_prices/.
setwd(file.path(Sys.getenv("R_SOURCES"), "house_prices"))
source("main.R")
house_prices$helpers$import_libs()

# Load the data, run the project's outlier-removal and missing-value steps,
# then swap SalePrice for its natural log (price_log) and drop the Id column.
combined_dataset <-
    house_prices$helpers$load_data() %>%
    (house_prices$outliers$remove_outliers) %>%
    (house_prices$missing$fix_all) %>%
    mutate(price_log = log(SalePrice)) %>%
    select(-SalePrice, -Id)

dim(combined_dataset)

# SalePrice no longer exists at this point: the column left out of the NA check
# is price_log, so the test description names that column.
# NOTE(review): price_log is presumably NA for rows without a sale price
# (e.g. a test split) -- confirm against load_data().
test_that("should be no NA values except price_log column", {
    feature_columns <- combined_dataset %>% select(-price_log)
    expect_equal(sum(is.na(feature_columns)), 0)
})

In [38]:
# Short alias for the project's QuantileRating helpers.
QuantileRating <- house_prices$trans$categ$QuantileRating

# Build the ratings table for the columns returned by get_character_colnames(),
# with price_log as the target. The 0.25 / 0.5 / 0.75 quantiles of price_log
# (as computed by calc_quantiles) are passed in as rating_quantiles.
ratings <- QuantileRating$calc_ratings(
    df = combined_dataset,
    target_var = price_log,
    rating_quantiles = QuantileRating$calc_quantiles(
        sample = combined_dataset$price_log,
        probs = c(0.25, 0.5, 0.75)
    ),
    categ_vars = house_prices$helpers$get_character_colnames(combined_dataset)
)

# Peek at the first seven (var, value, rating) rows.
head(ratings, 7)

var,value,rating
Alley,_none_,2.531822
Alley,Grvl,1.5
Alley,Pave,2.536585
BldgType,1Fam,2.564039
BldgType,2fmCon,1.580645
BldgType,Duplex,1.692308
BldgType,TwnhsE,2.622807


In [13]:
# Rating held by the row(s) of `ratings` whose `var` is NA.
# NOTE(review): presumably calc_ratings() emits such a row as a fallback
# rating -- confirm; if it does not, this is a zero-length vector.
default_rating <- ratings$rating[is.na(ratings$var)]
default_rating

In [16]:
# Keep only the columns named by get_character_colnames() and stack them into
# long form: one row per (var, value) pair.
step1 <- gather(
    select(
        combined_dataset,
        house_prices$helpers$get_character_colnames(combined_dataset)
    ),
    var,
    value
)

head(step1)

var,value
Alley,_none_
Alley,_none_
Alley,_none_
Alley,_none_
Alley,_none_
Alley,_none_


In [17]:
# Look up the rating of every (var, value) pair; pairs that have no match in
# `ratings` come back with rating = NA.
step2 <- left_join(step1, ratings, by = c("var", "value"))

head(step2)

var,value,rating
Alley,_none_,2.531822
Alley,_none_,2.531822
Alley,_none_,2.531822
Alley,_none_,2.531822
Alley,_none_,2.531822
Alley,_none_,2.531822


In [31]:
# Rows of step2 for which the join found no rating.
filter(step2, is.na(rating))

# Ratings that do exist for MSSubClass and Utilities, sorted for easy comparison
# with the unmatched values above.
ratings %>%
    filter(var %in% c("MSSubClass", "Utilities")) %>%
    arrange(var, value)

var,value,rating
MSSubClass,150,
Utilities,_none_,
Utilities,_none_,


var,value,rating
MSSubClass,120,2.954023
MSSubClass,160,1.904762
MSSubClass,180,1.3
MSSubClass,190,1.6
MSSubClass,20,2.522388
MSSubClass,30,1.086957
MSSubClass,40,2.25
MSSubClass,45,1.25
MSSubClass,50,1.833333
MSSubClass,60,3.454545


In [32]:
# Open the help page for replace_na().
help("replace_na")

In [35]:
# Redo the join from step1, then fill every rating left NA by the join with
# default_rating.
step2 <- replace_na(
    left_join(step1, ratings, by = c("var", "value")),
    list(rating = default_rating)
)

head(step2)

# Count of ratings still missing after the fill.
sum(is.na(step2$rating))

var,value,rating
Alley,_none_,2.531822
Alley,_none_,2.531822
Alley,_none_,2.531822
Alley,_none_,2.531822
Alley,_none_,2.531822
Alley,_none_,2.531822


In [39]:
# Go back to wide form with one rating column per variable. Rows are numbered
# within each `var` so spread() has a key to line the values up on; that helper
# index is dropped once the table is wide.
step3 <-
    step2 %>%
    select(var, rating) %>%
    group_by(var) %>%
    mutate(id = seq_len(n())) %>%
    spread(key = var, value = rating) %>%
    select(-id)

head(step3)

Alley,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,CentralAir,Condition1,Condition2,⋯,MSZoning,Neighborhood,PavedDrive,PoolQC,RoofMatl,RoofStyle,SaleCondition,SaleType,Street,Utilities
2.531822,2.564039,2.540871,2.320042,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,⋯,2.665796,2.966667,2.584454,2.491736,2.483601,2.418054,2.451586,2.421468,2.499311,2.496911
2.531822,2.564039,2.540871,3.295455,2.231818,2.553429,3.066343,2.57887,1.9875,2.50277,⋯,2.665796,3.454545,2.584454,2.491736,2.483601,2.418054,2.451586,2.421468,2.499311,2.496911
2.531822,2.564039,2.540871,2.675439,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,⋯,2.665796,2.966667,2.584454,2.491736,2.483601,2.418054,2.451586,2.421468,2.499311,2.496911
2.531822,2.564039,3.046154,2.320042,2.231818,2.553429,1.847458,2.57887,2.547619,2.50277,⋯,2.665796,3.156863,2.584454,2.491736,2.483601,2.418054,1.920792,2.421468,2.499311,2.496911
2.531822,2.564039,2.540871,2.900452,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,⋯,2.665796,3.97561,2.584454,2.491736,2.483601,2.418054,2.451586,2.421468,2.499311,2.496911
2.531822,2.564039,2.540871,2.320042,3.262019,2.553429,3.066343,2.57887,2.547619,2.50277,⋯,2.665796,2.265306,2.584454,2.491736,2.483601,2.418054,2.451586,2.421468,2.499311,2.496911
