In [1]:
# Point the session at the house_prices project directory, which lives under
# the directory named by the R_SOURCES environment variable, then echo the
# resulting working directory.
sources_root <- Sys.getenv('R_SOURCES')
# Sys.getenv() yields "" for an unset variable; without this guard the target
# would silently become '/house_prices' and fail with an unhelpful message.
if (!nzchar(sources_root)) {
    stop("Environment variable R_SOURCES is not set", call. = FALSE)
}
setwd(file.path(sources_root, 'house_prices'))
getwd()

In [67]:
# Load the project code; the cells below access it through `house_prices`.
source('main.R')
# NOTE(review): presumably attaches the packages used below (%>%, mutate,
# select, filter, test_that) -- confirm against main.R.
house_prices$helpers$import_libs()

# Build the dataset used by the later cells: load the raw data, run the
# outlier and missing-value steps in order, then swap SalePrice for its
# natural log and drop the Id column.
combined_dataset <-
    house_prices$helpers$load_data() %>%
    (house_prices$outliers$remove_outliers) %>%
    (house_prices$missing$categ$replace_with_most_common) %>%
    (house_prices$missing$categ$fix_valid) %>%
    (house_prices$missing$numeric$replace_with_zero) %>%
    mutate(
        price_log = log(SalePrice)
    ) %>%
    select(-SalePrice, -Id)

# Keep only the rows flagged as training data.
training_dataset <-
    combined_dataset %>%
    filter(dataSource == 'train')


# SalePrice no longer exists at this point (it was replaced by price_log
# above), so price_log is the one column exempt from the NA check.
# NOTE(review): presumably the non-training rows carry no SalePrice, which is
# why price_log may be NA there -- confirm against load_data().
test_that("should be no NA values except price_log column", {
    expect_equal(sum(is.na(combined_dataset %>% select(-price_log))), 0)
})

# Within the training rows every column, price_log included, must be complete.
test_that("should be no NA values in training_dataset", {
    expect_equal(sum(is.na(training_dataset)), 0)
})

In [31]:
# Re-source the project code before running the comparison.
source('main.R')

# Display the column names the helper reports for combined_dataset.
house_prices$helpers$get_character_colnames(combined_dataset)

# Variant 1: ratings computed for an explicitly supplied list of columns.
start_time <- Sys.time()
rating_for_selected <-
    house_prices$trans$categ$calc_rating_for_selected(
        df = combined_dataset,
        categ_vars_for_fix = house_prices$helpers$get_character_colnames(combined_dataset),
        target_var = price_log
    )
end_time <- Sys.time()
# Elapsed wall-clock time for variant 1.
difftime(end_time, start_time)

# Variant 2: ratings computed by the "for all" entry point.
start_time <- Sys.time()
ratings_for_all <-
    house_prices$trans$categ$calc_rating_for_all(combined_dataset, price_log)
end_time <- Sys.time()
# Elapsed wall-clock time for variant 2.
difftime(end_time, start_time)

# Both variants are expected to yield the same ratings table.
test_that("should be equal ratings", {
    expect_equal(ratings_for_all, rating_for_selected)
})

# Preview the first rows of the ratings table.
head(ratings_for_all)

Time difference of 0.2551982 secs

Time difference of 0.3786795 secs

var,value,rating
Alley,_none_,2.531822
Alley,Grvl,1.5
Alley,Pave,2.536585
BldgType,1Fam,2.564039
BldgType,2fmCon,1.580645
BldgType,Duplex,1.692308


In [263]:
# Re-source the project code before running the comparison.
source('main.R')

# Ratings table that the column-list transform below takes as input.
ratings <-
    house_prices$trans$categ$calc_rating_for_all(combined_dataset, price_log)

# Variant 1: transform an explicit column list using the precomputed ratings.
start_time <- Sys.time()
df1 <-
    house_prices$trans$categ$rating_transform_for_selected(
        data = combined_dataset,
        columns = house_prices$helpers$get_character_colnames(combined_dataset),
        ratings = ratings
    )
end_time <- Sys.time()
# Elapsed wall-clock time for variant 1.
difftime(end_time, start_time)

# Variant 2: the single-call transform that takes only the data and target.
start_time <- Sys.time()
df2 <-
    house_prices$trans$categ$rating_transform(
        combined_dataset,
        price_log
    )
end_time <- Sys.time()
# Elapsed wall-clock time for variant 2.
difftime(end_time, start_time)

# The two code paths are expected to produce the same data frame.
test_that("should be equal", {
    expect_equal(df1, df2)
})

Time difference of 0.08403301 secs

Time difference of 0.316828 secs

In [264]:
# Re-source the project code before running the comparison.
source('main.R')

# Ratings table shared by all three transform variants below.
ratings <-
    house_prices$trans$categ$calc_rating_for_all(combined_dataset, price_log)

# Resolve the column list once, outside the timed regions, so each timing
# covers only the transform call and all variants get the identical input.
categ_columns <-
    house_prices$helpers$get_character_colnames(combined_dataset)

# Variant 1: rating_transform_for_selected (treated as the reference result).
start_time <- Sys.time()
df1 <-
    house_prices$trans$categ$rating_transform_for_selected(
        data = combined_dataset,
        columns = categ_columns,
        ratings = ratings
    )
end_time <- Sys.time()
end_time - start_time


# Variant 2: rating_transform_for_selected2.
start_time <- Sys.time()
df2 <-
    house_prices$trans$categ$rating_transform_for_selected2(
        data = combined_dataset,
        columns = categ_columns,
        ratings = ratings
    )
end_time <- Sys.time()
end_time - start_time


# Variant 3: rating_transform_for_selected3.
start_time <- Sys.time()
df3 <-
    house_prices$trans$categ$rating_transform_for_selected3(
        data = combined_dataset,
        columns = categ_columns,
        ratings = ratings
    )
end_time <- Sys.time()
end_time - start_time


# Distinct descriptions so a failure report identifies which variant diverged
# from the reference.
test_that("rating_transform_for_selected2 should equal rating_transform_for_selected", {
    expect_equal(df1, df2)
})

test_that("rating_transform_for_selected3 should equal rating_transform_for_selected", {
    expect_equal(df1, df3)
})

Time difference of 0.07696009 secs

Time difference of 0.9548082 secs

Time difference of 0.6622899 secs