In [2]:
# Switch to the project directory found under the R_SOURCES environment
# variable, load the project code and read the dataset used by every check.
setwd(
    paste0(Sys.getenv('R_SOURCES'), '/house_prices')
)
source('main.R')
house_prices$helpers$import_libs()
combined_dataset <- house_prices$helpers$load_data()
# Display the dataset dimensions as the cell output.
dim(combined_dataset)

In [3]:
# Checks for house_prices$missing$categ$replace_with_most_common
#
# Run the function once on the full dataset; the tests that follow
# inspect this result.
fix_by_most_common <-
    house_prices$missing$categ$replace_with_most_common(combined_dataset)

test_that("numeric columns should not be affected", {
    # Numeric columns before and after the replacement must match.
    numeric_before <- select_if(combined_dataset, is.numeric)
    numeric_after <- select_if(fix_by_most_common, is.numeric)

    expect_equal(numeric_before, numeric_after)
})

test_that("categ columns with `good` NAs should not be affected", {
    # Columns listed in colums_with_valid_na must come through unchanged.
    valid_na_columns <- house_prices$missing$colums_with_valid_na

    valid_before <- select(combined_dataset, one_of(valid_na_columns))
    valid_after <- select(fix_by_most_common, one_of(valid_na_columns))

    expect_equal(valid_before, valid_after)
})

test_that("there should not be NAs in processed columns", {
    # Processed columns: names returned by get_character_colnames that are
    # not in the valid-NA list.
    character_columns <-
        house_prices$helpers$get_character_colnames(combined_dataset)
    affected_columns <-
        setdiff(character_columns, house_prices$missing$colums_with_valid_na)

    na_count <- fix_by_most_common %>%
        select(one_of(affected_columns)) %>%
        is.na() %>%
        sum()

    expect_equal(na_count, 0)
})

In [4]:
# Checks for house_prices$missing$categ$fix_valid
#
# Run the function once on the full dataset; the tests that follow
# inspect this result.
fix_valid <- house_prices$missing$categ$fix_valid(combined_dataset)

test_that("numeric columns should not be affected", {
    # Numeric columns before and after fix_valid must match.
    numeric_before <- select_if(combined_dataset, is.numeric)
    numeric_after <- select_if(fix_valid, is.numeric)

    expect_equal(numeric_before, numeric_after)
})

test_that("categ columns with `bad` NAs should not be affected", {
    # "Bad" columns: names returned by get_character_colnames that are not
    # in the valid-NA list.
    character_columns <-
        house_prices$helpers$get_character_colnames(combined_dataset)
    bad_columns <-
        setdiff(character_columns, house_prices$missing$colums_with_valid_na)

    bad_before <- select(combined_dataset, one_of(bad_columns))
    bad_after <- select(fix_valid, one_of(bad_columns))

    expect_equal(bad_before, bad_after)
})

test_that("there should not be NAs in processed columns", {
    # Count the NAs remaining in the valid-NA columns after fix_valid.
    na_count <- fix_valid %>%
        select(one_of(house_prices$missing$colums_with_valid_na)) %>%
        is.na() %>%
        sum()

    expect_equal(na_count, 0)
})

In [5]:
# Checks for house_prices$missing$numeric$replace_with_zero
#
# Run the function once on the full dataset; the tests that follow
# inspect this result.
fix_by_zero <- house_prices$missing$numeric$replace_with_zero(combined_dataset)

test_that("categ columns should not be affected", {
    # Character columns before and after the replacement must match.
    character_before <- select_if(combined_dataset, is.character)
    character_after <- select_if(fix_by_zero, is.character)

    expect_equal(character_before, character_after)
})

test_that("SalePrice should not be affected", {
    # Compare SalePrice as one-column tables before and after the replacement.
    price_before <- combined_dataset['SalePrice']
    price_after <- fix_by_zero['SalePrice']

    expect_equal(price_before, price_after)
})

test_that("there should not be NAs in processed columns", {
    # Count the NAs left in the numeric columns, excluding SalePrice.
    na_count <- fix_by_zero %>%
        select_if(is.numeric) %>%
        select(-SalePrice) %>%
        is.na() %>%
        sum()

    expect_equal(na_count, 0)
})

In [7]:
# Checks for house_prices$missing$fix_all (all fixes together)
#
# main.R is sourced again before running fix_all on the full dataset.
source('main.R')
fixed <- combined_dataset %>% (house_prices$missing$fix_all)

test_that("should be no NA values except SalePrice column", {
    # Count the NAs left in every column other than SalePrice.
    na_count <- fixed %>%
        select(-SalePrice) %>%
        is.na() %>%
        sum()

    expect_equal(na_count, 0)
})

test_that("SalePrice should not be affected", {
    # This cell exercises fix_all, so the comparison must use `fixed`.
    # Comparing against `fix_by_zero` only repeated the replace_with_zero
    # check and left the fix_all output unverified.
    expect_equal(
        combined_dataset['SalePrice'],
        fixed['SalePrice']
    )
})