In [1]:
# Show the working directory before any change.
getwd()
# `this_notebook_dir` is only unset the first time this cell runs, so the
# switch to the parent directory happens once even if the cell is re-run.
if (is.null(environment()[["this_notebook_dir"]])) {
    this_notebook_dir <- getwd()
    setwd(file.path(this_notebook_dir, ".."))
}
# Show the working directory after the (possible) change.
getwd()

In [3]:
# Project helper scripts; `kaggle.house` used below is presumably defined
# by one of them (not visible here).
source("helpers.R")
source("validation_utils.R")

kaggle.house$loadLibraries()
data <- kaggle.house$loadData()

# Keep the two raw sets plus the combined one produced by the helper.
df.training <- data$train
df.testing <- data$test
df.combined <- kaggle.house$getCombinedDataset(df.training, df.testing)

# Row counts, then column counts, of training / testing / combined.
vapply(list(df.training, df.testing, df.combined), nrow, integer(1))
vapply(list(df.training, df.testing, df.combined), length, integer(1))

In [17]:
# Add a fixer for one column to a registry and return the updated registry.
#
# container: list of fixers keyed by column name.
# col_name:  column the fixer is responsible for; also used as the list key.
# fixer:     factory called as fixer(col_name, ...); its result is stored.
# ...:       extra arguments forwarded to `fixer`.
#
# The input list is not modified in place; callers must keep the return value.
register_missing_fixer <- function (container, col_name, fixer, ...) {
    built_fixer <- fixer(col_name, ...)
    container[[col_name]] <- built_fixer
    container
}

# Registry of fixers, keyed by the name of the column each one repairs.
fix_missing <- list()

# Shorthand for register_missing_fixer() bound to the registry above.
# `fix_missing` is looked up each time `register` is called, so the current
# registry is extended; the result must be assigned back to `fix_missing`.
register <- function (col_name, fixer, ...) {
    register_missing_fixer(fix_missing, col_name, fixer, ...)
}

In [18]:
# Build a fixer that replaces the missing entries of one column by a constant.
#
# col_name: name of the column to repair (character scalar).
# value:    replacement written into every row where the column is NA.
#
# Returns a function (df) -> df. The returned function stops with an error
# if `col_name` is not a column of `df`; all other columns are left untouched.
replace_na_with_value <- function (col_name, value) {
    # Evaluate the arguments now so the closure below does not depend on
    # promises that are resolved only when the fixer is eventually applied.
    force(col_name)
    force(value)

    function (df) {
        if (!(col_name %in% names(df))) {
            stop("Column not found: ", col_name, call. = FALSE)
        }

        # `[[` returns the full column vector for tibbles and base data
        # frames alike, so the mask always has one entry per row.
        missing_rows <- is.na(df[[col_name]])
        df[missing_rows, col_name] <- value
        df
    }
}

# Specialisation of replace_na_with_value() whose replacement is always 0.
replace_na_with_zero <- function (col_name) {
    replace_na_with_value(col_name, value = 0)
}

In [19]:
# Columns whose missing values are filled with a single constant: 0 via
# replace_na_with_zero, or the given literal via replace_na_with_value.
# Each call returns a new registry, hence the repeated re-assignment.
fix_missing <- register("BsmtFinSF1",   replace_na_with_zero)
fix_missing <- register("BsmtFinSF2",   replace_na_with_zero)
fix_missing <- register("BsmtFullBath", replace_na_with_zero)
fix_missing <- register("BsmtHalfBath", replace_na_with_zero)
fix_missing <- register("BsmtUnfSF",    replace_na_with_zero)
fix_missing <- register("Electrical",   replace_na_with_value, 'SBrkr')
fix_missing <- register("Exterior1st",  replace_na_with_value, 'VinylSd')
fix_missing <- register("Exterior2nd",  replace_na_with_value, 'VinylSd')
fix_missing <- register("Functional",   replace_na_with_value, 'Typ')
fix_missing <- register("GarageYrBlt",  replace_na_with_zero)
# NOTE(review): "BsmtUnfSF" was registered a second time at this position;
# the duplicate was a no-op and is dropped. Possibly another column was
# meant here - confirm against the list of columns still lacking a fixer.
fix_missing <- register("KitchenQual",  replace_na_with_value, 'TA')
fix_missing <- register("LotFrontage",  replace_na_with_zero)

# Fixer for MasVnrArea.
#
# df: data frame with columns MasVnrArea and MasVnrType.
#
# Returns `df` where
#   * rows typed 'None' that carry a positive area get area 0, and
#   * missing areas are set to 0.
# Rows whose MasVnrType is NA are not touched by the first rule, so this
# fixer can run before or after the MasVnrType fixer.
fix_missing$MasVnrArea <- function (df) {
    has_area <- !is.na(df$MasVnrArea)
    # `%in%` never yields NA, unlike `== 'None'` on a missing type. An NA in
    # the row mask would make the data-frame assignment below fail.
    is_none <- df$MasVnrType %in% "None"

    df[is_none & has_area & df$MasVnrArea > 0, "MasVnrArea"] <- 0

    df[is.na(df$MasVnrArea), "MasVnrArea"] <- 0

    df
}

# Fixer for MasVnrType.
#
# df: data frame with columns MasVnrType and MasVnrArea.
#
# Returns `df` where, in this order,
#   1. a missing type with a known positive area becomes 'BrkFace'
#      (hard-coded default),
#   2. any type other than 'None' with a known area of 0 becomes 'None',
#   3. every type that is still missing becomes 'None'.
fix_missing$MasVnrType <- function (df) {
    has_area <- !is.na(df$MasVnrArea)

    df[is.na(df$MasVnrType) & has_area & df$MasVnrArea > 0,
       "MasVnrType"] <- "BrkFace"

    # Computed after step 1 so it sees the types assigned there. `%in%`
    # never yields NA, so a row with a missing type and area 0 selects TRUE
    # here instead of putting an NA into the row mask (which would make the
    # assignment fail).
    not_none <- !(df$MasVnrType %in% "None")
    df[has_area & df$MasVnrArea == 0 & not_none, "MasVnrType"] <- "None"

    df[is.na(df$MasVnrType), "MasVnrType"] <- "None"

    df
}

# Constant-fill fixers: a missing MSZoning becomes 'RL', a missing SaleType
# becomes 'Oth'.
fix_missing <- register("MSZoning", replace_na_with_value, value = "RL")
fix_missing <- register("SaleType", replace_na_with_value, value = "Oth")

# Fixer for TotalBsmtSF.
#
# df: data frame with columns TotalBsmtSF and BsmtCond.
#
# Returns `df` with TotalBsmtSF set to 0 in the rows where both TotalBsmtSF
# and BsmtCond are missing. A missing TotalBsmtSF in a row that has a
# BsmtCond value is left as NA.
fix_missing$TotalBsmtSF <- function (df) {
    both_missing <- is.na(df$TotalBsmtSF) & is.na(df$BsmtCond)
    df[both_missing, "TotalBsmtSF"] <- 0
    df
}

In [15]:
test_that("BsmtFinSF1", {
    fixed <- fix_missing$BsmtFinSF1(df.combined)

    # After the fixer ran, no row may still have a missing BsmtFinSF1.
    still_missing <- filter(fixed, is.na(BsmtFinSF1))
    expect_equal(nrow(still_missing), 0)
})

test_that("MasVnrType & MasVnrArea", {
    # The type fixer is applied first, then the area fixer on its result.
    fixed <- fix_missing$MasVnrArea(fix_missing$MasVnrType(df.combined))

    # Neither column may contain missing values any more.
    expect_equal(nrow(filter(fixed, is.na(MasVnrType))), 0)
    expect_equal(nrow(filter(fixed, is.na(MasVnrArea))), 0)

    # Type and area must agree: area 0 <=> type 'None'.
    zero_area_but_typed <- filter(fixed, MasVnrArea == 0 & MasVnrType != 'None')
    expect_equal(nrow(zero_area_but_typed), 0)

    none_but_has_area <- filter(fixed, MasVnrType == 'None' & MasVnrArea > 0)
    expect_equal(nrow(none_but_has_area), 0)
})

test_that("MSZoning", {
    fixed <- fix_missing$MSZoning(df.combined)

    # After the fixer ran, no row may still have a missing MSZoning.
    still_missing <- filter(fixed, is.na(MSZoning))
    expect_equal(nrow(still_missing), 0)
})

test_that("TotalBsmtSF", {
    fixed <- fix_missing$TotalBsmtSF(df.combined)

    # No missing TotalBsmtSF may remain.
    still_missing <- filter(fixed, is.na(TotalBsmtSF))
    expect_equal(nrow(still_missing), 0)

    # A zero TotalBsmtSF must not coexist with a recorded BsmtCond.
    zero_with_cond <- filter(fixed, TotalBsmtSF == 0 & !is.na(BsmtCond))
    expect_equal(nrow(zero_with_cond), 0)
})

In [20]:
# Columns of the combined dataset that have no registered fixer, sorted.
sort(setdiff(names(df.combined), names(fix_missing)))