In [31]:
# Show where the kernel currently is.
getwd()
# First run only: remember the directory the notebook started in and move the
# working directory one level up. On later runs this_notebook_dir is already
# set, so the working directory is left alone.
if (is.null(environment()[["this_notebook_dir"]])) {
    this_notebook_dir <- getwd()
    setwd(file.path(this_notebook_dir, ".."))
}
# Show the working directory that the following cells will use.
getwd()

In [33]:
# Project helper scripts, resolved relative to the current working directory.
# NOTE(review): kaggle.house is presumably defined by one of these files.
source('helpers.R')
source('validation_utils.R')

kaggle.house$loadLibraries()
data <- kaggle.house$loadData()

# Keep the two splits and their combination under separate names.
df.training <- data$train
df.testing  <- data$test
df.combined <- kaggle.house$getCombinedDataset(df.training, df.testing)

# Sanity check: number of rows, then number of columns, of each data set
# (training, testing, combined).
vapply(list(df.training, df.testing, df.combined), nrow, integer(1))
vapply(list(df.training, df.testing, df.combined), length, integer(1))

In [70]:
# Registry of missing-value fixers: a named list with one entry per column
# name. It starts out empty.
fix_missing <- list()

# Build a fixer for one column and store it in a registry.
#
# container: named list acting as the registry.
# col_name:  name of the column; also used as the key in the registry.
# fixer:     factory called as fixer(col_name, ...); whatever it returns is
#            stored under col_name.
# ...:       extra arguments forwarded to the factory.
#
# Returns the registry with the entry for col_name added or replaced. The
# list that was passed in is not modified; callers must keep the result.
register_missing_fixer <- function (container, col_name, fixer, ...) {
    built_fixer <- fixer(col_name, ...)
    container[[col_name]] <- built_fixer
    container
}

# Shorthand for register_missing_fixer() with `container` pre-filled with
# fix_missing, so call sites only pass the column name, the fixer factory and
# any extra factory arguments.
# NOTE(review): the `fix_missing <- register(...)` call sites only accumulate
# entries if fix_missing is looked up again on every call rather than captured
# once here -- confirm this holds for the installed purrr version.
register <- purrr::partial(
    register_missing_fixer,
    container = fix_missing
)

In [35]:
# Fixer factory: returns a function(df) that replaces every NA in column
# `col_name` with `value` and returns the modified data frame.
#
# col_name: name of the column to repair (single string).
# value:    replacement written into the rows where the column is NA.
#
# The returned function stops with an error when `df` has no such column.
replace_na_with_value <- function (col_name, value) {

    # Evaluate both arguments now. Otherwise the closure keeps lazy promises
    # and a factory call made inside a for loop would see the loop variable's
    # final value when the fixer is eventually run.
    force(col_name)
    force(value)

    function (df) {
        if (!(col_name %in% names(df))) {
            stop("column '", col_name, "' not found in data frame",
                 call. = FALSE)
        }

        # df[[col_name]] is the full column vector for both tibbles and base
        # data frames, so the NA mask always has one entry per row.
        df[is.na(df[[col_name]]), col_name] <- value
        df
    }
}

# Fixer factory for numeric columns: replace_na_with_value() with the
# replacement value fixed to 0, so only the column name has to be supplied.
replace_na_with_zero <- purrr::partial(
    replace_na_with_value,
    value = 0
)

In [98]:
#fix_missing$BsmtFinSF1 <- replace_na_with_zero("BsmtFinSF1")
#fix_missing$BsmtFinSF2 <- replace_na_with_zero("BsmtFinSF2")
#fix_missing$BsmtFullBath <- replace_na_with_zero("BsmtFullBath")
#fix_missing$BsmtHalfBath <- replace_na_with_zero("BsmtHalfBath")
#fix_missing$BsmtUnfSF <- replace_na_with_zero("BsmtUnfSF")
# fix_missing$Electrical <- replace_na_with_value("Electrical", 'SBrkr')
# fix_missing$Exterior1st <- replace_na_with_value("Exterior1st", 'VinylSd')
# fix_missing$Exterior2nd <- replace_na_with_value("Exterior2nd", 'VinylSd')
# fix_missing$Functional <- replace_na_with_value("Functional", 'Typ')
# fix_missing$GarageArea <- replace_na_with_zero("GarageArea")
# fix_missing$GarageYrBlt <- replace_na_with_zero("GarageYrBlt")
# fix_missing$KitchenQual <- replace_na_with_value("KitchenQual", 'TA')
# fix_missing$LotFrontage <- replace_na_with_zero("LotFrontage")
# fix_missing$MasVnrArea <- replace_na_with_zero("MasVnrArea")

In [130]:
# Register one missing-value fixer per column. Each call returns the registry
# with one more entry, so the result has to be assigned back to fix_missing.
#
# Numeric columns are filled with 0; categorical columns are filled with the
# literal given as the third argument.
fix_missing <- register("BsmtFinSF1",   replace_na_with_zero)
fix_missing <- register("BsmtFinSF2",   replace_na_with_zero)
fix_missing <- register("BsmtFullBath", replace_na_with_zero)
fix_missing <- register("BsmtHalfBath", replace_na_with_zero)
fix_missing <- register("BsmtUnfSF",    replace_na_with_zero)
fix_missing <- register("Electrical",   replace_na_with_value, 'SBrkr')
fix_missing <- register("Exterior1st",  replace_na_with_value, 'VinylSd')
fix_missing <- register("Exterior2nd",  replace_na_with_value, 'VinylSd')
fix_missing <- register("Functional",   replace_na_with_value, 'Typ')
# GarageArea was listed in the earlier hand-written assignments but had been
# replaced here by a second, redundant BsmtUnfSF registration.
fix_missing <- register("GarageArea",   replace_na_with_zero)
fix_missing <- register("GarageYrBlt",  replace_na_with_zero)
fix_missing <- register("KitchenQual",  replace_na_with_value, 'TA')
fix_missing <- register("LotFrontage",  replace_na_with_zero)
# Generic zero fill for MasVnrArea; it is replaced by the dedicated
# fix_missing$MasVnrArea fixer assigned later in this cell.
fix_missing <- register("MasVnrArea",   replace_na_with_zero)

# Fixer for MasVnrType: after it runs the column has no NA and agrees with
# MasVnrArea wherever the area is known.
#
# df: data frame with columns MasVnrType and MasVnrArea.
#
# Rules, applied in order:
#   1. type missing, area known and > 0  -> 'BrkFace'
#   2. area known and == 0, type not 'None' (or missing) -> 'None'
#   3. any type still missing -> 'None'
#
# Returns the modified data frame.
fix_missing$MasVnrType <- function (df) {

    area <- df$MasVnrArea
    has_veneer <- !is.na(area) & area > 0
    no_veneer  <- !is.na(area) & area == 0

    df[is.na(df$MasVnrType) & has_veneer, "MasVnrType"] <- 'BrkFace'

    # %in% never returns NA, so a missing type cannot put an NA into the row
    # mask (subscripted assignment rejects NA row indices). Rows with a
    # missing type and zero area end up as 'None' either way.
    df[no_veneer & !(df$MasVnrType %in% 'None'), "MasVnrType"] <- 'None'

    df[is.na(df$MasVnrType), "MasVnrType"] <- 'None'

    df
}

# Fixer for MasVnrArea: after it runs the column has no NA, and rows whose
# MasVnrType is 'None' have an area of 0.
#
# df: data frame with columns MasVnrType and MasVnrArea.
#
# Returns the modified data frame.
fix_missing$MasVnrArea <- function (df) {

    area <- df$MasVnrArea

    # %in% treats a missing MasVnrType as "not 'None'" instead of yielding NA,
    # so this also works on data whose MasVnrType has not been repaired yet.
    # An NA in the row mask would make the assignment fail.
    df[df$MasVnrType %in% 'None' & !is.na(area) & area > 0, "MasVnrArea"] <- 0

    df[is.na(df$MasVnrArea), "MasVnrArea"] <- 0

    df
}

In [141]:
library(testthat)

# After running the type fixer followed by the area fixer on the combined
# data, neither column may contain NA and the two columns must agree.
test_that("MasVnrType & MasVnrArea", {
    fixed <- fix_missing$MasVnrArea(fix_missing$MasVnrType(df.combined))

    missing_type   <- filter(fixed, is.na(MasVnrType))
    missing_area   <- filter(fixed, is.na(MasVnrArea))
    typed_but_zero <- filter(fixed, MasVnrArea == 0 & MasVnrType != 'None')
    none_but_area  <- filter(fixed, MasVnrType == 'None' & MasVnrArea > 0)

    expect_equal(nrow(missing_type), 0)
    expect_equal(nrow(missing_area), 0)
    expect_equal(nrow(typed_but_zero), 0)
    expect_equal(nrow(none_but_area), 0)
})

In [119]:
# Rows of df that claim a veneer type although the veneer area is zero.
df %>%
    filter(MasVnrArea == 0, MasVnrType != 'None') %>%
    select(MasVnrArea, MasVnrType)

MasVnrArea,MasVnrType
0,BrkFace
0,Stone
0,BrkFace


In [102]:
#df.combined %>% (fix_missing$KitchenQual) %>% group_by(KitchenQual) %>% count
# Frequency of every MasVnrType value in the combined data.
df.combined %>%
    group_by(MasVnrType) %>%
    count()


MasVnrType,n
BrkCmn,25
BrkFace,879
,1742
Stone,249
,24


In [97]:
# MasVnrType of the rows whose type is missing although a positive veneer
# area is recorded.
df.combined[
    with(df.combined, is.na(MasVnrType) & !is.na(MasVnrArea) & MasVnrArea > 0),
    "MasVnrType"
]

MasVnrType
""


In [99]:
# Columns of the combined data that have no registered fixer, alphabetically.
sort(setdiff(names(df.combined), names(fix_missing)))

In [293]:
# Veneer area of the rows whose type is missing but whose area is known.
df.combined %>%
    filter(is.na(MasVnrType), !is.na(MasVnrArea)) %>%
    select(MasVnrArea)

MasVnrArea
198


In [83]:
# Number of rows in the combined data with a missing veneer area.
df.combined %>%
    filter(is.na(MasVnrArea)) %>%
    select(MasVnrArea, MasVnrType) %>%
    nrow()

In [108]:
# Work on a copy named df.
df <- df.combined

# Veneer areas of the rows typed 'None' that still record a positive area.
# A row with a missing MasVnrType makes the comparison NA, which shows up as
# an extra all-NA row in the result.
df[
    with(df, MasVnrType == 'None' & !is.na(MasVnrArea) & MasVnrArea > 0),
    "MasVnrArea"
]

MasVnrArea
288.0
1.0
1.0
344.0
312.0
285.0
1.0
""


In [100]:
# Rows typed 'None' that nevertheless record a positive veneer area.
df.combined %>%
    filter(MasVnrType == 'None', MasVnrArea > 0) %>%
    select(MasVnrArea, MasVnrType)

MasVnrArea,MasVnrType
288,
1,
1,
344,
312,
285,
1,
