In [1]:
# Make IPython display the value of every top-level expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load the rpy2 IPython extension (R magics).
# NOTE(review): no R cell is visible in this section -- confirm it is still needed.
%load_ext rpy2.ipython

In [4]:
import numpy as np
import pandas as pd

from kaggle.house_prices import helpers
from kaggle.house_prices import missing

# Load the data set through the project helper. The cells below treat NAs in
# `SalePrice` as expected, so this is presumably train and test rows
# combined -- TODO confirm against helpers.load_data.
combined_dataset = helpers.load_data()
# Bare expression: displays the (rows, columns) shape as the cell output.
combined_dataset.shape

(2919, 82)

In [28]:
# Checks for missing.replace_with_most_common
#
fix_by_most_common = missing.replace_with_most_common(combined_dataset)

# Numeric columns must come through unchanged.
numeric_after = fix_by_most_common.select_dtypes(include=[np.number])
numeric_before = combined_dataset.select_dtypes(include=[np.number])
pd.testing.assert_frame_equal(
    numeric_after.sort_index(axis=1),
    numeric_before.sort_index(axis=1),
)

# Columns listed in missing.colums_with_valid_na (NA is a legitimate value
# there) must come through unchanged as well.
valid_na_after = fix_by_most_common[missing.colums_with_valid_na]
valid_na_before = combined_dataset[missing.colums_with_valid_na]
pd.testing.assert_frame_equal(
    valid_na_after.sort_index(axis=1),
    valid_na_before.sort_index(axis=1),
)

# The processed columns are the character columns that are NOT in
# missing.colums_with_valid_na; none of them may still contain an NA.
character_columns = set(helpers.get_character_colnames(combined_dataset))
valid_na_columns = set(missing.colums_with_valid_na)
affected_columns = list(character_columns - valid_na_columns)
remaining_na_count = fix_by_most_common[affected_columns].isnull().values.sum()
assert remaining_na_count == 0

In [29]:
# Checks for missing.fix_valid
#
fix_valid = missing.fix_valid(combined_dataset)

# Numeric columns must come through unchanged.
numeric_after = fix_valid.select_dtypes(include=[np.number])
numeric_before = combined_dataset.select_dtypes(include=[np.number])
pd.testing.assert_frame_equal(
    numeric_after.sort_index(axis=1),
    numeric_before.sort_index(axis=1),
)

# Character columns outside missing.colums_with_valid_na (the ones whose NAs
# are `bad`) are not this function's job and must come through unchanged.
character_columns = set(helpers.get_character_colnames(combined_dataset))
valid_na_columns = set(missing.colums_with_valid_na)
bad_columns = list(character_columns - valid_na_columns)
bad_after = fix_valid[bad_columns]
bad_before = combined_dataset[bad_columns]
pd.testing.assert_frame_equal(
    bad_after.sort_index(axis=1),
    bad_before.sort_index(axis=1),
)

# The processed columns (missing.colums_with_valid_na) may no longer contain
# any NA.
remaining_na_count = fix_valid[missing.colums_with_valid_na].isnull().values.sum()
assert remaining_na_count == 0

In [30]:
# Checks for missing.replace_with_zero
#
fix_by_zero = missing.replace_with_zero(combined_dataset)

# Non-numeric (categorical) columns must come through unchanged.
categorical_after = fix_by_zero.select_dtypes(exclude=[np.number])
categorical_before = combined_dataset.select_dtypes(exclude=[np.number])
pd.testing.assert_frame_equal(
    categorical_after.sort_index(axis=1),
    categorical_before.sort_index(axis=1),
)

# SalePrice must come through unchanged, including its NAs.
sale_price_after = fix_by_zero['SalePrice']
sale_price_before = combined_dataset['SalePrice']
pd.testing.assert_series_equal(sale_price_after, sale_price_before)

# Every other numeric column may no longer contain any NA.
numeric_without_target = (
    fix_by_zero
    .select_dtypes(include=[np.number])
    .drop(['SalePrice'], axis=1)
)
remaining_na_count = numeric_without_target.isnull().values.sum()
assert remaining_na_count == 0

In [31]:
# Checks for missing.fix_all (all fixes applied together)
#
fixed = missing.fix_all(combined_dataset)

# Apart from the SalePrice column, no NA may be left anywhere.
features_only = fixed.drop(['SalePrice'], axis=1)
remaining_na_count = features_only.isnull().values.sum()
assert remaining_na_count == 0

# SalePrice must come through unchanged, including its NAs.
sale_price_after = fixed['SalePrice']
sale_price_before = combined_dataset['SalePrice']
pd.testing.assert_series_equal(sale_price_after, sale_price_before)