### Set up env

In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# load data
data = pd.read_csv('data\melb_data.csv')
melb_predictors = data.drop(['Price'], axis=1)
# drop these non-numerical columns
X = melb_predictors.select_dtypes(exclude=['object'])
y = data.Price
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.8,test_size=0.2)

def score_dataset(train_X, test_X, train_y, test_y):
    model = RandomForestRegressor(random_state=0)
    model.fit(train_X,train_y)
    p_val = model.predict(test_X)
    mae = mean_absolute_error(test_y,p_val)
    return mae

### Apporach 1: Drop columns with missing values

In [19]:
missing_cols = [col for col in train_X.columns if train_X[col].isnull().any()]
reduced_train_X = train_X.drop(missing_cols,axis=1)
reduced_test_X = test_X.drop(missing_cols,axis=1)
score_dataset(reduced_train_X, reduced_test_X, train_y, test_y)

181465.20151319634

### Approach 2: Imputation

In [22]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer()
imputed_train_X = pd.DataFrame(imputer.fit_transform(train_X))
imputed_test_X = pd.DataFrame(imputer.fit_transform(test_X))

# Imputation removed column names; put them back
imputed_train_X.columns = train_X.columns
imputed_test_X.columns = test_X.columns

score_dataset(imputed_train_X,imputed_test_X,train_y,test_y)

178646.7845846483

### Approach 3: An Extension to Imputation

In [23]:
train_X_backup = train_X.copy()
test_X_backup = test_X.copy()

for col in missing_cols:
    train_X_backup[col+'_missing'] = train_X_backup[col].isnull()
    test_X_backup[col+'_missing'] = test_X_backup[col].isnull()

imputer = SimpleImputer()
imputed_train_X = pd.DataFrame(imputer.fit_transform(train_X_backup))
imputed_test_X = pd.DataFrame(imputer.fit_transform(test_X_backup))

# Imputation removed column names; put them back
imputed_train_X.columns = train_X_backup.columns
imputed_test_X.columns = test_X_backup.columns

score_dataset(imputed_train_X,imputed_test_X,train_y,test_y)

177862.38795076797

### Print the missing rows info

In [31]:
print(train_X.shape)
missing_val_count_by_column = train_X.isnull().sum()
print(missing_val_count_by_column[missing_val_count_by_column>0])

(10864, 12)
Car               48
BuildingArea    5215
YearBuilt       4349
dtype: int64
