# Approaches to deal with missing values
> 1. `Drop column`

> 2. `Imputation`

> 3. `Extension to Imputation`

In [18]:
# example usage
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset from melb_data.csv
data = pd.read_csv('melb_data.csv')

# Prediction target: the Price column
y = data['Price']

# Numerical predictors: every column except the target, keeping only the
# columns whose dtype is not `object`
melb_predictors = data.drop(columns=['Price'])
X = melb_predictors.select_dtypes(exclude=['object'])

# Split into training (80%) and validation (20%) subsets; the fixed
# random_state keeps the split the same on every run
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)


In [13]:
# defining a function

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# function to compare diff approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation mean absolute error of a random forest.

    A RandomForestRegressor with 10 estimators and random_state 0 is fitted
    on (X_train, y_train). Its predictions for X_valid are then compared
    against y_valid with mean_absolute_error, and that score is returned.
    """
    forest = RandomForestRegressor(n_estimators=10, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))
    


### Score from approach_1- `Drop column`


In [15]:
# Names of the training columns that contain at least one missing value
cols_missing = [name
                for name, has_null in X_train.isnull().any().items()
                if has_null]

# Remove those columns from both the training and the validation features
reduced_X_train = X_train.drop(columns=cols_missing)
reduced_X_valid = X_valid.drop(columns=cols_missing)

print("MAE from Approch_1 (after having dropped the column values): ")
drop_mae = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
print(drop_mae)
      


MAE from Approch_1 (after having dropped the column values): 
183550.22137772635


### Score from approach_2 - `Imputation`

In [17]:
from sklearn.impute import SimpleImputer

# Imputation with SimpleImputer's default settings. The imputer is fitted on
# the training data only and then reused, unchanged, on the validation data.
my_imputer = SimpleImputer()

# By default the imputer returns plain arrays, which carry neither column
# names nor the row index. Rebuild the DataFrames with both: without
# `index=` the frames would get a fresh 0..n-1 index that no longer lines up
# with the (shuffled) index of y_train / y_valid.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train),
                               columns=X_train.columns,
                               index=X_train.index)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid),
                               columns=X_valid.columns,
                               index=X_valid.index)

print("MAE from Approach 2 (imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

                               

MAE from Approach 2 (imputation):
178166.46269899711


### Score from approach_3 - `Extension to imputation`

In [20]:
# Work on copies so the original feature frames stay untouched
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# For every column in cols_missing, add a boolean indicator column that
# records which rows are missing (and will therefore be imputed below).
# NOTE(review): the suffix is appended without a separator, producing names
# such as 'Carwas_missing'; kept as-is so existing column names do not change.
for col in cols_missing:
    X_train_plus[col + 'was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + 'was_missing'] = X_valid_plus[col].isnull()

# Imputation with SimpleImputer's default settings, fitted on the training
# data only and reused on the validation data.
my_imputer = SimpleImputer()

# By default the imputer returns plain arrays, which carry neither column
# names nor the row index. Rebuild the DataFrames with both: without
# `index=` the frames would get a fresh 0..n-1 index that no longer lines up
# with the (shuffled) index of y_train / y_valid.
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus),
                                    columns=X_train_plus.columns,
                                    index=X_train_plus.index)
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus),
                                    columns=X_valid_plus.columns,
                                    index=X_valid_plus.index)

print("MAE from Approach 3 (an extension to imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

MAE from Approach 3 (an extension to imputation):
178927.503183954


### Inference
`Imputation` (MAE ≈ 178,166) performs better than dropping the columns (MAE ≈ 183,550) because dropping removes a lot of useful information.

In [23]:
# Dimensions of the training features as (rows, columns)
print(X_train.shape)

# Count the missing values in each column and show only the columns that
# have at least one
missing_val_count_by_column = X_train.isna().sum()
print(missing_val_count_by_column.loc[missing_val_count_by_column > 0])

(10864, 12)
Car               49
BuildingArea    5156
YearBuilt       4307
dtype: int64
