In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [8]:
melbourn_data = pd.read_csv('melb_data.csv')
target = melbourn_data.Price
melbourn_data_prediction = melbourn_data.drop(['Price'], axis =1)
# To keep things simple, we will usel only numerical features
melbourn_data_prediction = melbourn_data_prediction.select_dtypes(exclude = ['object'])

In [10]:
melbourn_data_prediction.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2,2.5,3067.0,2.0,1.0,1.0,202.0,,,-37.7996,144.9984,4019.0
1,2,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.8079,144.9934,4019.0
2,3,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,-37.8093,144.9944,4019.0
3,3,2.5,3067.0,3.0,2.0,1.0,94.0,,,-37.7969,144.9969,4019.0
4,4,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,-37.8072,144.9941,4019.0


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(melbourn_data_prediction, target, test_size = 0.3, random_state = 101)

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different methods
def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(n_estimators = 10, random_state= 101)
    model.fit(X_train, y_train)
    predicted_test = model.predict(X_test)
    return mean_absolute_error(predicted_test, y_test)

In [20]:
# Get the name of columns contain mising values
col_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(col_with_missing, axis =1)
reduced_X_test = X_test.drop(col_with_missing, axis =1)

print('MAE for dropping any features contain missing value')
print(score_dataset(reduced_X_train, reduced_X_test, Y_train, Y_test))

MAE for dropping any features contain missing value
185200.12072024686


In [25]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer(strategy = 'median')
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_test = pd.DataFrame(my_imputer.fit_transform(X_test))

# Imputation removes the name columns
imputed_X_train.columns = X_train.columns
imputed_X_test.columns = X_test.columns

print('MAE for imputed dataset using Simple imputation')
print(score_dataset(imputed_X_train, imputed_X_test, Y_train, Y_test))

MAE for imputed dataset using Simple imputation
179837.1635657012


In [26]:
# Make copy to avoid changing on original data
X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

In [32]:
# Make new column to show missing values in the features
for col in col_with_missing:
    X_train_copy[col + '_was_missing'] = X_train_copy[col].isnull()
    X_test_copy[col + '_was_missing'] = X_test_copy[col].isnull()

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train_copy))
imputed_X_test = pd.DataFrame(my_imputer.fit_transform(X_test_copy))

# Imputation removes the name columns
imputed_X_train.columns = X_train_copy.columns
imputed_X_test.columns = X_test_copy.columns

print('MAE for imputed dataset using Simple imputation')
print(score_dataset(imputed_X_train, imputed_X_test, Y_train, Y_test))

MAE for imputed dataset using Simple imputation
194718.76918671248


AttributeError: 'list' object has no attribute 'isnull'