In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the data
df = pd.read_csv("./data/melb_data.csv")

In [6]:
# Select target
y = df.Price

# to keep things simple, i'll use only numerical predictors
melb_predictors = df.drop(["Price"], axis=1)
X = melb_predictors.select_dtypes(exclude=["object"])
X.dtypes

Rooms              int64
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
Lattitude        float64
Longtitude       float64
Propertycount    float64
dtype: object

In [7]:
# Divide data into training and validation subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X_train.head(5)

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0
6524,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0
8413,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0
2919,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0
6043,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


### Define Function to measure quality of each approach

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

#### Score from approach 1: (Drop Columns with missing values) 

In [12]:
X_train.isnull().sum()

Rooms               0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                49
Landsize            0
BuildingArea     5156
YearBuilt        4307
Lattitude           0
Longtitude          0
Propertycount       0
dtype: int64

In [10]:
# Get names of columns with missing values
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

print(cols_with_missing)
# Drop columns with missing data in training data and validation data
dropped_X_train = X_train.drop(cols_with_missing, axis=1)
dropped_X_test = X_test.drop(cols_with_missing, axis=1)

mae = score_dataset(dropped_X_train, dropped_X_test, y_train, y_test)
print("MAE from Approach 1: {}".format(round(mae,2)))

['Car', 'BuildingArea', 'YearBuilt']
MAE from Approach 1: 183550.22


#### Score from Approach 2: (Imputation) 

In [17]:
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer(strategy='median') # Áp dụng tham số median cho kết quả tốt hơn mean
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_test = pd.DataFrame(my_imputer.fit_transform(X_test))

imputed_X_train.columns = X_train.columns
imputed_X_test.columns = X_test.columns

mae = score_dataset(imputed_X_train, imputed_X_test, y_train, y_test)
print("MAE from Approach 2: {}".format(round(mae, 2)))
print("\n")
imputed_X_train.head()

MAE from Approach 2: 178045.24




Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,126.0,1940.0,-37.85984,144.9867,13240.0
1,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,126.0,1970.0,-37.858,144.9005,6380.0
2,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,126.0,1970.0,-37.7988,144.822,3755.0
3,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,126.0,1995.0,-37.7083,144.9158,8870.0
4,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


#### Score from Approach 3: (An Extension to imputation) 

In [18]:
# make a copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_test_plus = X_test.copy()

# make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + "_was_missing"] = X_train_plus[col].isnull()
    X_test_plus[col + "_was_missing"] = X_test_plus[col].isnull()
    
X_train_plus.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount,Car_was_missing,BuildingArea_was_missing,YearBuilt_was_missing
12167,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0,False,True,False
6524,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0,False,True,True
8413,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0,False,True,True
2919,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0,False,True,False
6043,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0,False,False,False


In [19]:
# Imputation
my_imputer = SimpleImputer(strategy='mean')

imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_test_plus = pd.DataFrame(my_imputer.transform(X_test_plus))

# Imputation removed column names, put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_test_plus.columns = X_test_plus.columns

mae = score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test)
print("MAE from Approach 3: {}".format(round(mae, 2)))
print("\n")
imputed_X_train_plus.head()

MAE from Approach 3: 178927.5




Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount,Car_was_missing,BuildingArea_was_missing,YearBuilt_was_missing
0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,153.764119,1940.0,-37.85984,144.9867,13240.0,0.0,1.0,0.0
1,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,153.764119,1964.839866,-37.858,144.9005,6380.0,0.0,1.0,1.0
2,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,153.764119,1964.839866,-37.7988,144.822,3755.0,0.0,1.0,1.0
3,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,153.764119,1995.0,-37.7083,144.9158,8870.0,0.0,1.0,0.0
4,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0,0.0,0.0,0.0
