In [2]:
import pandas as pd

In [6]:
data = pd.read_csv(".\\data\\melbourne-housing-snapshot\\melb_data.csv")
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [10]:
data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [36]:
X = data[["Rooms", "Distance", "Landsize", "BuildingArea", "YearBuilt"]]
# X = data.drop(["Price"], axis=1)
X.head()

Unnamed: 0,Rooms,Distance,Landsize,BuildingArea,YearBuilt
0,2,2.5,202.0,,
1,2,2.5,156.0,79.0,1900.0
2,3,2.5,134.0,150.0,1900.0
3,3,2.5,94.0,,
4,4,2.5,120.0,142.0,2014.0


In [37]:
y = data.Price
y.head()

0    1480000.0
1    1035000.0
2    1465000.0
3     850000.0
4    1600000.0
Name: Price, dtype: float64

In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [39]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, y_train, X_valid, y_valid):
    model = RandomForestRegressor(random_state=1)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)


In [40]:
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
cols_with_missing

['BuildingArea', 'YearBuilt']

In [41]:
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test = X_test.drop(cols_with_missing, axis=1)

In [42]:
print("MAE (reduced dataset):")
print(score_dataset(reduced_X_train, y_train, reduced_X_test, y_test))

MAE (reduced dataset):
274136.32463014097


In [47]:
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
# imputed_X_train
imputed_X_train.columns = X_train.columns

imputed_X_test = pd.DataFrame(my_imputer.transform(X_test))
imputed_X_test.columns = X_test.columns
imputed_X_test

Unnamed: 0,Rooms,Distance,Landsize,BuildingArea,YearBuilt
0,4.0,8.0,450.0,190.000000,1910.000000
1,2.0,6.6,172.0,81.000000,1900.000000
2,3.0,10.5,581.0,153.764119,1964.839866
3,3.0,4.5,128.0,134.000000,2000.000000
4,3.0,8.5,480.0,153.764119,1964.839866
...,...,...,...,...,...
2711,2.0,6.4,47.0,35.000000,2013.000000
2712,4.0,8.0,551.0,153.764119,1964.839866
2713,3.0,10.8,757.0,153.764119,1964.839866
2714,4.0,6.2,478.0,152.000000,1925.000000


In [48]:
print("MAE (imputed dataset):")
print(score_dataset(imputed_X_train, y_train, imputed_X_test, y_test))

MAE (imputed dataset):
250774.06652053812


In [53]:
X_train_plus = X_train.copy()
X_test_plus = X_test.copy()

In [60]:
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()

X_train_plus.head()

Unnamed: 0,Rooms,Distance,Landsize,BuildingArea,YearBuilt,BuildingArea_was_missing,YearBuilt_was_missing
12167,1,5.0,0.0,,1940.0,True,False
6524,2,8.0,193.0,,,True,True
8413,3,12.6,555.0,,,True,True
2919,3,13.0,265.0,,1995.0,True,False
6043,3,13.3,673.0,673.0,1970.0,False,False


In [61]:
X_train_plus_imputed = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
X_train_plus_imputed.columns = X_train_plus.columns
X_train_plus_imputed.head()


Unnamed: 0,Rooms,Distance,Landsize,BuildingArea,YearBuilt,BuildingArea_was_missing,YearBuilt_was_missing
0,1.0,5.0,0.0,153.764119,1940.0,1.0,0.0
1,2.0,8.0,193.0,153.764119,1964.839866,1.0,1.0
2,3.0,12.6,555.0,153.764119,1964.839866,1.0,1.0
3,3.0,13.0,265.0,153.764119,1995.0,1.0,0.0
4,3.0,13.3,673.0,673.0,1970.0,0.0,0.0


In [62]:
for col in cols_with_missing:
    X_test_plus[col + '_was_missing'] = X_test_plus[col].isnull()

X_test_plus.head()

Unnamed: 0,Rooms,Distance,Landsize,BuildingArea,YearBuilt,BuildingArea_was_missing,YearBuilt_was_missing
8505,4,8.0,450.0,190.0,1910.0,False,False
5523,2,6.6,172.0,81.0,1900.0,False,False
12852,3,10.5,581.0,,,True,True
4818,3,4.5,128.0,134.0,2000.0,False,False
12812,3,8.5,480.0,,,True,True


In [63]:
my_imputer = SimpleImputer()

X_test_plus_imputed = pd.DataFrame(my_imputer.fit_transform(X_test_plus))
X_test_plus_imputed.columns = X_test_plus.columns
X_test_plus_imputed.head()

Unnamed: 0,Rooms,Distance,Landsize,BuildingArea,YearBuilt,BuildingArea_was_missing,YearBuilt_was_missing
0,4.0,8.0,450.0,190.0,1910.0,0.0,0.0
1,2.0,6.6,172.0,81.0,1900.0,0.0,0.0
2,3.0,10.5,581.0,144.756506,1964.064927,1.0,1.0
3,3.0,4.5,128.0,134.0,2000.0,0.0,0.0
4,3.0,8.5,480.0,144.756506,1964.064927,1.0,1.0


In [64]:
print("MAE (plusimputed dataset):")
print(score_dataset(X_train_plus_imputed, y_train, X_test_plus_imputed, y_test))

MAE (plusimputed dataset):
248738.35946278422


In [65]:
X_train.shape

(10864, 5)

In [73]:
X_train.isnull().sum()

Rooms              0
Distance           0
Landsize           0
BuildingArea    5156
YearBuilt       4307
dtype: int64