Predicting Housing Prices in Melbourne

In [1]:
#python imports
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt 
from xgboost import XGBRegressor

#sklearn imports
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score, train_test_split, RepeatedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline, Pipeline
import seaborn as sns


In [2]:
DATA_URL = "../input/housing/melb_housing_data.csv"
                        
def wrangle(data_url = DATA_URL):
    df = pd.read_csv(data_url)

    #remove high dimensionality feature
    df.drop(columns=["Address"], inplace=True)
    return df

data = wrangle()
data.head(2)

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,2,h,1480000,S,Biggin,03/12/2016,2.5,3067,2,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,2,h,1035000,S,Biggin,04/02/2016,2.5,3067,2,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019


In [3]:
#specify the feature matrix and target vector
#feature matrix
X = data.copy()

#target vector
y= X.pop("Price")


Process(transform) the Data with pipelines

In [4]:
#Pipeline that transforms numerical data by filling missing data with mean of the column
# and also applies standard scaler such that the data has uniform scale
num_pipeline = make_pipeline(
    SimpleImputer(strategy= 'mean'),
    StandardScaler()
)

#Pipeline transforms categorical data by
#filling the missing data with most frequent value
#and applys one hot encoder to it
cat_pipeline = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    SimpleImputer(strategy="most_frequent")
)

#preprocessing full pipeline 
preprocessor = make_column_transformer( 
    ( num_pipeline, make_column_selector(dtype_include= np.number)),
    (cat_pipeline, make_column_selector(dtype_exclude= np.number))
)

In [5]:
#baseline model
baseline_predictions = [y.mean()]*len(y)
baseline_prediction_score = mean_absolute_error(y, baseline_predictions)
baseline_prediction_score

462213.67024240794

In [6]:
#Define prospective models
random_forest_reg = make_pipeline(preprocessor, RandomForestRegressor())
ridge_regressor = make_pipeline(preprocessor, Ridge())
xgboost_reg = make_pipeline(preprocessor, XGBRegressor(
    n_estimators =500,
    eta = 0.01
))


Evaluate model

In [7]:
#Evaluate the model by obtaining mean squared error metric
cv = RepeatedKFold(n_splits= 5, n_repeats= 3) #cross validation configuration
pred_scores = cross_val_score(xgboost_reg, X, y, scoring="neg_mean_absolute_error", cv=cv)

pd.Series(pred_scores).abs().describe().round(2)


count        15.00
mean     169653.23
std        2214.50
min      166134.07
25%      168560.52
50%      169236.06
75%      170518.33
max      174057.93
dtype: float64