## Loading and analyzing the data

In [13]:
import pandas as pd

# Load the listings from the CSV file (path is relative to the working directory).
full_data = pd.read_csv(filepath_or_buffer='./RealEstate.csv')

# Bare last expression so the notebook renders the frame as a table.
full_data

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,26/08/2017,16.7,3150.0,...,2.0,2.0,652.0,,1981.0,,-37.90562,145.16761,South-Eastern Metropolitan,7392.0
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,26/08/2017,6.8,3016.0,...,2.0,2.0,333.0,133.0,1995.0,,-37.85927,144.87904,Western Metropolitan,6380.0
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,26/08/2017,6.8,3016.0,...,2.0,4.0,436.0,,1997.0,,-37.85274,144.88738,Western Metropolitan,6380.0
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,...,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0


The goal is to predict prices, so we will use the **"Price"** column as our target.
Rows with a missing value in this column will be dropped.

We will also drop the "Address" and "Date" columns from the features, for the reasons given below:


In [14]:
# Keep only the rows that have a value in the target column. The name is
# rebound instead of using inplace=True, so the DataFrame object created by
# the loading cell is never mutated and this cell can be re-run safely.
full_data = full_data.dropna(subset=['Price'])

# Target: the "Price" column.
y = full_data['Price']

# Features: every remaining column except the target itself and the two
# columns ("Address", "Date") that the notes below argue are uninformative.
X = full_data.drop(columns=['Price', 'Address', 'Date'])

> "Address" is of little use to our model: it is an almost unique description of each property, so it **does not capture relationships** in the data.

In [15]:
# Pull out the "Address" column and count how often each distinct value occurs.
address_col = full_data.loc[:, 'Address']
address_col.value_counts()

36 Aberfeldie St    3
2 Bruce St          3
5 Charles St        3
53 William St       3
14 Arthur St        3
                   ..
16 Alleford St      1
2/1073 Centre Rd    1
14 Columbia St      1
21 Hardy Ct         1
6 Agnes St          1
Name: Address, Length: 13378, dtype: int64

> "Date" is not important for our model, as it is most likely the date on which the property was listed.

In [16]:
# Pull out the "Date" column and summarise it (count, unique values, most common value).
date_col = full_data.loc[:, 'Date']
date_col.describe()

count          13580
unique            58
top       27/05/2017
freq             473
Name: Date, dtype: object

***

## Additional preprocessing and creating a pipeline

The "Suburb" column is a bit special. It provides important information, but it is impractical to encode with OneHotEncoder because of its high cardinality (it would add 314 extra columns, i.e. 314 × 13580 entries).

In [17]:
# Summarise the "Suburb" feature to check how many distinct values it has.
X.loc[:, 'Suburb'].describe()

count         13580
unique          314
top       Reservoir
freq            359
Name: Suburb, dtype: object

Instead we will use OrdinalEncoder for this column only.

In [18]:
from sklearn.preprocessing import OrdinalEncoder

# Replace each suburb name with a numeric code. The encoder is fitted on the
# single "Suburb" column (kept two-dimensional via the double brackets) and
# the encoded values are written back over that same column.
oe = OrdinalEncoder()
suburb_frame = X[['Suburb']]
oe.fit(suburb_frame)
X[['Suburb']] = oe.transform(suburb_frame)

# Summary statistics of the numeric columns, now including the encoded "Suburb".
X.describe()

Unnamed: 0,Suburb,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,155.307953,2.937997,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,90.307844,0.955748,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,0.0,1.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,70.0,2.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,154.0,3.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,234.0,3.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,313.0,10.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


Selecting numerical and categorical columns

In [19]:
# Split the feature names by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical. Columns of any other dtype end up
# in neither list.
numerical_cols = [
    name
    for name, kind in X.dtypes.items()
    if kind == 'int64' or kind == 'float64'
]
categorical_cols = [
    name
    for name, kind in X.dtypes.items()
    if kind == 'object'
]

Preprocessing tools:
- SimpleImputer for missing numerical and categorical values
- OneHotEncoder for encoding categorical values

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numerical features: fill missing values with a constant (fill_value is left
# at its default). The strategy is tuned later by the grid search.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps categories that were not
# seen during fit from raising at transform time.
categorial_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own transformer. The names 'num' and
# 'cat' are the prefixes used to address these steps in parameter grids.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorial_transformer, categorical_cols),
])

Choosing our model

In [21]:
from xgboost import XGBRegressor

# XGBoost regressor with default hyper-parameters and random_state fixed at 0.
model = XGBRegressor(random_state=0)

# Complete workflow: preprocessing first, then the regressor. The step names
# 'preprocessor' and 'model' are the prefixes used in the parameter grid.
main = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

***

## GridSearchCV and finding optimal parameters for our model

Setting up

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np

# Hyper-parameter grid. Both numeric ranges are generated from integers so
# every entry is a plain Python int/float. A float-stepped np.arange is
# avoided on purpose: NumPy's documentation warns that with a non-integer
# step the results (and even the length) are not reliable because of
# floating-point rounding.
params_grid = {
    # 100, 200, ..., 1000
    'model__n_estimators': list(range(100, 1100, 100)),
    # 0.01, 0.02, ..., 0.10
    'model__learning_rate': [step / 100 for step in range(1, 11)],
    'preprocessor__num__strategy': ['mean', 'median', 'constant', 'most_frequent'],
}

# Exhaustive search: 10 * 10 * 4 = 400 candidates, each scored by R^2 with
# 5-fold cross-validation (2000 pipeline fits), so this cell is slow to run.
grid = GridSearchCV(main, params_grid, cv=5, scoring='r2')
grid.fit(X, y)
print(f'Grid: {grid.best_params_}')

***

## Updating parameters

In [23]:
# Apply the winning hyper-parameters to the existing pipeline instead of
# rebuilding every stage by hand. The keys of best_params_ are the same
# "<step>__<param>" paths the grid was declared with, so set_params routes
# each value to the right component, and a parameter added to the grid later
# is picked up here without further edits.
# `main` contains the very objects bound to `numerical_transformer`,
# `preprocessor` and `model`, so those names reflect the tuned settings too.
main.set_params(**grid.best_params_)

# The scorer reports negated MAE, so flip the sign to get a positive error
# for each of the 10 folds.
scores = -1 * cross_val_score(main, X, y, cv=10, scoring='neg_mean_absolute_error')

print(f'MAE Result: {scores.mean()}')

MAE Result: 167737.7767724825
