### Imports

In [76]:
# Data Management
import pandas as pd
import numpy as np
from pandas_datareader.data import DataReader
from ta import add_all_ta_features

# Statistics
from statsmodels.tsa.stattools import adfuller

# Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Supervised Learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestRegressor

# Reporting
import matplotlib.pyplot as plt

### Data Ingestion


In [77]:
# Load the raw sales records from the CSV file that sits next to the notebook.
df = pd.read_csv('SydneyHousePrices.csv')

# Print the number of rows, then show the first rows as the cell output.
print(df.shape[0])
df.head(5)

199504


Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType
0,2019-06-19,1,Avalon Beach,2107,1210000,4.0,2,2.0,house
1,2019-06-13,2,Avalon Beach,2107,2250000,4.0,3,4.0,house
2,2019-06-07,3,Whale Beach,2107,2920000,3.0,3,2.0,house
3,2019-05-28,4,Avalon Beach,2107,1530000,3.0,1,2.0,house
4,2019-05-22,5,Whale Beach,2107,8000000,5.0,4,4.0,house


In [78]:
df.info()
# Date, suburb and propType are object (string) columns, which the model cannot consume;
# they have to be converted to numeric features or dropped before training.
# bed and car report fewer non-null rows than the frame has, i.e. they contain gaps.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199504 entries, 0 to 199503
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Date        199504 non-null  object 
 1   Id          199504 non-null  int64  
 2   suburb      199504 non-null  object 
 3   postalCode  199504 non-null  int64  
 4   sellPrice   199504 non-null  int64  
 5   bed         199350 non-null  float64
 6   bath        199504 non-null  int64  
 7   car         181353 non-null  float64
 8   propType    199504 non-null  object 
dtypes: float64(2), int64(4), object(3)
memory usage: 13.7+ MB


### Feature Engineering - Common Tasks

##### Handle non-numerical Data

In [79]:
# Count the distinct values of the two categorical columns; the counts drive the
# choice of encoding in the next cell.

# suburb: label encoding, i.e. one integer code per suburb
unique_suburb = pd.unique(df['suburb'])
print(unique_suburb.size)

# propType: one-hot encoding, i.e. one indicator column per property type
unique_propType = pd.unique(df['propType'])
print(unique_propType.size)



685
8


In [80]:
# Encoding

# Label-encode suburb: every distinct suburb name is mapped to one integer code.
labelencoder = LabelEncoder()
encoded_suburbs = labelencoder.fit_transform(df['suburb'])
df['suburbs_encoded'] = encoded_suburbs

# One-hot encode propType. drop_first=True leaves out the first category's indicator
# column to avoid the dummy variable trap.
# The prefix "pt_" combined with get_dummies' default "_" separator produces the
# "pt__<type>" column names.
encoded = pd.get_dummies(df['propType'], prefix="pt_", drop_first=True)

# assign() overwrites indicator columns that already exist (keeping their position) and
# appends the ones that do not, so this cell can be re-run safely. df.join(encoded)
# raises on a second run because the column names overlap.
df = df.assign(**{column: encoded[column] for column in encoded.columns})


In [81]:
# Inspect the last rows to confirm the encoded columns were added.
df.tail(5)

Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType,suburbs_encoded,pt__duplex/semi-detached,pt__house,pt__other,pt__terrace,pt__townhouse,pt__villa,pt__warehouse
199499,2014-06-20,199500,Illawong,2234,1900000,5.0,3,7.0,house,318,0,1,0,0,0,0,0
199500,2014-05-26,199501,Illawong,2234,980000,4.0,3,2.0,house,318,0,1,0,0,0,0,0
199501,2014-04-17,199502,Alfords Point,2234,850000,4.0,2,2.0,house,5,0,1,0,0,0,0,0
199502,2013-09-07,199503,Illawong,2234,640000,3.0,2,2.0,townhouse,318,0,0,0,0,1,0,0
199503,2011-04-16,199504,Alfords Point,2234,1611000,5.0,4,3.0,house,5,0,1,0,0,0,0,0


##### Set Target

In [82]:
# Expose the sale price under the generic name 'target'; sellPrice itself is dropped
# from the feature frame in the data cleaning cell.
df['target'] = df['sellPrice']

In [83]:
# Data Cleaning

# Build the modelling frame without touching df: the raw text columns, the row Id and
# sellPrice (already copied into target) are left out.
df_droped = df.drop(columns=["Date", "Id", "suburb", "sellPrice", "propType"])

# Before imputation: does the frame contain missing or infinite values?
isNull = df_droped.isna().to_numpy().any()
isInf = df_droped.isin([np.inf, -np.inf]).to_numpy().any()
print(isNull, isInf)

# Fill every missing entry with the mean of its column, then repeat the same check.
# NOTE(review): the means are computed over all rows, i.e. before the train/test split.
column_means = df_droped.mean()
df_droped = df_droped.fillna(column_means)
isNull = df_droped.isna().to_numpy().any()
isInf = df_droped.isin([np.inf, -np.inf]).to_numpy().any()
print(isNull, isInf)

df_droped.head()

True False


False False


Unnamed: 0,postalCode,bed,bath,car,suburbs_encoded,pt__duplex/semi-detached,pt__house,pt__other,pt__terrace,pt__townhouse,pt__villa,pt__warehouse,target
0,2107,4.0,2,2.0,22,0,1,0,0,0,0,0,1210000
1,2107,4.0,3,4.0,22,0,1,0,0,0,0,0,2250000
2,2107,3.0,3,2.0,654,0,1,0,0,0,0,0,2920000
3,2107,3.0,1,2.0,22,0,1,0,0,0,0,0,1530000
4,2107,5.0,4,4.0,654,0,1,0,0,0,0,0,8000000


##### MinMax Scaling - Feature Scaling

In [84]:
# Rescale every column to the [0, 1] range using that column's own minimum and maximum.
# The scaled values go into a new frame instead of being written into a copy through
# .iloc[:]: most source columns are integer-typed, and recent pandas versions warn on
# (and are moving to reject) storing floats into integer columns in place.
# NOTE(review): the scaler is fitted on all rows and on the target column as well, before
# the train/test split; fit it on the training features only if df_scaling feeds a model.
min_max_scaler = MinMaxScaler()
df_scaling = pd.DataFrame(
    min_max_scaler.fit_transform(df_droped),
    columns=df_droped.columns,
    index=df_droped.index,
)
df_scaling.head()

Unnamed: 0,postalCode,bed,bath,car,suburbs_encoded,pt__duplex/semi-detached,pt__house,pt__other,pt__terrace,pt__townhouse,pt__villa,pt__warehouse,target
0,0.037179,0.030612,0.010204,0.025,0.032164,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000563
1,0.037179,0.030612,0.020408,0.075,0.032164,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.001048
2,0.037179,0.020408,0.020408,0.025,0.95614,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.00136
3,0.037179,0.020408,0.0,0.025,0.032164,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000712
4,0.037179,0.040816,0.030612,0.075,0.95614,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.003725


### Train Test Split

In [85]:
# Choose the frame that feeds the train/test split: the min-max scaled frame when a deep
# learning model is used, otherwise the unscaled cleaned frame (the case here).
is_deep_learning = False

if is_deep_learning:
    df_tts = df_scaling.copy()
else:
    df_tts = df_droped.copy()

df_tts.head()

Unnamed: 0,postalCode,bed,bath,car,suburbs_encoded,pt__duplex/semi-detached,pt__house,pt__other,pt__terrace,pt__townhouse,pt__villa,pt__warehouse,target
0,2107,4.0,2,2.0,22,0,1,0,0,0,0,0,1210000
1,2107,4.0,3,4.0,22,0,1,0,0,0,0,0,2250000
2,2107,3.0,3,2.0,654,0,1,0,0,0,0,0,2920000
3,2107,3.0,1,2.0,22,0,1,0,0,0,0,0,1530000
4,2107,5.0,4,4.0,654,0,1,0,0,0,0,0,8000000


In [90]:
# Split x and y data

# Select the target by name rather than by position: positional slicing silently picks
# the wrong column as soon as any column ends up after 'target' in the frame.
X = df_tts.drop(columns="target").to_numpy()
y = df_tts["target"].to_numpy()


In [98]:
# Train Test Split

# Hold out 20% of the rows for testing; the shuffle is seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(159603, 12) (39901, 12) (159603,) (39901,)


### ML - Lightweight Test

In [99]:
# Baseline model: a random forest of 100 trees, each limited to depth 10, with a fixed
# seed. The fit call stays last so the fitted estimator is shown as the cell output.
regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=0,
)
regressor.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, random_state=0)

In [109]:
# make prediction on test set
y_pred = regressor.predict(X_test)

# Round in one vectorised call and convert to plain Python floats: the preview then
# prints as bare numbers regardless of how the installed NumPy formats its scalar types,
# and no Python-level loop runs over the whole prediction array.
rounded_pred = np.round(y_pred).tolist()

# Compare the first ten predictions with the first ten true prices.
print(f'Predictions : {rounded_pred[:10]}')
print(f'Reality : {y_test[:10]}')

Predictions : [602318.0, 2003988.0, 1107906.0, 1023853.0, 870998.0, 1430977.0, 904185.0, 1239137.0, 665817.0, 1382967.0]
Reality : [ 730000 1350100  860000 1390000  985000 1268000  480000 1230000  650000
 1700000]


In [110]:
# Estimate the prediction error with repeated k-fold cross-validation on the training
# split: 10 folds repeated 3 times, scored as negative mean absolute error.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    regressor,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
    error_score='raise',  # fail loudly instead of recording a placeholder score
)

In [117]:
# Report performance
# The scores are negative MAE values, so the mean is shown as an absolute value; the
# standard deviation describes the spread across the cross-validation runs.
mean_abs_error = abs(n_scores.mean())
score_spread = n_scores.std()
print('MAE: %.3f --- std (%.3f)' % (mean_abs_error, score_spread))



MAE: 395621.687 --- std (51625.585)
-395621.68710803875
