In [48]:
import pandas as pd 
import xgboost as xgb
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from math import sqrt

In [2]:
property_df = pd.read_csv("../data/curated/full_data.csv")

In [3]:
property_df.head(1)

Unnamed: 0,index,address,cost_text,Bed,Bath,Park,property_type,desc_head,LocID,longitude,latitude,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,total_population,SA2_NAME21,duration_mins,Offence Count,SA2_CODE_2021
0,https://www.domain.com.au/warrandyte-vic-3113-...,Warrandyte VIC 3113,875.0,5,3,3,House,mud brick magic,211021262,145.209992,-37.747366,930.0,481.0,3051.0,2779.0,8781.0,Warrandyte - Wonga Park,>50.0,859.0,211021262.0


In [4]:
y = property_df['cost_text'].tolist()

In [7]:
X = property_df.drop(["index", "address", "desc_head", "latitude", "longitude", "LocID", "SA2_CODE_2021"], axis=1)

In [14]:
# encode categorical variables
onehot_columns = ['property_type', 'SA2_NAME21', 'duration_mins']
onehot_df = X[onehot_columns]
onehot_df = pd.get_dummies(onehot_df, columns = onehot_columns)
score_onehot_drop = X.drop(onehot_columns, axis = 1)
score_onehot = pd.concat([score_onehot_drop, onehot_df], axis = 1)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(score_onehot, y, test_size=0.2, random_state=1)

#### XGBoosting

In [19]:
params = { 'max_depth': [3,6,10],
           'gamma': [0, 0.25, 0.5],
           'learning_rate': [0.01, 0.05, 0.1, 0.2],
           'n_estimators': [100, 250, 500, 1000],
           'colsample_bytree': [0.3, 0.7]}

xgbr = xgb.XGBRegressor(random_state=1)

clf = GridSearchCV(estimator=xgbr, 
                   param_grid=params,
                   scoring='neg_mean_squared_error', 
                   verbose=1,
                   return_train_score=True)

In [20]:
clf.fit(X_train, y_train)   # takes 165mins to run

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


In [21]:
print("Best parameters:", clf.best_params_)

Best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 1000}


According to our GridSearch, the parameters we should use are: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 1000}

#### Testing

In [49]:
y_pred = clf.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print('R-Squared:', r2)
print ('R Squared:', rmse)
print ('MAE:', mae)

R-Squared: 0.9972014727686229
R Squared: 13.199916492786993
MAE: 1.4561957417684275
