## Load the Dataset

In [1]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn import model_selection
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import pickle
import shap

pd.set_option('display.float_format', lambda x: '%.4f' % x)

  def _pt_shuffle_rec(i, indexes, index_mask, partition_tree, M, pos):
  def delta_minimization_order(all_masks, max_swap_size=100, num_passes=2):
  def _reverse_window(order, start, length):
  def _reverse_window_score_gain(masks, order, start, length):
  def _mask_delta_score(m1, m2):
  def identity(x):
  def _identity_inverse(x):
  def logit(x):
  def _logit_inverse(x):
  def _build_fixed_single_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _build_fixed_multi_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _init_masks(cluster_matrix, M, indices_row_pos, indptr):
  def _rec_fill_masks(cluster_matrix, indices_row_pos, indptr, indices, M, ind):
  def _single_delta_mask(dind, masked_inputs, last_mask, data, x, noop_code):
  def _delta_masking(masks, x, curr_delta_inds, varying_rows_out,
  def _jit_build_partition_tree(xmin, xmax, ymi

In [2]:
# Load the dataset

df = pd.read_csv('Cleaned_Data.csv')
df.head()

Unnamed: 0,Location,Price,Rooms,Bathrooms,Property Type,Size,Furnishing,Store Rooms,Places of Worship,Schools,Hospitals,Malls,Restaurants
0,KLCC,1250000.0,2,3.0,Serviced Residence,1335.0,Fully Furnished,1,53.0,24.0,20.0,5.0,818.0
1,KLCC,2400000.0,2,2.0,Serviced Residence,1006.0,Fully Furnished,0,53.0,24.0,20.0,5.0,818.0
2,KLCC,3600000.0,4,4.0,Serviced Residence,3897.0,Partly Furnished,0,53.0,24.0,20.0,5.0,818.0
3,KLCC,4280000.0,3,4.0,Serviced Residence,2195.0,Fully Furnished,1,53.0,24.0,20.0,5.0,818.0
4,KLCC,2300000.0,1,2.0,Serviced Residence,1023.0,Partly Furnished,0,53.0,24.0,20.0,5.0,818.0


## Feature Extraction

### Encoding

In [3]:
# Select categorical columns

categorical = df.select_dtypes(exclude=[np.number]).columns
print (categorical)

Index(['Location', 'Property Type', 'Furnishing'], dtype='object')


In [4]:
# Encode Location

le_location = LabelEncoder()
df['Location'] = le_location.fit_transform(df['Location'])

In [5]:
# Encode Property Type

le_proptype = LabelEncoder()
df['Property Type'] = le_proptype.fit_transform(df['Property Type'])

In [6]:
# Encode Furnishing

le_furnishing = LabelEncoder()
df['Furnishing'] = le_furnishing.fit_transform(df['Furnishing'])

### Split and Scale the Dataset

In [7]:
# Shuffle rows

df = shuffle(df, random_state=42)

In [8]:
# Split dataset into features and outcome

DV = 'Price'
x = df.drop(DV, axis=1)
y = df[DV]

In [9]:
# Split into train and test

x_train, x_test, y_train, y_test = train_test_split (x, y, test_size=0.20,
random_state=42)

In [10]:
# Select all numerical columns except for Price

numerical = ['Rooms', 'Bathrooms', 'Size', 'Store Rooms', 'Places of Worship',\
             'Schools', 'Hospitals', 'Malls', 'Restaurants']

In [11]:
# Scale numerical columns in x_train

scaler = MinMaxScaler()
x_train[numerical] = scaler.fit_transform(x_train[numerical])
x_train.head()

Unnamed: 0,Location,Rooms,Bathrooms,Property Type,Size,Furnishing,Store Rooms,Places of Worship,Schools,Hospitals,Malls,Restaurants
36520,0,0.3333,0.3333,2,0.3248,0,0.5,0.7692,0.9286,0.8261,1.0,0.9078
1863,32,0.1667,0.1667,6,0.2287,0,0.5,0.9744,0.7857,0.8696,0.625,0.8133
18650,25,0.3333,0.3333,7,0.2638,2,0.5,0.5641,0.75,0.0435,0.625,0.5456
43497,60,0.3333,0.1667,6,0.1358,0,0.0,1.0,0.6429,0.913,0.625,0.8078
34544,14,0.6667,0.6667,5,0.6737,1,1.0,0.2564,0.2857,0.0435,0.125,0.0622


In [12]:
# Scale numerical columns in x_test

x_test[numerical] = scaler.transform(x_test[numerical])
x_test.head()

Unnamed: 0,Location,Rooms,Bathrooms,Property Type,Size,Furnishing,Store Rooms,Places of Worship,Schools,Hospitals,Malls,Restaurants
41944,51,0.5,0.6667,2,0.7418,0,0.5,0.5641,0.5357,0.2609,0.375,0.4422
22395,49,0.3333,0.1667,6,0.1204,2,0.0,0.8205,0.4643,1.0,0.25,0.4367
14960,20,0.5,0.6667,5,0.8796,2,0.5,0.1795,0.1429,0.1739,0.125,0.1633
41012,12,0.3333,0.3333,2,0.4303,1,0.5,0.5641,0.6607,0.1739,0.5,0.4733
36204,0,0.3333,0.3333,6,0.1642,0,0.0,0.7692,0.9286,0.8261,1.0,0.9078


## Modelling

### XGBoost

In [15]:
# Grid Search for XGBoost 

XGB = XGBRegressor(random_state=42)

param_grid = {'nthread':[4], 
              'objective':['reg:squarederror'],
              'learning_rate': [0.03, 0.05, 0.07], 
              'max_depth': [10, 15, 20],
              'min_child_weight': [1, 3, 5],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [100, 300, 500]}

XGB_model = model_selection.GridSearchCV(
estimator=XGB,
param_grid=param_grid,
verbose=True,
n_jobs=-1,
cv=3
)

XGB_model.fit(x_train, y_train)
print(f"Best score: {XGB_model.best_score_}")
print("Best parameters set:")
best_parameters = XGB_model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print(f"\t{param_name}: {best_parameters[param_name]}")

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best score: 0.9021283998632486
Best parameters set:
	colsample_bytree: 0.7
	learning_rate: 0.03
	max_depth: 20
	min_child_weight: 3
	n_estimators: 300
	nthread: 4
	objective: reg:squarederror
	subsample: 0.7


In [13]:
# Build XGBoost model

XGB_model = XGBRegressor(
    random_state=42,
    colsample_bytree=0.7,
    learning_rate=0.03,
    max_depth=20,
    min_child_weight=3,
    n_estimators=300,
    nthread=4,
    objective='reg:squarederror',
    subsample=0.7
)

XGB_model.fit(x_train, y_train)

In [14]:
# Generate predictions

XGB_pred = XGB_model.predict(x_test)
print(XGB_pred[:10])

[3136117.    487180.3  2311325.   1539439.1  1105040.8   593030.2
  471425.5   557929.3  2502789.    678362.06]


In [15]:
# Find metrics
from sklearn import metrics

XGB_metrics = pd.DataFrame ({'Metric':
['MAE',
'MSE',
'RMSE',
'R-Squared'], 'XGBoost':
[metrics.mean_absolute_error(y_test, XGB_pred),
metrics.mean_squared_error (y_test, XGB_pred),
np.sqrt (metrics.mean_squared_error (y_test, XGB_pred)),
metrics.explained_variance_score (y_test, XGB_pred)]})
XGB_metrics

Unnamed: 0,Metric,XGBoost
0,MAE,149206.8041
1,MSE,96963808824.8168
2,RMSE,311390.1232
3,R-Squared,0.9104


In [19]:
from sklearn import metrics

model = XGBRegressor()
model.fit(x_train, y_train)
pred = model.predict(x_test)
metrics = pd.DataFrame ({'Metric':
['MAE',
'MSE',
'RMSE',
'R-Squared'], 'XGBoost':
[metrics.mean_absolute_error(y_test, pred),
metrics.mean_squared_error (y_test, pred),
np.sqrt (metrics.mean_squared_error (y_test, pred)),
metrics.explained_variance_score (y_test, pred)]})
print(metrics)

      Metric           XGBoost
0        MAE       195916.7449
1        MSE 124007861705.4727
2       RMSE       352147.4999
3  R-Squared            0.8854


### Random Forest

In [None]:
# Grid Search for Random Forest

RF = RandomForestRegressor(random_state=42)

param_grid = {
"n_estimators": [200, 500, 1000],
'max_features': [4, 7, 10],
"max_depth": [50, 100],
'min_samples_leaf': [1, 3],
'min_samples_split': [5, 10]
}

RF_model = model_selection.GridSearchCV(
estimator=RF,
param_grid=param_grid,
verbose=10,
n_jobs=-1,
cv=3
)

RF_model.fit(x_train, y_train)
print(f"Best score: {RF_model.best_score_}")
print("Best parameters set:")
best_parameters = RF_model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print(f"\t{param_name}: {best_parameters[param_name]}")

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [None]:
# Build Random Forest model

RF_model = RandomForestRegressor(
    random_state=42,
    max_depth=50,
    max_features=10,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=200
)
RF_model.fit(x_train,y_train)

In [None]:
# Generate predictions

RF_pred = RF_model.predict(x_test)
print(RF_pred[:10])

In [None]:
# Find metrics
from sklearn import metrics

RF_metrics = pd.DataFrame ({'Metric':
['MAE',
'MSE',
'RMSE',
'R-Squared'], 'Random Forest':
[metrics.mean_absolute_error(y_test, RF_pred),
metrics.mean_squared_error (y_test, RF_pred),
np.sqrt (metrics.mean_squared_error (y_test, RF_pred)),
metrics.explained_variance_score (y_test, RF_pred)]}).round(3)
RF_metrics

In [None]:
from sklearn import metrics

a = RandomForestRegressor(random_state=42)
a.fit(x_train, y_train)
b = a.predict(x_test)
c = pd.DataFrame ({'Metric':
['MAE',
'MSE',
'RMSE',
'R-Squared'], 'Value':
[metrics.mean_absolute_error (y_test, b),
metrics.mean_squared_error (y_test, b),
np.sqrt (metrics.mean_squared_error (y_test, b)),
metrics.explained_variance_score (y_test, b)]}).round(3)
print(c)

In [None]:
# Set the color palette to 'rocket'
rocket_colors = sns.color_palette('rocket')

# Select the color at index 4
col1 = rocket_colors[5]
col2 = rocket_colors[1]

In [None]:
# Create dataframe for XGBoost and Random Forest metrics

metrics = pd.merge(XGB_metrics, RF_metrics, on='Metric', how='inner')
metrics.set_index('Metric', inplace=True)
metrics

In [None]:
# Plotting the four metrics using subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Set the overall title
fig.suptitle('Comparison between Metrics', fontsize=16)

# Plot MAE
axes[0, 0].bar(metrics.columns, metrics.loc['MAE'], color=[col1, col2])
axes[0, 0].set_title('MAE')
axes[0, 0].set_ylabel('Value')
for i, v in enumerate(metrics.loc['MAE']):
    axes[0, 0].text(i, v, str(v), ha='center', va='bottom')

# Plot MSE
axes[0, 1].bar(metrics.columns, metrics.loc['MSE'], color=[col1, col2])
axes[0, 1].set_title('MSE')
axes[0, 1].set_ylabel('Value')
for i, v in enumerate(metrics.loc['MSE']):
    axes[0, 1].text(i, v, str(v), ha='center', va='bottom')

# Plot RMSE
axes[1, 0].bar(metrics.columns, metrics.loc['RMSE'], color=[col1, col2])
axes[1, 0].set_title('RMSE')
axes[1, 0].set_ylabel('Value')
for i, v in enumerate(metrics.loc['RMSE']):
    axes[1, 0].text(i, v, str(v), ha='center', va='bottom')

# Plot R-Squared
axes[1, 1].bar(metrics.columns, metrics.loc['R-Squared'], color=[col1, col2])
axes[1, 1].set_title('R-Squared')
axes[1, 1].set_ylabel('Value')
for i, v in enumerate(metrics.loc['R-Squared']):
    axes[1, 1].text(i, v, str(v), ha='center', va='bottom')

# Adjust spacing between subplots
plt.tight_layout()

# Show the plots
plt.show()

In [None]:
# Get the feature importance scores from the trained XGBoost model
feature_importance = XGB_model.feature_importances_

# Get the names of the columns from your dataset
column_names = ['Location', 'Rooms', 'Bathrooms', 'Property type',\
                'Size', 'Furnishing', 'Store Rooms', 'Places of Worship',\
                'Schools', 'Hospitals', 'Malls', 'Restaurants']

# Create a DataFrame to store the feature importance scores along with the column names
feature_importance_df = pd.DataFrame({'Feature': column_names, 'Importance': feature_importance})

# Sort the DataFrame in descending order of importance scores
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot the feature importance
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature', palette='rocket_r')
for i, v in enumerate(feature_importance_df['Importance']):
    plt.text(v, i, str(round(v, 4)), va='center')
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

In [None]:
d = [['ampang', 5, 3, 'Condominium', 1900, 'Partly Furnished',\
    0, 53, 24, 20, 5, 813]]

j = pd.DataFrame(d, columns=['Location', 'Rooms', 'Bathrooms', 'Property Type',\
    'Size', 'Furnishing', 'Store Rooms', 'Places of Worship', 'Schools',\
    'Hospitals', 'Malls', 'Restaurants'])
j

In [None]:
j[numerical] = scaler.transform(j[numerical])
j

In [None]:
j['Location'] = le_location.transform(j['Location'])
j['Property Type'] = le_proptype.transform(j['Property Type'])
j['Furnishing'] = le_furnishing.transform(j['Furnishing'])

In [None]:
j

In [None]:
XGB_model.predict(j)

In [None]:
# Save XGBoost model, Label Encoders and MinMaxScaler

joblib.dump(XGB_model, 'model')
joblib.dump(le_location, 'le_location')
joblib.dump(le_proptype, 'le_proptype')
joblib.dump(le_furnishing, 'le_furnishing')
joblib.dump(scaler, 'scaler')

In [None]:
#data = {
    #'model': XGB_model,
    #'le_location': le_location,
    #'le_proptype': le_proptype,
    #'le_furnishing': le_furnishing
#}

#with open('saved_steps.pkl', 'wb') as file:
    #pickle.dump(data, file)