In [5]:
#Import packages
import pandas as pd
import requests
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_absolute_percentage_error,explained_variance_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
import statsmodels as sm
import statsmodels.api as smf  
from statsmodels.formula.api import ols #formula guide: https://www.statsmodels.org/dev/examples/notebooks/generated/formulas.html

In [71]:
#Global Values

#resale fp
RSF_fp="Relevant_datasets/RSF90onw_wAddress.csv"

To predict a resale flat's price transaction in 2014, i will use: 
1. A ridge regression model
A ridge regression shrinks the coefficients of variables rather than remove them (in a lasso). Given that i am expecting a lot of dummy variables for my town variable, i can't remove those variables, but i am also concerned about having too many variables resulting in an overfitted model which could worsen the variance of the model and hinder its performance when met with new data.

2. A Decision Tree regression model

3. A Random forest tree regression model


I will also assume that all i can use is town,flat_age and flat_type

### Data Cleaning

In [100]:
#Import Resale Flat data
RSF=pd.read_csv(RSF_fp,index_col=0,low_memory=False)

In [101]:
##Data manipulation


# To allow faster training of model , to utilise only 3 years of data
RSF=RSF.loc[(RSF['Year_x']<2020)&(RSF['Year_x']>=2017)]

#Create a variable for flat age
RSF['flat_age']=RSF['Year_x']-RSF['lease_commence_date']

##Different dataset was added after coding, town_x was the name of the variable for town, hence will rename town to town_x for simplicity sake
RSF=RSF.rename({'town':'town_x'},axis=1)

#to assume a linear relationship btw flat type and resale price.
#assume executive>multi-gen>5 room in general in terms of resale price
RSF['flat_type_n']=0
RSF.loc[RSF['flat_type']=='1 ROOM','flat_type_n']=1
RSF.loc[RSF['flat_type']=='2 ROOM','flat_type_n']=2
RSF.loc[RSF['flat_type']=='3 ROOM','flat_type_n']=3
RSF.loc[RSF['flat_type']=='4 ROOM','flat_type_n']=4
RSF.loc[RSF['flat_type']=='5 ROOM','flat_type_n']=5
RSF.loc[RSF['flat_type']=='MULTI-GENERATION','flat_type_n']=6
RSF.loc[RSF['flat_type']=='MULTI GENERATION','flat_type_n']=6
RSF.loc[RSF['flat_type']=='EXECUTIVE','flat_type_n']=7
print("no. of unchanged rows:",RSF.loc[RSF['flat_type_n']==0]['flat_type_n'].count())


no. of unchanged rows: 0


In [102]:
#create a list of unique towns for easy reference
town_list=list(RSF['town_x'].unique())
#remove one town (e.g.Ang_moh_kio) from list before adding to formula to prevent multicollinearity (1 town needs to be absent during regression)
RSF_dummies = pd.get_dummies(RSF,columns=['town_x'],drop_first=True)

#create a list of column names for each dummy for ease of reference
town_list = ['town_x_' + x for x in town_list]
#pd.get_dummies will remove the first column, AKA 'town_ANG MO KIO'
town_list.remove('town_x_ANG MO KIO')
#create a dummy of towns


In [103]:
RSF_dummies.columns

Index(['month', 'flat_type', 'block', 'street_name_x', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price',
       'remaining_lease', 'street_name_y', 'Lat', 'Lon', 'street_names_oldver',
       'Coordinates', 'rLat', 'rLon', 'Cl_MRT', 'Distance_MRT_km', 'Year_x',
       'Quarter_x', 'Yr_Qt', 'Year_y', 'Quarter_y', 'Index',
       'Real_resale_price', 'flat_age', 'flat_type_n', 'town_x_BEDOK',
       'town_x_BISHAN', 'town_x_BUKIT BATOK', 'town_x_BUKIT MERAH',
       'town_x_BUKIT PANJANG', 'town_x_BUKIT TIMAH', 'town_x_CENTRAL AREA',
       'town_x_CHOA CHU KANG', 'town_x_CLEMENTI', 'town_x_GEYLANG',
       'town_x_HOUGANG', 'town_x_JURONG EAST', 'town_x_JURONG WEST',
       'town_x_KALLANG/WHAMPOA', 'town_x_MARINE PARADE', 'town_x_PASIR RIS',
       'town_x_PUNGGOL', 'town_x_QUEENSTOWN', 'town_x_SEMBAWANG',
       'town_x_SENGKANG', 'town_x_SERANGOON', 'town_x_TAMPINES',
       'town_x_TOA PAYOH', 'town_x_WOODLANDS', 'town_x_YISHUN'],
      dt

In [104]:
#convert boolean to int for dummy variables to ease regression
for i in town_list:
    RSF_dummies[i] = RSF_dummies[i].astype(int)

In [105]:
#creating strings of the list of towns to make copying of columns easier
for i in town_list:
    str_cols= "'"+"','".join(town_list)+"'"

In [106]:
RSF_dummies.columns

Index(['month', 'flat_type', 'block', 'street_name_x', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price',
       'remaining_lease', 'street_name_y', 'Lat', 'Lon', 'street_names_oldver',
       'Coordinates', 'rLat', 'rLon', 'Cl_MRT', 'Distance_MRT_km', 'Year_x',
       'Quarter_x', 'Yr_Qt', 'Year_y', 'Quarter_y', 'Index',
       'Real_resale_price', 'flat_age', 'flat_type_n', 'town_x_BEDOK',
       'town_x_BISHAN', 'town_x_BUKIT BATOK', 'town_x_BUKIT MERAH',
       'town_x_BUKIT PANJANG', 'town_x_BUKIT TIMAH', 'town_x_CENTRAL AREA',
       'town_x_CHOA CHU KANG', 'town_x_CLEMENTI', 'town_x_GEYLANG',
       'town_x_HOUGANG', 'town_x_JURONG EAST', 'town_x_JURONG WEST',
       'town_x_KALLANG/WHAMPOA', 'town_x_MARINE PARADE', 'town_x_PASIR RIS',
       'town_x_PUNGGOL', 'town_x_QUEENSTOWN', 'town_x_SEMBAWANG',
       'town_x_SENGKANG', 'town_x_SERANGOON', 'town_x_TAMPINES',
       'town_x_TOA PAYOH', 'town_x_WOODLANDS', 'town_x_YISHUN'],
      dt

In [107]:
RSF_ML=RSF_dummies[['Real_resale_price','flat_type_n','flat_age','town_x_BEDOK','town_x_BISHAN','town_x_BUKIT BATOK','town_x_BUKIT MERAH','town_x_BUKIT PANJANG','town_x_BUKIT TIMAH','town_x_CENTRAL AREA','town_x_CHOA CHU KANG','town_x_CLEMENTI','town_x_GEYLANG','town_x_HOUGANG','town_x_JURONG EAST','town_x_JURONG WEST','town_x_KALLANG/WHAMPOA','town_x_MARINE PARADE','town_x_PASIR RIS','town_x_PUNGGOL','town_x_QUEENSTOWN','town_x_SEMBAWANG','town_x_SENGKANG','town_x_SERANGOON','town_x_TAMPINES','town_x_TOA PAYOH','town_x_WOODLANDS','town_x_YISHUN']]

In [116]:
#Create a train and test set
X_train, X_test, y_train, y_test = train_test_split(RSF_ML.drop('Real_resale_price', axis=1), RSF_ML['Real_resale_price'], test_size=0.25, random_state=100)

#normalize data variance and mean with standard scaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)


### Ridge Regression Model

Was deciding between mean square error and mean absolute error, but a trial of mean square error proved that it is probably an inferior metric for evaluating the performance of resale flat price which has a high value. Errors within MSE would hence be exponentially increase, hence opted for MAE.

In [120]:
#create 20 evenly spaced integers from 1-100
lin_space=np.linspace(1, 100, 10)

#determine parameters to test n_estimators, min_samples_leaf

params=[{'alpha': lin_space}]

scorer=['neg_mean_absolute_error','neg_mean_absolute_percentage_error','explained_variance']

RG_GS = GridSearchCV(Ridge(),param_grid=params,
                     scoring=scorer,
                     refit='neg_mean_absolute_error',
                     cv=5,n_jobs=-1)

RG_GS.fit(X_train, y_train)
RG_GS.best_params_
RG_GS.score(X_train, y_train)
RG_GS_best_estimator = RG_GS.best_estimator_

print(f"Best estimator has alpha value of {RG_GS.best_params_['alpha']}  estimators")
y_pred = RG_GS_best_estimator.predict(X_test)

print("mean absolute error: ",mean_absolute_error(y_test, y_pred))
print("mean absolute percentage error: ",mean_absolute_percentage_error(y_test, y_pred))
print("explained variance score: ",explained_variance_score(y_test, y_pred))

Best estimator has alpha value of 100.0  estimators
mean absolute error:  76156.41561282442
mean absolute percentage error:  0.12746701790122178
explained variance score:  0.7818842375261099


### Decision tree Model

In [122]:
#create 10 evenly spaced integers from 1-10
lin_space=np.linspace(1, 10, 4)
lin_space=[int(x) for x in lin_space]

#determine parameters to test n_estimators, min_samples_leaf

params=[{'min_samples_leaf': [1,10,20],
         'criterion':['squared_error', 'absolute_error'],
         'min_samples_split': lin_space}]

scorer=['neg_mean_absolute_error','neg_mean_absolute_percentage_error','explained_variance']

DTM_GS = GridSearchCV(DecisionTreeRegressor(),
                      param_grid=params,
                      scoring=scorer,
                      refit='neg_mean_absolute_error',
                      cv=5,
                      n_jobs=-1)

DTM_GS.fit(X_train, y_train)
DTM_GS.best_params_
DTM_GS.score(X_train, y_train)  
DTM_GS_best_estimator = DTM_GS.best_estimator_

print(f"Best estimator has {DTM_GS.best_params_['min_samples_leaf']} minimum samples per leaf,uses {DTM_GS.best_params_['criterion']} criterion and {DTM_GS.best_params_['min_samples_split']} splits")
y_pred = DTM_GS_best_estimator.predict(X_test)
mae=mean_absolute_error(y_test, y_pred)
print("mean absolute error: ",mean_absolute_error(y_test, y_pred))
print("mean absolute percentage error: ",mean_absolute_percentage_error(y_test, y_pred))
print("explained variance score: ",explained_variance_score(y_test, y_pred))

30 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\marcu\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\marcu\anaconda3\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\marcu\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\marcu\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamete

Best estimator has 1 minimum samples per leaf,uses absolute_error criterion and 7 splits
mean absolute error:  48704.960546689676
mean absolute percentage error:  0.07811538633735214
explained variance score:  0.8937995635690524


### Random forest tree Model

In [124]:
#create 20 evenly spaced integers from 1-100
lin_space=np.linspace(1, 100, 10)
lin_space=[int(x) for x in lin_space]

#determine parameters to test n_estimators, min_samples_leaf
params=[{'n_estimators': lin_space,
         'min_samples_leaf': [1,5,10,20]}]

RFM_GS = GridSearchCV(RandomForestRegressor(),
                      param_grid=params,
                      scoring='neg_mean_absolute_error',
                      cv=5,n_jobs=-1)

RFM_GS.fit(X_train, y_train)
RFM_GS.best_params_
RFM_GS.score(X_train, y_train)
RFM_GS_best_estimator = RFM_GS.best_estimator_

print(f"Best estimator has {RFM_GS.best_params_['min_samples_leaf']} minimum samples per leaf and {RFM_GS.best_params_['n_estimators']} estimators")
print()
y_pred = RFM_GS_best_estimator.predict(X_test)
mae=mean_absolute_error(y_test, y_pred)
print("mean absolute error: ",mean_absolute_error(y_test, y_pred))
print("mean absolute percentage error: ",mean_absolute_percentage_error(y_test, y_pred))
print("explained variance score: ",explained_variance_score(y_test, y_pred))

Best estimator has 1 minimum samples per leaf and 78 estimators

mean absolute error:  49000.51294511904
mean absolute percentage error:  0.07986166840949911
explained variance score:  0.898599688214107
