##### References
https://www.teoalida.com/singapore/hdbflattypes/ : HDB Flat Information

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sb

##### Data Preparation
* The dataset is spread across 5 CSV files; only the three files covering March 2012 onwards are loaded below

In [None]:
# The two older approval-date extracts are commented out and therefore not loaded:
# df_90to99 = pd.read_csv('../DataSets/SingaporeResaleFlatData/resale-flat-prices-based-on-approval-date-1990-1999.csv')
# df_00to12 = pd.read_csv('../DataSets/SingaporeResaleFlatData/resale-flat-prices-based-on-approval-date-2000-feb-2012.csv')

# Read the three registration-date extracts in chronological order.
df_12to14, df_1516, df_17 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../DataSets/1_SingaporeResaleFlatData/resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv',
        '../DataSets/1_SingaporeResaleFlatData/resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv',
        '../DataSets/1_SingaporeResaleFlatData/resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv',
    )
)


In [None]:
df_test = pd.concat([df_12to14,df_1516,df_17], axis = 0)

In [None]:
df_test[df_test['remaining_lease'].notnull()]['remaining_lease']

In [None]:
[]

In [None]:
# df_00to12
# df_90to99
df = pd.concat([df_12to14,df_1516,df_17], axis = 0)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df['month'] = pd.to_datetime(df['month'])

In [None]:
df.isnull().sum()

##### HDB Flats in Singapore have a lease of 99 years
Therefore, remaining_lease = 99 - (year - lease_commence_date)

In [None]:
df.rename(columns = {'month':'date'}, inplace = True)

In [None]:
df['year'] = [i.year for i in df['date']]
df['month'] = [i.month for i in df['date']]

In [None]:
df.drop('date', axis = 1, inplace = True)
df.reset_index(inplace = True, drop = True)

In [None]:
df.head()

##### Column 'remaining_lease' contains some values in the format '61 years 04 months'; these need to be standardized to whole years

In [None]:
# Standardise `remaining_lease` to whole years, row by row.
#
# The previous loop assigned one scalar to *every* non-null row on each iteration,
# so all of them ended up with the same value (the last one parsed). Here each row
# is parsed on its own: the leading run of digits is kept, so both 70 and
# '61 years 04 months' become a plain year count.
mask = df['remaining_lease'].notnull()
lease_years = (
    df.loc[mask, 'remaining_lease']
    .astype(str)                                   # ints and strings are handled uniformly
    .str.extract(r'^\s*(\d+)', expand = False)     # leading digits = years
)
# Values without leading digits become NaN, i.e. they are treated as missing.
# `.to_numpy()` assigns by position, so the result does not depend on index alignment.
df.loc[mask, 'remaining_lease'] = pd.to_numeric(lease_years).to_numpy()

In [None]:
99 - (df.loc[0]['year'] - df.loc[0]['lease_commence_date'])

In [None]:
for index in df[df['remaining_lease'].isna()].index:
    df.loc[index, 'remaining_lease'] = 99 - (int(df.loc[index]['year'] - df.loc[index]['lease_commence_date']))

In [None]:
df.isnull().sum()

##### Problem Definition
Find the features that affect resale price and attempt to predict resale prices from these features

##### Features Selected
* town
* flat_type
* storey_range
* floor_area_sqm
* flat_model
* remaining_lease

##### Label
* Resale Price

In [None]:
df.columns

In [None]:
df_subset = df[['town', 'flat_type', 'storey_range',
       'floor_area_sqm', 'flat_model',
       'remaining_lease','resale_price']]

In [None]:
df_subset['remaining_lease'] = df_subset['remaining_lease'].astype('int64')

In [None]:
df_subset.head()

In [None]:
df_subset['town'].nunique()

In [None]:
df_subset['town'].unique()

##### Breaking down town into respective regions in Singapore

In [None]:
north_df = df_subset[df_subset['town'].isin(['ANG MO KIO','SEMBAWANG','WOODLANDS','YISHUN'])]
east_df = df_subset[df_subset['town'].isin(['BEDOK','GEYLANG','HOUGANG','PASIR RIS','PUNGGOL','SENGKANG','SERANGOON','TAMPINES'])]
south_df = df_subset[df_subset['town'].isin(['QUEENSTOWN',])]
west_df = df_subset[df_subset['town'].isin(['BUKIT BATOK','BUKIT PANJANG','CHOA CHU KANG','CLEMENTI','JURONG EAST','JURONG WEST'])]
central_df = df_subset[df_subset['town'].isin(['BISHAN','BUKIT MERAH','BUKIT TIMAH','CENTRAL AREA','KALLANG/WHAMPOA','MARINE PARADE','TOA PAYOH'])]

In [None]:
print(north_df.shape[0] / df_subset.shape[0] * 100)
print(east_df.shape[0] / df_subset.shape[0] * 100)
print(south_df.shape[0] / df_subset.shape[0] * 100)
print(west_df.shape[0] / df_subset.shape[0] * 100)
print(central_df.shape[0] / df_subset.shape[0] * 100)

##### Analyzing East Regions

In [None]:
east_df.head()

In [None]:
plt.figure(figsize = (15,10))
sb.boxplot(x = 'resale_price', y = 'town', data = east_df,
          order = east_df.groupby('town').median()['resale_price'].sort_values().index,
          palette = 'autumn_r')
plt.title('East Region Resale Prices by Town', fontsize = 15)
plt.xlabel('Town', fontsize = 15)
plt.ylabel('Resale Price', fontsize = 15)
plt.xticks(fontsize = 12);

In [None]:
plt.figure(figsize = (15,10))
sb.boxplot(x = 'resale_price', y = 'flat_type', data = east_df,
          palette = 'autumn_r')

plt.title('East Region Resale Prices by Flat Type', fontsize = 15)
plt.xlabel('Flat Type', fontsize = 15)
plt.ylabel('Resale Price', fontsize = 15)
plt.xticks(fontsize = 12);

In [None]:
east_df.corr()

In [None]:
# plt.figure(figsize = (5,4))
sb.heatmap(east_df.corr(),
          cmap = 'Blues',
          annot = True,
          linewidth = 1,
          linecolor = 'black')

In [None]:
plt.figure(figsize = (8,6))
sb.scatterplot(x = 'floor_area_sqm', y = 'resale_price', data = east_df)

plt.title('Scatterplot - Resale Price vs Floor Area')
plt.xlabel('Floor Area', fontsize = 12)
plt.ylabel('Resale Price', fontsize = 12);

### Linear Regression for Floor Area and Resale Price

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
east_df.head(2)

In [None]:
x = east_df.loc[:, ['floor_area_sqm']].values

In [None]:
x.shape

In [None]:
y = east_df.loc[:, ['resale_price']]

In [None]:
y.shape

In [None]:
reg = LinearRegression(fit_intercept = True)

In [None]:
reg.fit(x, y)

In [None]:
east_pred_df = east_df[['floor_area_sqm', 'resale_price']]

In [None]:
east_pred_df['pred_price'] = reg.predict(east_pred_df['floor_area_sqm'].values.reshape(-1,1))

In [None]:
reg.score(x,y)

In [None]:
reg.coef_[0]

In [None]:
reg.intercept_

In [None]:
print("formula : y = {:.2f}x + {:.2f}".format(reg.coef_[0][0],reg.intercept_[0]))

In [None]:
fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (10,7));

ax.scatter(x, y, c = 'black');
ax.plot(x, reg.predict(x), c = 'red', linewidth = 3)
ax.grid(True,
       axis = 'both',
       zorder = 0,
       linestyle = ':',
       c = 'k')

ax.tick_params(labelsize = 18)
ax.set_xlabel('Floor Area', fontsize = 24)
ax.set_ylabel('Resale Price', fontsize = 24)
ax.set_title('Linear Regression Model')
fig.tight_layout()

In [None]:
east_df[['floor_area_sqm', 'resale_price']].corr()

In [None]:
east_df.head()

# Multivariate Linear Regression
* Using Floor Area and Remaining Lease to predict Resale Price

In [None]:
east_df_multivariate = east_df[['floor_area_sqm', 'remaining_lease', 'resale_price']]

In [None]:
east_df_multivariate.corr()

In [None]:
sb.scatterplot(x = "remaining_lease", y = "resale_price", data = east_df_multivariate)

In [None]:
sb.set_style('darkgrid')

In [None]:
len(g.axes[0])

In [None]:
g = sb.pairplot(east_df_multivariate, x_vars = ['floor_area_sqm', 'remaining_lease'], y_vars = 'resale_price', height = 7, aspect = 1)
plt.subplots_adjust(wspace = 0.1)
xlabel = ['Floor Area', 'Remaining Lease']
fontsize = 15
g.axes[0][0].set_xlabel('Floor Area', fontsize = fontsize)

for i in range(len(g.axes[0])):
    g.axes[0][i].set_xlabel(xlabel[i], fontsize = fontsize)
    g.axes[0][i].set_ylabel('Resale Price', fontsize = fontsize)
    g.axes[0][i].tick_params(axis = 'x', labelsize = 14)
    g.axes[0][i].tick_params(axis = 'y', labelsize = 14)

In [None]:
x = east_df_multivariate[['floor_area_sqm', 'remaining_lease']]

In [None]:
x.shape

In [None]:
y = east_df_multivariate['resale_price'].values

In [None]:
y.shape

In [None]:
m_reg = LinearRegression(fit_intercept = True)

In [None]:
m_reg.fit(x, y)

In [None]:
m_reg.coef_ # Slope for 2 variables

In [None]:
m_reg.intercept_ # Y intercept

In [None]:
print("formula : y = {:.2f}x1 + {:.2f}x2 + {:.2f}".format(m_reg.coef_[0],m_reg.coef_[1],m_reg.intercept_))

In [None]:
m_reg.score(x,y)

In [None]:
east_df_multivariate['pred_prices'] = m_reg.predict(x)

In [None]:
east_df_multivariate.head(20)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
mean_squared_error(y, east_df_multivariate['pred_prices'].values)

In [None]:
import math

In [None]:
math.sqrt(mean_squared_error(y, east_df_multivariate['pred_prices'].values))

### Regression Tree to predict Resale Price


##### variables to use
* Town
* Flat_type
* storey_range
* Floor_Area
* Flat_Model
* Remaining_lease
* Resale Price

In [None]:
east_df.columns

In [None]:
X = east_df[['town', 'flat_type', 'storey_range', 'floor_area_sqm', 'flat_model', 'remaining_lease']]
y = east_df['resale_price']

In [None]:
X.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le_town = LabelEncoder()
le_flat_type = LabelEncoder()
le_storey_range = LabelEncoder()
le_flat_model = LabelEncoder()

In [None]:
le_town.fit(X['town'])
le_flat_type.fit(X['flat_type'])
le_storey_range.fit(X['storey_range'])
le_flat_model.fit(X['flat_model'])

In [None]:
le_town_mapping = dict(zip(le_town.classes_,le_town.transform(le_town.classes_)))
le_flat_type_mapping = dict(zip(le_flat_type.classes_,le_flat_type.transform(le_flat_type.classes_)))
le_storey_range_mapping = dict(zip(le_storey_range.classes_,le_storey_range.transform(le_storey_range.classes_)))
le_flat_model_mapping = dict(zip(le_flat_model.classes_,le_flat_model.transform(le_flat_model.classes_)))

In [None]:
le_flat_model_mapping

In [None]:
X['le_town'] = le_town.fit_transform(X['town'])
X['le_flat_type'] = le_flat_type.fit_transform(X['flat_type'])
X['le_storey_range'] = le_storey_range.fit_transform(X['storey_range'])
X['le_flat_model'] = le_flat_model.fit_transform(X['flat_model'])

In [None]:
X.head()

In [None]:
X.drop(columns = ['town', 'flat_type', 'storey_range', 'flat_model'], inplace = True)

In [None]:
X.head()

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

In [None]:
# Fixed seed so the split, and every score derived from it, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

### Obtaining Optimum Depth

In [None]:
depth = 15      # exclusive upper bound: depths 1..14 are evaluated
accuracy = []   # test-set score (R^2 for a regressor) per depth, in sweep order

# The loop variable has its own name so it no longer overwrites `depth`.
for candidate_depth in range(1, depth):
    # Seeded so that repeated runs give the same curve.
    rgs = DecisionTreeRegressor(max_depth = candidate_depth, random_state = 42)
    rgs.fit(X_train, y_train)
    accuracy.append(rgs.score(X_test, y_test))
    

In [None]:
fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (10,7))
# x values are derived from the number of recorded scores (the sweep starts at
# depth 1), so the plot stays in sync if the sweep range changes.
ax.plot(range(1, len(accuracy) + 1),
       accuracy,
       lw = 2,
       color = 'k')

ax.grid(True,
       axis = 'both',
       zorder = 0,
       linestyle = ':',
       color = 'k')

ax.tick_params(labelsize = 16)
ax.set_xlabel('max_depth', fontsize = 15)
# DecisionTreeRegressor.score returns R^2, not classification accuracy.
ax.set_ylabel('R² score (test set)', fontsize = 15)
fig.tight_layout()

In [None]:
rgs = DecisionTreeRegressor(max_depth = 15, min_samples_leaf = 0.1)

In [None]:
rgs.fit(X_train, y_train)

### Assessing Prediction

In [None]:
check = X_test.copy()

In [None]:
check['prediction'] = rgs.predict(X_test)

In [None]:
check['actual'] = y_test

In [None]:
check.head(20)

In [None]:
check['difference'] = round((abs(check['actual'] - check['prediction']) / check['actual']) * 100, 2)

In [None]:
check[check['difference'] < 20]

In [None]:
rgs.score(X_test,  y_test)

##### Accuracy may not be a good metric for analyzing regression trees (`score` reports R² for a regressor), so an error metric (MSE) is computed below

In [None]:
from sklearn.metrics import mean_squared_error as MSE

In [None]:
MSE(y_test, rgs.predict(X_test))

In [None]:
check

### Visualizing Tree

In [None]:
from sklearn import tree

In [None]:
fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize = (4,4), dpi = 300)

tree.plot_tree(rgs);

In [None]:
X.head()