## 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [2]:
pd.set_option('display.max_rows', 1000); pd.set_option('display.max_columns', 1000); pd.set_option('display.width', 1000)

## 2. Functions

In [None]:
# Random Forest
def train_model(model, x_train, y_train, x_test, y_test, param_grid):
    # Fit model
    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
    grid_search.fit(x_train, y_train)
    # Generate predictions
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(x_test)
    # MSE Score
    mse = mean_squared_error(y_test, y_pred)
    # RMSE Score
    rmse = np.sqrt(mse)
    # R^2 Score
    r2 = r2_score(y_test, y_pred)
    
    print(f'Root Mean Squared Error (RMSE): {rmse}, R² Score = {r2}')

    return best_model

## 3. Datasets

In [4]:
# Load approval date datasets
tmp_path = "./Kaggle_HDB/" 
approval_files = [
    tmp_path + str(x) for x in os.listdir(tmp_path) if \
        x.startswith("resale-flat-prices-based-on-approval-date-")] 

tmp_approval_data = [pd.read_csv(tmp_file) for tmp_file in approval_files ]
tmp_approval_data = pd.concat(tmp_approval_data)

In [5]:
tmp_approval_data.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
0,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0
1,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,IMPROVED,1977,6000.0
2,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,8000.0
3,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,IMPROVED,1977,6000.0
4,1990-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,NEW GENERATION,1976,47200.0


In [6]:
# Load registration date datasets
tmp_path = "./Kaggle_HDB/" 
registration_files = [
    tmp_path + str(x) for x in os.listdir(tmp_path) if \
        x.startswith("resale-flat-prices-based-on-registration-date-")] 

tmp_registration_data = [pd.read_csv(tmp_file) for tmp_file in registration_files ]
tmp_registration_data = pd.concat(tmp_registration_data)

In [7]:
tmp_registration_data.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
0,2015-01,ANG MO KIO,3 ROOM,174,ANG MO KIO AVE 4,07 TO 09,60.0,Improved,1986,70,255000.0
1,2015-01,ANG MO KIO,3 ROOM,541,ANG MO KIO AVE 10,01 TO 03,68.0,New Generation,1981,65,275000.0
2,2015-01,ANG MO KIO,3 ROOM,163,ANG MO KIO AVE 4,01 TO 03,69.0,New Generation,1980,64,285000.0
3,2015-01,ANG MO KIO,3 ROOM,446,ANG MO KIO AVE 10,01 TO 03,68.0,New Generation,1979,63,290000.0
4,2015-01,ANG MO KIO,3 ROOM,557,ANG MO KIO AVE 10,07 TO 09,68.0,New Generation,1980,64,290000.0


## 4. EDA

### a. Approval Datasets

In [8]:
# Check for missing data and col datatypes
tmp_approval_data.info() # No missing values

<class 'pandas.core.frame.DataFrame'>
Index: 656851 entries, 0 to 369650
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   month                656851 non-null  object 
 1   town                 656851 non-null  object 
 2   flat_type            656851 non-null  object 
 3   block                656851 non-null  object 
 4   street_name          656851 non-null  object 
 5   storey_range         656851 non-null  object 
 6   floor_area_sqm       656851 non-null  float64
 7   flat_model           656851 non-null  object 
 8   lease_commence_date  656851 non-null  int64  
 9   resale_price         656851 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 55.1+ MB


In [9]:
# Check for Labels - standardise flat_type labels MULTI GENERATION to MULTI-GENERATION
tmp_approval_data['flat_type'].unique()
tmp_approval_data['flat_type'] = tmp_approval_data['flat_type'].str.replace('MULTI GENERATION', 'MULTI-GENERATION')

In [10]:
# Check for Labels - standardise flat_model labels uppercase and lowercase
tmp_approval_data['flat_model'].unique()
tmp_approval_data['flat_model'] = tmp_approval_data['flat_model'].str.upper()

In [11]:
# Check for duplicates
tmp_approval_data.duplicated() # No duplicates

0         False
1         False
2         False
3         False
4         False
          ...  
369646    False
369647    False
369648    False
369649    False
369650    False
Length: 656851, dtype: bool

### b. Registration Datasets

In [12]:
# Check for missing data and col datatypes
tmp_registration_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 169730 entries, 0 to 52202
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   month                169730 non-null  object 
 1   town                 169730 non-null  object 
 2   flat_type            169730 non-null  object 
 3   block                169730 non-null  object 
 4   street_name          169730 non-null  object 
 5   storey_range         169730 non-null  object 
 6   floor_area_sqm       169730 non-null  float64
 7   flat_model           169730 non-null  object 
 8   lease_commence_date  169730 non-null  int64  
 9   remaining_lease      117527 non-null  object 
 10  resale_price         169730 non-null  float64
dtypes: float64(2), int64(1), object(8)
memory usage: 15.5+ MB


In [13]:
# There are missing values in remaining_lease col
# Best is to consult the department in charge or data provider for the missing values
# If can't, drop rows with missing values
tmp_registration_data = tmp_registration_data.dropna(how='any').reset_index(drop=True) #30.76% of the dataset has been removed

In [14]:
# Check for duplicates
tmp_registration_data.duplicated() # No duplicates

0         False
1         False
2         False
3         False
4         False
          ...  
117522    False
117523    False
117524    False
117525    False
117526    False
Length: 117527, dtype: bool

In [15]:
# Check for Labels - standardise flat_model labels to uppercase
tmp_registration_data['flat_model'].unique()
tmp_registration_data['flat_model'] = tmp_registration_data['flat_model'].str.upper()

## 5. Data Processing

Assumption 1: The file labelled 'approval date' follows the old naming convention while the current file is named 'registration date’.

Assumption 2: All categorical columns in the datasets contain the correct categories

### a. Find the total lease (years) for each flat in registration data dataset

In [16]:
# Convert month to datetime format and extract the month and year into new columns
registration_data = tmp_registration_data.copy(deep=True)
registration_data['month'] = pd.to_datetime(registration_data['month'])
registration_data['mm'] =  registration_data['month'].dt.month
registration_data['yy'] =  registration_data['month'].dt.year

In [17]:
#  Find the period (years) from lease commencement to stated month
registration_data['years_since_commence'] = registration_data['yy'] - registration_data['lease_commence_date']

# If remaining_lease col contains only numeric values, assume it as years and assign to remaining_lease_years col
# Else, contains words like 'years', extract the value and assign to remaining_lease_years col
registration_data['remaining_lease_years'] = pd.to_numeric(registration_data['remaining_lease'], errors='coerce')
registration_data['remaining_lease_years'] = registration_data['remaining_lease_years'].fillna(
    registration_data['remaining_lease'].str.extract('(\d+) years')[0].astype(float))

# If remaining_lease col contains words like 'months', extract the value and assign to remaining_lease_months col
# Else, assgin 0 to remaining_lease_months col
registration_data['remaining_lease_months'] = registration_data['remaining_lease'].str.extract('(\d+) months').astype(float)
registration_data['remaining_lease_months'] = registration_data['remaining_lease_months'].fillna(0)

# Find total lease in years
registration_data['remaining_lease_in_years'] = registration_data['remaining_lease_years'] + (registration_data['remaining_lease_months'] / 12)
registration_data['total_lease_in_years'] = registration_data['years_since_commence'] + registration_data['remaining_lease_in_years']

In [18]:
registration_data.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,mm,yy,years_since_commence,remaining_lease_years,remaining_lease_months,remaining_lease_in_years,total_lease_in_years
0,2015-01-01,ANG MO KIO,3 ROOM,174,ANG MO KIO AVE 4,07 TO 09,60.0,IMPROVED,1986,70,255000.0,1,2015,29,70.0,0.0,70.0,99.0
1,2015-01-01,ANG MO KIO,3 ROOM,541,ANG MO KIO AVE 10,01 TO 03,68.0,NEW GENERATION,1981,65,275000.0,1,2015,34,65.0,0.0,65.0,99.0
2,2015-01-01,ANG MO KIO,3 ROOM,163,ANG MO KIO AVE 4,01 TO 03,69.0,NEW GENERATION,1980,64,285000.0,1,2015,35,64.0,0.0,64.0,99.0
3,2015-01-01,ANG MO KIO,3 ROOM,446,ANG MO KIO AVE 10,01 TO 03,68.0,NEW GENERATION,1979,63,290000.0,1,2015,36,63.0,0.0,63.0,99.0
4,2015-01-01,ANG MO KIO,3 ROOM,557,ANG MO KIO AVE 10,07 TO 09,68.0,NEW GENERATION,1980,64,290000.0,1,2015,35,64.0,0.0,64.0,99.0


In [19]:
# Get the total lease (years) for each flat
total_lease = registration_data.groupby(['block', 'street_name','lease_commence_date'])['total_lease_in_years'].first()
total_lease = total_lease.reset_index()

In [20]:
total_lease.head()

Unnamed: 0,block,street_name,lease_commence_date,total_lease_in_years
0,1,BEACH RD,1979,99.0
1,1,BEDOK STH AVE 1,1976,98.0
2,1,CHAI CHEE RD,1983,99.0
3,1,CHANGI VILLAGE RD,1980,99.0
4,1,DELTA AVE,1983,99.0


### b. Find the total lease (years) for each flat in approval data dataset

In [21]:
# Convert month to datetime format and extract the month and year into new columns
approval_data = tmp_approval_data.copy(deep=True)
approval_data['month'] = pd.to_datetime(approval_data['month'])
approval_data['mm'] =  approval_data['month'].dt.month
approval_data['yy'] =  approval_data['month'].dt.year

In [22]:
# Get total lease for each flat
approval_data_merge  = approval_data.merge(total_lease, how='left', left_on=['block', 'street_name', 'lease_commence_date'],
                                     right_on=['block', 'street_name', 'lease_commence_date'])

In [23]:
# Check for missing values
approval_data_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656851 entries, 0 to 656850
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   month                 656851 non-null  datetime64[ns]
 1   town                  656851 non-null  object        
 2   flat_type             656851 non-null  object        
 3   block                 656851 non-null  object        
 4   street_name           656851 non-null  object        
 5   storey_range          656851 non-null  object        
 6   floor_area_sqm        656851 non-null  float64       
 7   flat_model            656851 non-null  object        
 8   lease_commence_date   656851 non-null  int64         
 9   resale_price          656851 non-null  float64       
 10  mm                    656851 non-null  int32         
 11  yy                    656851 non-null  int32         
 12  total_lease_in_years  636704 non-null  float64       
dtyp

In [24]:
# There are missing values in total_lease_in_years col
# Best is to consult the department in charge or data provider for the missing values
# If can't, drop rows with missing values
approval_data = approval_data_merge.dropna(how='any').reset_index(drop=True) #3.06% of the datset has been removed.

### c. Find the remaining lease (years) for each flat in approval data dataset

In [25]:
# Get remaining lease for each flat
approval_data['remaining_lease'] = approval_data['total_lease_in_years'] - approval_data['yy'] + approval_data['lease_commence_date']

In [26]:
approval_data.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,mm,yy,total_lease_in_years,remaining_lease
0,1990-01-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,NEW GENERATION,1976,47200.0,1,1990,98.0,84.0
1,1990-01-01,ANG MO KIO,3 ROOM,211,ANG MO KIO AVE 3,01 TO 03,67.0,NEW GENERATION,1977,46000.0,1,1990,99.0,86.0
2,1990-01-01,ANG MO KIO,3 ROOM,202,ANG MO KIO AVE 3,07 TO 09,67.0,NEW GENERATION,1977,42000.0,1,1990,99.0,86.0
3,1990-01-01,ANG MO KIO,3 ROOM,235,ANG MO KIO AVE 3,10 TO 12,67.0,NEW GENERATION,1977,38000.0,1,1990,99.0,86.0
4,1990-01-01,ANG MO KIO,3 ROOM,235,ANG MO KIO AVE 3,04 TO 06,67.0,NEW GENERATION,1977,40000.0,1,1990,99.0,86.0


### d. Combine registration date and approval date datsets

In [27]:
# Identify factors that determine the prices of HDB
registration_data = registration_data[['mm', 'yy', 'town', 'flat_type', 'block', 'street_name', 
                                       'storey_range', 'floor_area_sqm', 'flat_model','remaining_lease_years', 'resale_price']]

approval_data = approval_data[['mm', 'yy', 'town', 'flat_type', 'block', 'street_name', 
                                       'storey_range', 'floor_area_sqm', 'flat_model','remaining_lease', 'resale_price']]

# Rename remaining_lease_years col back to remaining_lease to ensure same col name as approval_data
registration_data.rename(columns={"remaining_lease_years":"remaining_lease"}, inplace=True)

In [28]:
# Join registration date and approval date datasets, rename mm as month and yy as year
hdb_data = pd.concat([registration_data, approval_data])
hdb_data.rename(columns={'mm': 'month', 'yy': 'year'}, inplace=True)

In [29]:
hdb_data.head()

Unnamed: 0,month,year,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,remaining_lease,resale_price
0,1,2015,ANG MO KIO,3 ROOM,174,ANG MO KIO AVE 4,07 TO 09,60.0,IMPROVED,70.0,255000.0
1,1,2015,ANG MO KIO,3 ROOM,541,ANG MO KIO AVE 10,01 TO 03,68.0,NEW GENERATION,65.0,275000.0
2,1,2015,ANG MO KIO,3 ROOM,163,ANG MO KIO AVE 4,01 TO 03,69.0,NEW GENERATION,64.0,285000.0
3,1,2015,ANG MO KIO,3 ROOM,446,ANG MO KIO AVE 10,01 TO 03,68.0,NEW GENERATION,63.0,290000.0
4,1,2015,ANG MO KIO,3 ROOM,557,ANG MO KIO AVE 10,07 TO 09,68.0,NEW GENERATION,64.0,290000.0


### e. Dummy Variables for Categorical Variables (town, flat_model)

In [30]:
hdb_data = pd.get_dummies(hdb_data,columns=['town','flat_model'])

In [31]:
hdb_data.head()

Unnamed: 0,month,year,flat_type,block,street_name,storey_range,floor_area_sqm,remaining_lease,resale_price,town_ANG MO KIO,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,town_BUKIT TIMAH,town_CENTRAL AREA,town_CHOA CHU KANG,town_CLEMENTI,town_GEYLANG,town_HOUGANG,town_JURONG EAST,town_JURONG WEST,town_KALLANG/WHAMPOA,town_MARINE PARADE,town_PASIR RIS,town_PUNGGOL,town_QUEENSTOWN,town_SEMBAWANG,town_SENGKANG,town_SERANGOON,town_TAMPINES,town_TOA PAYOH,town_WOODLANDS,town_YISHUN,flat_model_2-ROOM,flat_model_ADJOINED FLAT,flat_model_APARTMENT,flat_model_DBSS,flat_model_IMPROVED,flat_model_IMPROVED-MAISONETTE,flat_model_MAISONETTE,flat_model_MODEL A,flat_model_MODEL A-MAISONETTE,flat_model_MODEL A2,flat_model_MULTI GENERATION,flat_model_NEW GENERATION,flat_model_PREMIUM APARTMENT,flat_model_PREMIUM APARTMENT LOFT,flat_model_PREMIUM MAISONETTE,flat_model_SIMPLIFIED,flat_model_STANDARD,flat_model_TERRACE,flat_model_TYPE S1,flat_model_TYPE S2
0,1,2015,3 ROOM,174,ANG MO KIO AVE 4,07 TO 09,60.0,70.0,255000.0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,1,2015,3 ROOM,541,ANG MO KIO AVE 10,01 TO 03,68.0,65.0,275000.0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
2,1,2015,3 ROOM,163,ANG MO KIO AVE 4,01 TO 03,69.0,64.0,285000.0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3,1,2015,3 ROOM,446,ANG MO KIO AVE 10,01 TO 03,68.0,63.0,290000.0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
4,1,2015,3 ROOM,557,ANG MO KIO AVE 10,07 TO 09,68.0,64.0,290000.0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False


## 6. Train, Test Split

In [32]:
train_data = hdb_data.drop(columns=['resale_price'])
test_data = hdb_data[['resale_price']]

In [33]:
# Split into 80% train and 20% test
x_train, x_test, y_train, y_test = train_test_split(train_data, test_data, test_size=0.2, random_state=42)

## 7. Data Preprocessing for Machine Learning

Label Encoding for ordinal variables (flat_type, storey_range)

In [34]:
# Label encoding for flat_type
type_to_index = {'1 ROOM': 0,'2 ROOM': 1,'3 ROOM': 2,'4 ROOM': 3, '5 ROOM': 4,'MULTI-GENERATION': 5, 'EXECUTIVE': 6}
x_train["flat_type"] = [
    type_to_index[x] for x in list(x_train["flat_type"].values)]

x_test["flat_type"] = [
    type_to_index[x] for x in list(x_test["flat_type"].values)]

In [35]:
# Label encoding for storey_range
storey_range = list(sorted(list(pd.unique(x_train["storey_range"]))))
storey_to_index = dict([
    (storey_range[x], x) for x in range(len(storey_range))])

# Each time is assigned to an index
x_train["storey_range"] = [
    storey_to_index[x] for x in list(x_train["storey_range"].values)]
x_test["storey_range"] = [
    storey_to_index[x] for x in list(x_test["storey_range"].values)]

Label Encoding for Categorical Variable (street_name, block)

Although street_name is not an ordinal variable, we need to use label encoding as it has 547 labels, making it impractical to encode using One-Hot Encoding.

In [36]:
le_street = LabelEncoder()
x_train['street_name'] = le_street.fit_transform(x_train['street_name'])
unseen_labels = [i for i in x_test['street_name'] if i not in le_street.classes_ ]
unseen_indexes = x_test[x_test['street_name'].isin(unseen_labels)].index
x_test = x_test.drop(index = unseen_indexes).reset_index(drop=True)
y_test = y_test.drop(index=unseen_indexes).reset_index(drop=True)
x_test['street_name'] = le_street.transform(x_test['street_name'])

Although block is not an ordinal variable, we need to use label encoding as it has 2446 labels, making it impractical to encode using One-Hot Encoding.

In [37]:
le_block = LabelEncoder()
x_train['block'] = le_block.fit_transform(x_train['block'])
unseen_labels = [i for i in x_test['block'] if i not in le_block.classes_ ]
unseen_indexes = x_test[x_test['block'].isin(unseen_labels)].index
x_test = x_test.drop(index = unseen_indexes).reset_index(drop=True)
y_test = y_test.drop(index=unseen_indexes).reset_index(drop=True)
x_test['block'] = le_block.transform(x_test['block'])

## 8. Feautre Engineering using Random Forest

In [38]:
# Covert to 1D numpy array
y_train = y_train.values.ravel() 
y_test = y_test.values.ravel()

In [None]:
# Random Forest
rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200],            
    'max_depth': [None, 10, 20],            
    'min_samples_split': [2, 5],              
    'min_samples_leaf': [1, 2]}
train_model(rf, x_train, y_train, x_test, y_test)

Root Mean Squared Error (RMSE): 19590.106465800385, R² Score = 0.9811707897376257


Feature importance from Random Forest model

In [40]:
# Visualisee the feature importance
feature_importance_rf = rf.feature_importances_ 
feature_importance_df_rf = pd.DataFrame({ 'Feature': x_train.columns, 
                                      'Importance': feature_importance_rf })
feature_importance_df_rf

Unnamed: 0,Feature,Importance
0,month,0.007820284
1,year,0.3775571
2,flat_type,0.1054666
3,block,0.01197857
4,street_name,0.01527724
5,storey_range,0.02732254
6,floor_area_sqm,0.3502092
7,remaining_lease,0.0158709
8,town_ANG MO KIO,0.001564919
9,town_BEDOK,0.001262219


In [41]:
feature_importance_town = feature_importance_df_rf.iloc[8:34].sum()
feature_importance_flat_model = feature_importance_df_rf.iloc[34:].sum()

print(f"month: {feature_importance_df_rf.iloc[0, 1]}")
print(f"year: {feature_importance_df_rf.iloc[1, 1]}")
print(f"flat_type: {feature_importance_df_rf.iloc[2, 1]}")
print(f"block: {feature_importance_df_rf.iloc[3, 1]}")
print(f"street_name: {feature_importance_df_rf.iloc[4, 1]}")
print(f"storey_range: {feature_importance_df_rf.iloc[5, 1]}")
print(f"floor_area_sqm: {feature_importance_df_rf.iloc[6, 1]}")
print(f"remaining_lease: {feature_importance_df_rf.iloc[7, 1]}")
print(f"floor_area_sqm: {feature_importance_town}")
print(f"remaining_lease: {feature_importance_flat_model}")

month: 0.007820284418133731
year: 0.37755705035850623
flat_type: 0.10546655947217974
block: 0.01197856613308993
street_name: 0.015277242749022722
storey_range: 0.027322538396041575
floor_area_sqm: 0.3502092470608126
remaining_lease: 0.015870895070792847
floor_area_sqm: Feature       town_ANG MO KIOtown_BEDOKtown_BISHANtown_BUKIT...
Importance                                             0.076145
dtype: object
remaining_lease: Feature       flat_model_2-ROOMflat_model_ADJOINED FLATflat_...
Importance                                             0.012353
dtype: object


Top 3 most important factors contributing to resale_price: 
1) year (37.76%)
2) floor_area_sqm (35.02%)
3) flat_type (10.54%)

In [42]:
# Save model
pickle.dump(rf, open('Random Forest Predictive Model.pkl', 'wb'))