# Singapore Flat Resale price Prediction

## Step 1- Reading the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
import joblib

In [2]:
df = pd.read_csv('ResaleFlatPricesBasedonApprovalDate1990 to 1999_1.csv')
df1 = pd.read_csv('ResaleFlatPricesBasedonApprovalDate2000 to Feb2012_2.csv')
df2 = pd.read_csv('ResaleFlatPricesBasedonRegistrationDateFromMar2012 to Dec2014_3.csv')
df3 = pd.read_csv('ResaleFlatPricesBasedonRegistrationDateFromJan2015 to Dec2016_4.csv')
df4 = pd.read_csv('ResaleflatpricesbasedonregistrationdatefromJan2017onwards_5.csv')

In [3]:
df_combined=pd.concat([df,df1,df2,df3,df4], axis=0, ignore_index= True)

## Step 2 -Data Cleaning

In [4]:
df_combined.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease
0,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0,
1,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,IMPROVED,1977,6000.0,
2,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,8000.0,
3,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,IMPROVED,1977,6000.0,
4,1990-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,NEW GENERATION,1976,47200.0,


In [5]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935645 entries, 0 to 935644
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   month                935645 non-null  object 
 1   town                 935645 non-null  object 
 2   flat_type            935645 non-null  object 
 3   block                935645 non-null  object 
 4   street_name          935645 non-null  object 
 5   storey_range         935645 non-null  object 
 6   floor_area_sqm       935645 non-null  float64
 7   flat_model           935645 non-null  object 
 8   lease_commence_date  935645 non-null  int64  
 9   resale_price         935645 non-null  float64
 10  remaining_lease      226595 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 78.5+ MB


In [6]:
df_combined.isnull().sum()

month                       0
town                        0
flat_type                   0
block                       0
street_name                 0
storey_range                0
floor_area_sqm              0
flat_model                  0
lease_commence_date         0
resale_price                0
remaining_lease        709050
dtype: int64

In [7]:
df_combined['month']= pd.to_datetime(df_combined['month'],format='%Y-%m')

In [8]:
df_combined['remaining_lease'] = 99-(df_combined['month'].dt.year - df_combined['lease_commence_date'])

In [9]:
df_combined.describe()

Unnamed: 0,month,floor_area_sqm,lease_commence_date,resale_price,remaining_lease
count,935645,935645.0,935645.0,935645.0,935645.0
mean,2006-10-16 21:45:25.552960768,95.688309,1988.364984,323465.2,81.034746
min,1990-01-01 00:00:00,28.0,1966.0,5000.0,41.0
25%,1999-01-01 00:00:00,73.0,1981.0,195000.0,74.0
50%,2005-06-01 00:00:00,93.0,1986.0,300000.0,83.0
75%,2014-07-01 00:00:00,113.0,1996.0,420000.0,90.0
max,2024-09-01 00:00:00,366.7,2020.0,1588000.0,101.0
std,,25.811325,10.752315,172982.4,10.789665


In [10]:
#remaining lease cannot be more than 99
df1_combined = df_combined[df_combined['remaining_lease'] <= 99].reset_index(drop=True)

In [11]:
df1_combined.describe()

Unnamed: 0,month,floor_area_sqm,lease_commence_date,resale_price,remaining_lease
count,935594,935594.0,935594.0,935594.0,935594.0
mean,2006-10-17 01:54:09.524900608,95.686698,1988.364424,323463.3,81.033708
min,1990-01-01 00:00:00,28.0,1966.0,5000.0,41.0
25%,1999-01-01 00:00:00,73.0,1981.0,195000.0,74.0
50%,2005-06-01 00:00:00,93.0,1986.0,300000.0,83.0
75%,2014-07-01 00:00:00,113.0,1996.0,420000.0,90.0
max,2024-09-01 00:00:00,366.7,2020.0,1588000.0,99.0
std,,25.810666,10.752274,172985.0,10.789042


In [12]:
df1_combined.shape

(935594, 11)

In [13]:
df1_combined['mid_storey']= (df1_combined['storey_range'].str.split('TO').str[0].astype(int) + 1)
df1_combined.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,mid_storey
0,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0,86,11
1,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,IMPROVED,1977,6000.0,86,5
2,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,8000.0,86,11
3,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,IMPROVED,1977,6000.0,86,8
4,1990-01-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,NEW GENERATION,1976,47200.0,85,5


In [14]:
df1_combined.columns

Index(['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price',
       'remaining_lease', 'mid_storey'],
      dtype='object')

In [15]:
train_df = df1_combined.iloc[:int(df1_combined.shape[0] * 0.9)]
train_df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,mid_storey
0,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0,86,11
1,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,IMPROVED,1977,6000.0,86,5
2,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,8000.0,86,11
3,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,IMPROVED,1977,6000.0,86,8
4,1990-01-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,NEW GENERATION,1976,47200.0,85,5


## Step 3 - EDA

In [16]:
train_df['year']=train_df['month'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['year']=train_df['month'].dt.year


In [17]:
categorical_columns = train_df.select_dtypes(include ='object').columns
numerical_columns = train_df.select_dtypes(include=np.number).columns

In [18]:
categorical_columns

Index(['town', 'flat_type', 'block', 'street_name', 'storey_range',
       'flat_model'],
      dtype='object')

In [19]:
numerical_columns=[ 'year','floor_area_sqm','remaining_lease', 'mid_storey','resale_price']

In [20]:
numerical_columns_1=['year','floor_area_sqm','remaining_lease','mid_storey']

In [None]:
for i in numerical_columns:
    plt.figure(figsize=(12, 6))
    # Plot before log transformation
    sns.histplot(data=train_df,x=i,bins=50,kde=True)
    plt.title(f'{i}')

In [21]:
train_df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,mid_storey,year
0,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0,86,11,1990
1,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,IMPROVED,1977,6000.0,86,5,1990
2,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,8000.0,86,11,1990
3,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,IMPROVED,1977,6000.0,86,8,1990
4,1990-01-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,NEW GENERATION,1976,47200.0,85,5,1990


In [None]:
for i in numerical_columns_1:
    fig=px.scatter(train_df, x=i, y='resale_price', trendline='ols', trendline_color_override= 'red')
    fig.show()


In [None]:
sns.lmplot(data=train_df, x= 'year', y = 'resale_price', hue='flat_type')
plt.show()

In [None]:
sns.lmplot(data=train_df, x= 'year', y = 'resale_price', hue='town')
plt.show()

In [22]:
df1=train_df.copy()

In [23]:
df1['remaining_lease_log']=np.log(df1['remaining_lease'])
df1['floor_area_sqm_log']=np.log(df1['floor_area_sqm'])
df1['mid_storey_log']=np.log(df1['mid_storey'])
df1['year_log']=np.log(df1['year'])
df1['resale_price_log']=np.log(df1['resale_price'])

In [24]:
numerical_columns_logged=['year_log','floor_area_sqm_log','remaining_lease_log','mid_storey_log','resale_price_log']

In [25]:
numerical_columns_logged_1=['year_log','floor_area_sqm_log','remaining_lease_log','mid_storey_log']

In [None]:
for i,j in zip(numerical_columns,numerical_columns_logged):
    plt.figure(figsize=(12, 6))
    # Plot before log transformation
    plt.subplot(1, 2, 1)
    sns.histplot(data=df1,x=i,bins=50,kde=True)
    plt.title(f'before log {i}')

    # Plot after log transformation
    plt.subplot(1, 2, 2)
    sns.histplot(data=df1,x=j,bins=50,kde=True)
    plt.title(f'after log-Transformed {j}')
    # Display the plots
    plt.tight_layout()
    plt.show()

# Step 4 - Feature Engineering and Feature selection

# Feature selection

In [26]:
df1.columns

Index(['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price',
       'remaining_lease', 'mid_storey', 'year', 'remaining_lease_log',
       'floor_area_sqm_log', 'mid_storey_log', 'year_log', 'resale_price_log'],
      dtype='object')

Features like month, block, street_name, storey_range,lease_commence_Date, are not so important for resale flat price predicton and so they are dropped

Features like floor_area_sqm, remaining_lease, mid_storey are dropped because we dont need those as we have logged values of those features

In [27]:
selected_features=['town','flat_type','year_log','remaining_lease_log','floor_area_sqm_log', 'mid_storey_log']

# Feature Engineering

In [28]:
df2 = df1[['town', 'flat_type','year_log','remaining_lease_log','floor_area_sqm_log', 'mid_storey_log','resale_price_log']]

In [29]:
numerical_columns_1

['year', 'floor_area_sqm', 'remaining_lease', 'mid_storey']

In [30]:
numerical_columns_logged_1

['year_log', 'floor_area_sqm_log', 'remaining_lease_log', 'mid_storey_log']

## Step 5 - Handling outliers

In [None]:
#Detecting Outliers before and after log transformation
for i,j in zip(numerical_columns_1,numerical_columns_logged_1):
        plt.figure(figsize=(10, 6))
    # Plot before log transformation
        plt.subplot(1, 2, 1)
        sns.boxplot(data=train_df, x=i)
        plt.title(f"Boxplot to Detect Outliers before log of {i}")
        
        
        plt.subplot(1, 2, 2)
        sns.boxplot(data=df2, x=j)
        plt.title(f"Boxplot to Detect Outliers after log of {j}")
        plt.tight_layout()
        plt.show()

In [31]:
#There are some outliers in floor_area_sqm_log, mid_storey_log,remaining_lease_log

In [32]:
def outlier(df, column):
    iqr = df[column].quantile(0.75) - df[column].quantile(0.25)
    upper = df[column].quantile(0.75) + (1.5*iqr)
    lower = df[column].quantile(0.25) - (1.5*iqr)
    df_filtered = df[(df[column] >= lower) & (df[column] <= upper)]
    return df_filtered

In [33]:
df3 = outlier(df2,'floor_area_sqm_log').reset_index(drop=True)
df3

Unnamed: 0,town,flat_type,year_log,remaining_lease_log,floor_area_sqm_log,mid_storey_log,resale_price_log
0,ANG MO KIO,3 ROOM,7.595890,4.442651,4.290459,1.609438,10.762149
1,ANG MO KIO,3 ROOM,7.595890,4.454347,4.204693,0.693147,10.736397
2,ANG MO KIO,3 ROOM,7.595890,4.454347,4.204693,2.079442,10.645425
3,ANG MO KIO,3 ROOM,7.595890,4.454347,4.204693,2.397895,10.545341
4,ANG MO KIO,3 ROOM,7.595890,4.454347,4.204693,1.609438,10.596635
...,...,...,...,...,...,...,...
840588,PASIR RIS,4 ROOM,7.611348,4.532599,4.454347,2.639057,13.384728
840589,PASIR RIS,EXECUTIVE,7.611348,4.189655,4.983607,1.609438,13.384728
840590,PASIR RIS,EXECUTIVE,7.611348,4.304065,4.962845,0.693147,13.384728
840591,PASIR RIS,EXECUTIVE,7.611348,4.304065,4.969813,0.693147,13.392391


In [34]:
df4=outlier(df3,'mid_storey_log').reset_index(drop=True)
df4

Unnamed: 0,town,flat_type,year_log,remaining_lease_log,floor_area_sqm_log,mid_storey_log,resale_price_log
0,ANG MO KIO,3 ROOM,7.595890,4.442651,4.290459,1.609438,10.762149
1,ANG MO KIO,3 ROOM,7.595890,4.454347,4.204693,0.693147,10.736397
2,ANG MO KIO,3 ROOM,7.595890,4.454347,4.204693,2.079442,10.645425
3,ANG MO KIO,3 ROOM,7.595890,4.454347,4.204693,2.397895,10.545341
4,ANG MO KIO,3 ROOM,7.595890,4.454347,4.204693,1.609438,10.596635
...,...,...,...,...,...,...,...
840062,PASIR RIS,4 ROOM,7.611348,4.532599,4.454347,2.639057,13.384728
840063,PASIR RIS,EXECUTIVE,7.611348,4.189655,4.983607,1.609438,13.384728
840064,PASIR RIS,EXECUTIVE,7.611348,4.304065,4.962845,0.693147,13.384728
840065,PASIR RIS,EXECUTIVE,7.611348,4.304065,4.969813,0.693147,13.392391


In [35]:
df5=outlier(df4,'remaining_lease_log').reset_index(drop=True)
df5

Unnamed: 0,town,flat_type,year_log,remaining_lease_log,floor_area_sqm_log,mid_storey_log,resale_price_log
0,ANG MO KIO,3 ROOM,7.595890,4.442651,4.290459,1.609438,10.762149
1,ANG MO KIO,3 ROOM,7.595890,4.454347,4.204693,0.693147,10.736397
2,ANG MO KIO,3 ROOM,7.595890,4.454347,4.204693,2.079442,10.645425
3,ANG MO KIO,3 ROOM,7.595890,4.454347,4.204693,2.397895,10.545341
4,ANG MO KIO,3 ROOM,7.595890,4.454347,4.204693,1.609438,10.596635
...,...,...,...,...,...,...,...
826925,PASIR RIS,4 ROOM,7.611348,4.532599,4.454347,2.639057,13.384728
826926,PASIR RIS,EXECUTIVE,7.611348,4.189655,4.983607,1.609438,13.384728
826927,PASIR RIS,EXECUTIVE,7.611348,4.304065,4.962845,0.693147,13.384728
826928,PASIR RIS,EXECUTIVE,7.611348,4.304065,4.969813,0.693147,13.392391


In [None]:
for i in numerical_columns_logged_1:
        plt.figure(figsize=(10, 6))
        plt.subplot(1, 2, 1)
        sns.boxplot(data=df2, x=i)
        plt.title(f"Boxplot to Detect Outliers before Outlier removal of {i}")

        plt.subplot(1, 2, 2)
        sns.boxplot(data=df5, x=i)
        plt.title(f"Boxplot to Detect Outliers after Outlier_removal of {i}")
        plt.tight_layout()
        plt.show()

In [36]:
df5.columns

Index(['town', 'flat_type', 'year_log', 'remaining_lease_log',
       'floor_area_sqm_log', 'mid_storey_log', 'resale_price_log'],
      dtype='object')

## Step 6 - Encoding

In [37]:
df6=df5.copy()

In [38]:
df6['flat_type']=df6['flat_type'].str.replace('-', ' ')

In [39]:
df6['flat_type'].unique()

array(['3 ROOM', '4 ROOM', '5 ROOM', '2 ROOM', 'EXECUTIVE',
       'MULTI GENERATION'], dtype=object)

In [40]:
flat_type_map={'1 ROOM' :1 , '2 ROOM': 2, '3 ROOM': 3, '4 ROOM': 4, '5 ROOM': 5,  'EXECUTIVE':6,
       'MULTI GENERATION': 7}

df6['flat_type_encoded']=df6['flat_type'].map(flat_type_map)

In [41]:
town_encoded={'ANG MO KIO':1, 'BEDOK':2, 'BISHAN':3, 'BUKIT BATOK':4, 'BUKIT MERAH':5,
       'BUKIT TIMAH':6, 'CENTRAL AREA':7, 'CLEMENTI':8, 'GEYLANG':9, 'HOUGANG':10,
       'JURONG EAST':11, 'JURONG WEST':12, 'KALLANG/WHAMPOA':13, 'MARINE PARADE':14,
       'QUEENSTOWN':15, 'SERANGOON':16, 'TAMPINES':17, 'TOA PAYOH':18, 'WOODLANDS':19,
       'YISHUN':20, 'CHOA CHU KANG':21, 'BUKIT PANJANG':22, 'PASIR RIS':23,
       'SENGKANG':24, 'SEMBAWANG':25, 'LIM CHU KANG':26, 'PUNGGOL':27}
df6['town_encoded']=df6['town'].map(town_encoded)

In [42]:
df7=df6[['town_encoded','flat_type_encoded','year_log','floor_area_sqm_log', 'remaining_lease_log','mid_storey_log']]

## Step 7 - Feature Scaling

In [43]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler_target=StandardScaler()
df8_scaled=scaler.fit_transform(df7)
df8_target_scaled=scaler_target.fit_transform(df6[['resale_price_log']])
df8_scaled

array([[-1.64990051, -1.08414769, -1.78704162, -0.8749444 ,  0.33533594,
        -0.30093987],
       [-1.64990051, -1.08414769, -1.78704162, -1.18945263,  0.43345038,
        -1.64620222],
       [-1.64990051, -1.08414769, -1.78704162, -1.18945263,  0.43345038,
         0.38910115],
       ...,
       [ 1.31831961,  2.09647342,  2.06565408,  1.59070225, -0.82722029,
        -1.64620222],
       [ 1.31831961,  2.09647342,  2.06565408,  1.61625646, -0.82722029,
        -1.64620222],
       [ 1.31831961,  2.09647342,  2.06565408,  1.6918677 , -1.78697314,
        -1.64620222]])

In [44]:
df8_scaled=pd.DataFrame(df8_scaled, columns=['town_encoded','flat_type_encoded','year_log','floor_area_sqm_log', 'remaining_lease_log','mid_storey_log'])
df8_scaled.head()

Unnamed: 0,town_encoded,flat_type_encoded,year_log,floor_area_sqm_log,remaining_lease_log,mid_storey_log
0,-1.649901,-1.084148,-1.787042,-0.874944,0.335336,-0.30094
1,-1.649901,-1.084148,-1.787042,-1.189453,0.43345,-1.646202
2,-1.649901,-1.084148,-1.787042,-1.189453,0.43345,0.389101
3,-1.649901,-1.084148,-1.787042,-1.189453,0.43345,0.856643
4,-1.649901,-1.084148,-1.787042,-1.189453,0.43345,-0.30094


In [45]:
df8_scaled_target=pd.DataFrame(df8_target_scaled, columns=['resale_price_log'])
df8_scaled_target.head()

Unnamed: 0,resale_price_log
0,-2.849061
1,-2.892516
2,-3.046021
3,-3.214901
4,-3.128349


In [46]:
df8_scaled_1=pd.concat([df8_scaled,df8_scaled_target],axis=1)

In [47]:
selected_features_AS = ['town_encoded','flat_type_encoded','year_log','floor_area_sqm_log', 'remaining_lease_log','mid_storey_log']

## Step 8 - Model Selection

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [49]:
# Randomly sample 20% of your dataset
df_sample = df8_scaled_1.sample(frac=0.2, random_state=42)
X_sample = df_sample.drop('resale_price_log',axis=1)
y_sample = df_sample['resale_price_log']

In [50]:
df_sample.isnull().sum()

town_encoded           0
flat_type_encoded      0
year_log               0
floor_area_sqm_log     0
remaining_lease_log    0
mid_storey_log         0
resale_price_log       0
dtype: int64

In [51]:
from sklearn.model_selection import train_test_split
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

In [52]:
X_train.isnull().sum()

town_encoded           0
flat_type_encoded      0
year_log               0
floor_area_sqm_log     0
remaining_lease_log    0
mid_storey_log         0
dtype: int64

In [53]:
def model_fit_evaluation_regr(models_reg, x_train, x_test, y_train, y_test, metrics_reg):
    results_reg = {}
    
    for model_name_reg, model_reg in models_reg.items():
        regression = model_reg
        regression.fit(x_train, y_train)
        y_prediction = regression.predict(x_test)
        results_reg[model_name_reg] = {}
        for metric_name_reg, metric_func_reg in metrics_reg.items():
            results_reg[model_name_reg][metric_name_reg] = metric_func_reg(y_test, y_prediction)
    return results_reg

models_reg = {
    'Linear Regression': LinearRegression(),
    'Ridge':Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
    'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5)}

metrics_reg = {'MSE' : mean_squared_error,
               'MAE': mean_absolute_error,
                'R2_Score' : r2_score} 

In [None]:
model_fit_evaluation_regr(models_reg, X_train, X_test, y_train, y_test, metrics_reg)

In [54]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [55]:
def model_fit_evaluation_Non_lin_regr(models_reg_Non_lin, x_train, x_test, y_train, y_test, metrics_reg_Non_lin):
    results_reg = {}
    
    for model_name_reg, models_reg_Non_lin in models_reg_Non_lin.items():
        regression = models_reg_Non_lin()
        regression.fit(x_train, y_train)
        y_prediction_reg = regression.predict(x_test)
        results_reg[model_name_reg] = {}
        for metric_name_reg, metric_func_reg in metrics_reg_Non_lin.items():
            results_reg[model_name_reg][metric_name_reg] = metric_func_reg(y_test, y_prediction_reg)
    return results_reg

models_reg_Non_lin = {
    'Gradient Boosting Regression':GradientBoostingRegressor,
    'Random Forest Regression': RandomForestRegressor,
    'Decision Tree Regression': DecisionTreeRegressor}

metrics_reg_Non_lin = {'MSE' : mean_squared_error,
               'MAE': mean_absolute_error,
                'R2_Score' : r2_score} 

In [None]:
model_fit_evaluation_Non_lin_regr(models_reg_Non_lin, X_train, X_test, y_train, y_test, metrics_reg_Non_lin)

# Random Forest Regressor is selected as it has higher R_Square

## Step 9 - Model Training 

In [56]:
df8_scaled_1=df8_scaled_1[:1000]

In [57]:
df8_scaled_1.head()

Unnamed: 0,town_encoded,flat_type_encoded,year_log,floor_area_sqm_log,remaining_lease_log,mid_storey_log,resale_price_log
0,-1.649901,-1.084148,-1.787042,-0.874944,0.335336,-0.30094,-2.849061
1,-1.649901,-1.084148,-1.787042,-1.189453,0.43345,-1.646202,-2.892516
2,-1.649901,-1.084148,-1.787042,-1.189453,0.43345,0.389101,-3.046021
3,-1.649901,-1.084148,-1.787042,-1.189453,0.43345,0.856643,-3.214901
4,-1.649901,-1.084148,-1.787042,-1.189453,0.43345,-0.30094,-3.128349


In [58]:
df8_scaled_1.tail()

Unnamed: 0,town_encoded,flat_type_encoded,year_log,floor_area_sqm_log,remaining_lease_log,mid_storey_log,resale_price_log
995,-0.030871,-1.084148,-1.787042,-1.135125,0.626302,-0.30094,-2.416278
996,-0.030871,-1.084148,-1.787042,-1.135125,0.626302,1.210707,-2.416278
997,-0.030871,-1.084148,-1.787042,-1.135125,0.626302,-0.30094,-2.685637
998,-0.030871,-1.084148,-1.787042,-1.135125,0.907514,-1.646202,-2.501375
999,-0.030871,-1.084148,-1.787042,0.387497,1.089884,-0.30094,-2.023231


In [59]:
x_model = df8_scaled_1[['town_encoded','flat_type_encoded','year_log','floor_area_sqm_log', 'remaining_lease_log', 'mid_storey_log']]  
y_model = df8_scaled_1[['resale_price_log']]
x_train_model, x_test_model, y_train_model, y_test_model = train_test_split(x_model, y_model, test_size=0.2, random_state=42)

In [62]:
selected_model = RandomForestRegressor()

# Train the model (fit to training data)
selected_model.fit(x_train_model, y_train_model)


  return fit_method(estimator, *args, **kwargs)


## Step 10 - Model Export

In [63]:
import joblib

In [64]:
model_file_name=r"D:\GUVI_projects\model_RF_deploy.joblib"

In [65]:
saved_data = {'model': selected_model,'scaler':scaler,'scaler_target':scaler_target}

In [66]:
joblib.dump(saved_data, model_file_name)

['D:\\GUVI_projects\\model_RF_deploy.joblib']