In [2]:
# We will import the relevant files
import csv
import numpy as np

# read_csv function
def read_csv(csvfilename):
    rows = []
    with open(csvfilename) as csvfile:
        file_reader = csv.reader(csvfile)
        for row in file_reader:
            rows.append(row)
    # convert the list into Numpy array
    rows = np.array(rows)
    return rows

firstData = read_csv("resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv")
secondData = read_csv("resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv")

In [3]:
firstData

array([['month', 'town', 'flat_type', ..., 'lease_commence_date',
        'remaining_lease', 'resale_price'],
       ['2015-01', 'ANG MO KIO', '3 ROOM', ..., '1986', '70', '255000'],
       ['2015-01', 'ANG MO KIO', '3 ROOM', ..., '1981', '65', '275000'],
       ...,
       ['2016-12', 'YISHUN', 'EXECUTIVE', ..., '1992', '74', '778000'],
       ['2016-12', 'YISHUN', 'EXECUTIVE', ..., '1988', '70', '575000'],
       ['2016-12', 'YISHUN', 'MULTI-GENERATION', ..., '1987', '70',
        '735000']], dtype='<U22')

In [4]:
secondData

array([['month', 'town', 'flat_type', ..., 'lease_commence_date',
        'remaining_lease', 'resale_price'],
       ['2017-03', 'BUKIT MERAH', '1 ROOM', ..., '1975', '57 years',
        '200000'],
       ['2017-03', 'BUKIT MERAH', '1 ROOM', ..., '1975', '57 years',
        '218000'],
       ...,
       ['2021-11', 'YISHUN', 'MULTI-GENERATION', ..., '1987',
        '65 years 01 month', '838000'],
       ['2021-03', 'YISHUN', 'MULTI-GENERATION', ..., '1987',
        '65 years 10 months', '860000'],
       ['2021-10', 'YISHUN', 'MULTI-GENERATION', ..., '1987',
        '65 years 01 month', '760000']], dtype='<U22')

In [5]:
# We will now join the csv together from 2015 to 2021
data = np.concatenate((firstData,secondData[1:]))
data

array([['month', 'town', 'flat_type', ..., 'lease_commence_date',
        'remaining_lease', 'resale_price'],
       ['2015-01', 'ANG MO KIO', '3 ROOM', ..., '1986', '70', '255000'],
       ['2015-01', 'ANG MO KIO', '3 ROOM', ..., '1981', '65', '275000'],
       ...,
       ['2021-11', 'YISHUN', 'MULTI-GENERATION', ..., '1987',
        '65 years 01 month', '838000'],
       ['2021-03', 'YISHUN', 'MULTI-GENERATION', ..., '1987',
        '65 years 10 months', '860000'],
       ['2021-10', 'YISHUN', 'MULTI-GENERATION', ..., '1987',
        '65 years 01 month', '760000']], dtype='<U22')

In [6]:
# 'MULTI-GENERATION' and 'EXECUTIVE' do not have a fixed number of rooms.
# We will exclude these entries from the data.

def removeUnlabelledData(data):
    data1 = []
    for i in range(1,len(data)):
        row = data[i]
        # row with executive and multi-generation
        if row[2] == 'EXECUTIVE' or row[2] == 'MULTI-GENERATION':
            continue;
        else:
            data1.append(row)
    
    # convert the list to numpy array
    data1 = np.array(data1)
    return data1

data1 = removeUnlabelledData(data)
data1

array([['2015-01', 'ANG MO KIO', '3 ROOM', ..., '1986', '70', '255000'],
       ['2015-01', 'ANG MO KIO', '3 ROOM', ..., '1981', '65', '275000'],
       ['2015-01', 'ANG MO KIO', '3 ROOM', ..., '1980', '64', '285000'],
       ...,
       ['2021-11', 'YISHUN', '5 ROOM', ..., '1988', '65 years 04 months',
        '580000'],
       ['2021-11', 'YISHUN', '5 ROOM', ..., '1988', '66 years 01 month',
        '645000'],
       ['2021-12', 'YISHUN', '5 ROOM', ..., '1987', '64 years 11 months',
        '658000']], dtype='<U22')

In [7]:
# We will retrieve the MRT Station from the MRT CSV file
mrtData = read_csv("MRTstations.csv")
mrtData

array([['ï»¿Station Code', 'MRT Station Name', 'Line Color', 'Line Name'],
       ['NS1', 'Jurong East', 'Red', 'North South'],
       ['NS2', 'Bukit Batok', 'Red', 'North South'],
       ['NS3', 'Bukit Gombak', 'Red', 'North South'],
       ['NS4 BP1', 'Choa Chu Kang', 'Red', 'North South'],
       ['NS5', 'Yew Tee', 'Red', 'North South'],
       ['NS7', 'Kranji', 'Red', 'North South'],
       ['NS', 'Marsiling', 'Red', 'North South'],
       ['NS9 TE2', 'Woodlands', 'Red', 'North South'],
       ['NS10', 'Admiralty', 'Red', 'North South'],
       ['NS11', 'Sembawang', 'Red', 'North South'],
       ['NS12', 'Canberra', 'Red', 'North South'],
       ['NS13', 'Yishun', 'Red', 'North South'],
       ['NS14', 'Khatib', 'Red', 'North South'],
       ['NS15', 'Yio Chu Kang', 'Red', 'North South'],
       ['NS16', 'Ang Mo Kio', 'Red', 'North South'],
       ['NS17 CC15', 'Bishan', 'Red', 'North South'],
       ['NS18', 'Braddell', 'Red', 'North South'],
       ['NS19', 'Toa Payoh', 'Red', 'N

In [8]:
# library needed for the operation to happen
import json
import requests

# We will create a function to return dictionary of MRT stations with their longitudes and lattitudes
def createStationLocation(data):
    # create dictionary
    mrtStationCoordinates = {}
    # iterate through the stations in mrtData
    for i in range(1,len(data)):
        row = data[i]
        # LRT Station
        if (row[2] == 'SK' or row[2] == 'Pu' or row[2] =='BP'):
            stationName = row[1] + ' LRT STATION'
        # MRT Station
        else:
            stationName = row[1] + ' Mrt station'
        # print(stationName)
        # we will use the Station code to query the longitude and lattitude from the onemap api provided by SLA
        link = "https://developers.onemap.sg/commonapi/search?searchVal=%s&returnGeom=Y&getAddrDetails=Y" % stationName
        # print(link)
        resp = requests.get(link)
        
        # print(json.loads(resp.content))
        # we will narrow down to the the result section from the dictionary
        query = json.loads(resp.content)['results'][0]
        # print(query)
        
        # initialise the nested dictionary
        mrtStationCoordinates[row[1]] = {}
        
        # add the lattitude and convert to float value
        mrtStationCoordinates[row[1]]['Latitude'] = float(query['LATITUDE'])
        
        # add the longitude and convert to float value
        mrtStationCoordinates[row[1]]['Longtitude'] = float(query['LONGTITUDE'])
    
    return mrtStationCoordinates
        
mrtStationCoordinates = createStationLocation(mrtData)
mrtStationCoordinates

{'Jurong East': {'Latitude': 1.33315281585758, 'Longtitude': 103.742286332403},
 'Bukit Batok': {'Latitude': 1.34903331201636, 'Longtitude': 103.749566478309},
 'Bukit Gombak': {'Latitude': 1.35861159094192,
  'Longtitude': 103.751790910733},
 'Choa Chu Kang': {'Latitude': 1.38536316540225,
  'Longtitude': 103.744370779756},
 'Yew Tee': {'Latitude': 1.39753506936297, 'Longtitude': 103.747405150236},
 'Kranji': {'Latitude': 1.42508698073648, 'Longtitude': 103.762137459497},
 'Marsiling': {'Latitude': 1.43252114855026, 'Longtitude': 103.774074641403},
 'Woodlands': {'Latitude': 1.43605761708128, 'Longtitude': 103.787938777173},
 'Admiralty': {'Latitude': 1.44058856161847, 'Longtitude': 103.800990519771},
 'Sembawang': {'Latitude': 1.44905082158502, 'Longtitude': 103.820046140211},
 'Canberra': {'Latitude': 1.44307664075699, 'Longtitude': 103.829702590959},
 'Yishun': {'Latitude': 1.42944308477331, 'Longtitude': 103.835005047246},
 'Khatib': {'Latitude': 1.41738337009565, 'Longtitude': 10

In [9]:
# function to create dictionary of street name mapping to the their nearest mrt station
def createStreetNameLocation(data,mrtStationCoordinates):
    # intialise the dictionary
    streetNameLocation = {}
    
    # We will iterate the data
    for i in range(1,len(data)):
        row = data[i]
        streetName = row[4]
        # check if the street name is in the streetNameLocation
        if row[4] in streetNameLocation:
            # move on to the next row
            continue
        else:
            # ST George road
            if row[4] == "ST. GEORGE'S RD":
                # We will use this bus code instead
                postalCode = 60241
                link = "https://developers.onemap.sg/commonapi/search?searchVal=%s&returnGeom=Y&getAddrDetails=Y" % postalCode
            else:
                # We use onemap api to find the streetname coordinates
                link = "https://developers.onemap.sg/commonapi/search?searchVal=%s&returnGeom=Y&getAddrDetails=Y" % streetName            
            
            resp = requests.get(link)
                        
            # we will narrow down to the the result section from the dictionary
            query = json.loads(resp.content)['results'][0]
            
            # retrieve the list from this nearest MRT Station
            information = findNearestMRTStation(float(query['LATITUDE']),float(query['LONGTITUDE']),mrtStationCoordinates)
            
            # create a nested dictionary
            streetNameLocation[row[4]] = {}
            
            # add the nearest MRT station into the dictionary -- this is for debugging if any
            streetNameLocation[row[4]]["Nearest MRT"] = information[0]
            
            # add the nearest distance into the dictionary -- this will be used later
            streetNameLocation[row[4]]["Nearest Distance To MRT/LRT"] = information[1]            
            
            # add the nearest distance to CDB into the dictionary -- this will be used later
            streetNameLocation[row[4]]["Nearest Distance to CBD"] = information[2]
    
    return streetNameLocation           
        
    
# function that return a list that contains the name of nearest Mrt Station, 
# distance to nearest mrt station and the distance to cbd.
def findNearestMRTStation(latitude,longitude,mrtStationCoordinates):
    nearestMRT = ''
    nearestDistanceToMRT = 10000000
    nearestDistanceToCBD = 0
    information = [] 
    
    temp = 0   
    # we will iterate through the mrtStationCoordinates
    for key in mrtStationCoordinates:
        stationLatitude = mrtStationCoordinates[key]['Latitude']
        stationLongitude = mrtStationCoordinates[key]['Longtitude']
        temp = distanceCalculator(latitude,longitude,stationLatitude,stationLongitude)
        if temp < nearestDistanceToMRT:
            # reupdate the nearestDistanceToMRT
            nearestDistanceToMRT = temp
            # reupdate the stationame
            nearestMRT = key 
          
    # find the nearestDistanceToCBD
    nearestDistanceToCBD = distanceCalculator(latitude,longitude,
                                              mrtStationCoordinates['Raffles Place']['Latitude'],
                                              mrtStationCoordinates['Raffles Place']['Longtitude'])
    
    information.append(nearestMRT)
    information.append(nearestDistanceToMRT)
    information.append(nearestDistanceToCBD)
    
    # return the list
    return information    

import math
from math import radians,cos,sin,asin
# function that return the distance between 2 coordinates
def distanceCalculator(latitude1, longitude1,latitude2, longitude2):
    deltaLon = (longitude2- longitude1) * (math.pi/180)
    deltaLat = (latitude2 - latitude1) * (math.pi/180)
    a = (math.sin(deltaLat/2))**2 + cos(latitude1) * cos(latitude2)* ((sin(deltaLon/2))**2)
    
    c = 2 * math.asin(min(1,math.sqrt(a)))
    
    earthRadius = 6371 * 1000
    
    # find the distance 
    distance = earthRadius * c
    
    return distance


streetNameLocation = createStreetNameLocation(data1,mrtStationCoordinates)
streetNameLocation

{'ANG MO KIO AVE 10': {'Nearest MRT': 'Kovan',
  'Nearest Distance To MRT/LRT': 741.1753675994183,
  'Nearest Distance to CBD': 8753.962810956791},
 'ANG MO KIO AVE 4': {'Nearest MRT': 'Yio Chu Kang',
  'Nearest Distance To MRT/LRT': 208.9659688467233,
  'Nearest Distance to CBD': 10685.210311595196},
 'ANG MO KIO AVE 5': {'Nearest MRT': 'Yio Chu Kang',
  'Nearest Distance To MRT/LRT': 593.3747663475615,
  'Nearest Distance to CBD': 10385.822676277428},
 'ANG MO KIO AVE 8': {'Nearest MRT': 'Kovan',
  'Nearest Distance To MRT/LRT': 797.5765226095261,
  'Nearest Distance to CBD': 8744.371957533542},
 'ANG MO KIO AVE 1': {'Nearest MRT': 'Ang Mo Kio',
  'Nearest Distance To MRT/LRT': 624.538784896674,
  'Nearest Distance to CBD': 8944.505286361644},
 'ANG MO KIO AVE 3': {'Nearest MRT': 'Ang Mo Kio',
  'Nearest Distance To MRT/LRT': 335.4962034709635,
  'Nearest Distance to CBD': 9666.557754014804},
 'ANG MO KIO AVE 6': {'Nearest MRT': 'Ang Mo Kio',
  'Nearest Distance To MRT/LRT': 265.1092

In [10]:
# We will now create getter functions to return the remaining data
def retrieveYear(entry):
    year = float(entry[:4])
    months = float(entry[5:7])/13
    time = year + months
    return time

def retrieveRoom(entry):
    return float(entry[0])

def retrieveMeanStorey(entry):
    # We will focus on the first 2 digits of the entry
    firstPart = entry[0]
    secondPart = entry[1]
    
    # single digit storey
    if firstPart == '0':
        # convert second digit to float
        startStorey = float(secondPart)
        mean = (startStorey + startStorey + 2)/2
        return mean        
    else:
        # convert first and second digit to float
        startStorey = float(entry[:2])
        mean = (startStorey + startStorey + 2)/2
        return mean

def retrieveFloorArea(entry):
    return float(entry)

def retrieveRemainingLease(entry):
    # only digits
    if 'years' in entry:
        if 'months' in entry:
            years = float(entry[:2])
            months = entry[9:12]
            # bad entry '0 m' representation in 2017 to 2021 csv
            if months == '0 m':
                return years
            else:
                months = float(entry[9:12])/12          
                time = years + months
                return time
        else:
            years = float(entry[:2])
            return years
    else:
        return float(entry)
    
def retrieveDistanceToMrt(entry,streetNameLocation):
    return streetNameLocation[entry]["Nearest Distance To MRT/LRT"]

def retrieveDistanceToCbd(entry,streetNameLocation):
    return streetNameLocation[entry]['Nearest Distance to CBD']

def retrievePrice(entry):
    return float(entry)

In [11]:
# We will now initialise the numpy with zeros 
# https://www.dataquest.io/blog/numpy-tutorial-python/
xBlinded = np.zeros((len(data1),7))
xBlinded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
y = np.zeros((len(data1),1))
y

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [13]:
# We will repopulate the data with independent and dependent variables
def populateBlindedX(data,xBlinded,streetNameLocation):
    for i in range(1,len(data)):
        row = data[i]
        # add the year
        xBlinded[i-1][0] = retrieveYear(row[0])
        
        # add the room number
        xBlinded[i-1][1] = retrieveRoom(row[2])
        
        # add the storey
        xBlinded[i-1][2] = retrieveMeanStorey(row[5])
        
        # add the floor area
        xBlinded[i-1][3] = retrieveFloorArea(row[6])
        
        # add the remaining lease
        xBlinded[i-1][4] = retrieveRemainingLease(row[9])
        
        # add the Distance to MRT/LRT
        xBlinded[i-1][5] = retrieveDistanceToMrt(row[4],streetNameLocation)
        
        # add the distance to CBD
        xBlinded[i-1][6] = retrieveDistanceToCbd(row[4],streetNameLocation)
              
    return xBlinded

xBlinded = populateBlindedX(data1,xBlinded,streetNameLocation)
np.set_printoptions(suppress=True,precision = 2)
xBlinded

array([[ 2015.08,     3.  ,     2.  , ...,    65.  ,   741.18,  8753.96],
       [ 2015.08,     3.  ,     2.  , ...,    64.  ,   208.97, 10685.21],
       [ 2015.08,     3.  ,     2.  , ...,    63.  ,   741.18,  8753.96],
       ...,
       [ 2021.85,     5.  ,     5.  , ...,    66.  ,   225.9 , 14625.29],
       [ 2021.92,     5.  ,    11.  , ...,    64.92,   225.9 , 14625.29],
       [    0.  ,     0.  ,     0.  , ...,     0.  ,     0.  ,     0.  ]])

In [14]:
# Create the unblinded dataset
# We will now initialise the numpy with zeros
xUnblinded = np.zeros((len(data1),33))
xUnblinded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
# Return dictionary of town names that map to the location in the array
def retrieveTownNames(data1):
    townName = {}
    count = 7
    for i in range(1,len(data1)):
        row = data1[i]
        if row[1] not in townName:
            townName[row[1]] = count
            # inclement count + 1
            count += 1
        else:
            continue
    return townName
townName = retrieveTownNames(data1)
townName

{'ANG MO KIO': 7,
 'BEDOK': 8,
 'BISHAN': 9,
 'BUKIT BATOK': 10,
 'BUKIT MERAH': 11,
 'BUKIT PANJANG': 12,
 'BUKIT TIMAH': 13,
 'CENTRAL AREA': 14,
 'CHOA CHU KANG': 15,
 'CLEMENTI': 16,
 'GEYLANG': 17,
 'HOUGANG': 18,
 'JURONG EAST': 19,
 'JURONG WEST': 20,
 'KALLANG/WHAMPOA': 21,
 'MARINE PARADE': 22,
 'PASIR RIS': 23,
 'PUNGGOL': 24,
 'QUEENSTOWN': 25,
 'SEMBAWANG': 26,
 'SENGKANG': 27,
 'SERANGOON': 28,
 'TAMPINES': 29,
 'TOA PAYOH': 30,
 'WOODLANDS': 31,
 'YISHUN': 32}

In [17]:
# We will create our unblinded dataset

def populateUnblindedX(data,xUnblinded,streetNameLocation,townName):
    for i in range(1,len(data)):
        row = data[i]
        # add the year
        xUnblinded[i-1][0] = retrieveYear(row[0])
        
        # add the room number
        xUnblinded[i-1][1] = retrieveRoom(row[2])
        
        # add the storey
        xUnblinded[i-1][2] = retrieveMeanStorey(row[5])
        
        # add the floor area
        xUnblinded[i-1][3] = retrieveFloorArea(row[6])
        
        # add the remaining lease
        xUnblinded[i-1][4] = retrieveRemainingLease(row[9])
        
        # add the Distance to MRT/LRT
        xUnblinded[i-1][5] = retrieveDistanceToMrt(row[4],streetNameLocation)
        
        # add the distance to CBD
        xUnblinded[i-1][6] = retrieveDistanceToCbd(row[4],streetNameLocation)
        
        # add the 1 to the correct location
        xUnblinded[i-1][townName[row[1]]] = 1
              
    return xUnblinded

xUnblinded = populateUnblindedX(data1,xUnblinded,streetNameLocation,townName)
np.set_printoptions(suppress=True)
xUnblinded

array([[2015.08,    3.  ,    2.  , ...,    0.  ,    0.  ,    0.  ],
       [2015.08,    3.  ,    2.  , ...,    0.  ,    0.  ,    0.  ],
       [2015.08,    3.  ,    2.  , ...,    0.  ,    0.  ,    0.  ],
       ...,
       [2021.85,    5.  ,    5.  , ...,    0.  ,    0.  ,    1.  ],
       [2021.92,    5.  ,   11.  , ...,    0.  ,    0.  ,    1.  ],
       [   0.  ,    0.  ,    0.  , ...,    0.  ,    0.  ,    0.  ]])

In [19]:
def populateY(data,y):
    for i in range(1,len(data)):
        row = data[i]
        y[i-1][0] = float(row[10])
    return y

y = populateY(data1,y)
y

array([[275000.],
       [285000.],
       [290000.],
       ...,
       [645000.],
       [658000.],
       [     0.]])

In [20]:
# Feature scaling to ensure all the variables are equally represented
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [21]:
# Feature Scaling for the xBlinded dataset
xBlindedFS = sc.fit_transform(xBlinded)
xBlindedFS

array([[  -0.64,   -1.27,   -1.14, ...,   -0.73,    1.94,   -0.14],
       [  -0.64,   -1.27,   -1.14, ...,   -0.81,   -0.46,    0.29],
       [  -0.64,   -1.27,   -1.14, ...,   -0.89,    1.94,   -0.14],
       ...,
       [   0.54,    1.32,   -0.63, ...,   -0.66,   -0.38,    1.17],
       [   0.55,    1.32,    0.39, ...,   -0.74,   -0.38,    1.17],
       [-350.47,   -5.15,   -1.48, ...,   -5.66,   -1.4 ,   -2.1 ]])

In [22]:
import statsmodels.api as sm

# Add the constant variable once
xBlindedFS1 = sm.add_constant(xBlindedFS)
xBlindedFS1

array([[   1.  ,   -0.64,   -1.27, ...,   -0.73,    1.94,   -0.14],
       [   1.  ,   -0.64,   -1.27, ...,   -0.81,   -0.46,    0.29],
       [   1.  ,   -0.64,   -1.27, ...,   -0.89,    1.94,   -0.14],
       ...,
       [   1.  ,    0.54,    1.32, ...,   -0.66,   -0.38,    1.17],
       [   1.  ,    0.55,    1.32, ...,   -0.74,   -0.38,    1.17],
       [   1.  , -350.47,   -5.15, ...,   -5.66,   -1.4 ,   -2.1 ]])

In [23]:
# We assumed the name of town does not influence the price of the HDB.
model = sm.OLS(y,xBlindedFS1).fit()

# Run the predictions
predictions1 = model.predict(xBlindedFS1)

# Add the variables name 
#(https://stackoverflow.com/questions/36561897/naming-explanatory-variables-in-regression-output)
print(model.summary2(xname=['Constant','Year','Room Number','Storey','floor area',
                            'Remaining Lease','Distance To MRT/LRT',
                            'Distance To CBD'],yname = 'resale_price'))


                         Results: Ordinary least squares
Model:                  OLS                  Adj. R-squared:         0.701       
Dependent Variable:     resale_price         AIC:                    3558084.8083
Date:                   2022-01-03 09:01     BIC:                    3558163.6057
No. Observations:       140039               Log-Likelihood:         -1.7790e+06 
Df Model:               7                    F-statistic:            4.681e+04   
Df Residuals:           140031               Prob (F-statistic):     0.00        
R-squared:              0.701                Scale:                  6.3383e+09  
---------------------------------------------------------------------------------
                       Coef.    Std.Err.     t     P>|t|     [0.025      0.975]  
---------------------------------------------------------------------------------
Constant            437124.7705 212.7466 2054.6738 0.0000 436707.7913 437541.7497
Year                  4891.2405 212.9230 

In [25]:
# find the mean of Resale Price
mean = np.mean(y)
mean

437124.7704598719

In [27]:
# Regression error for xBlinded dataset
from sklearn import metrics

MAE1 = metrics.mean_absolute_error(y,predictions1)
MSE1 = metrics.mean_squared_error(y,predictions1)
RMSE1 = np.sqrt(metrics.mean_squared_error(y,predictions1))

print('Mean Absolute Error:', MAE1)
print('Mean Squared Error:', MSE1)
print('Root Mean Squared Error:', RMSE1)

Mean Absolute Error: 60968.28232618647
Mean Squared Error: 6337956763.354466
Root Mean Squared Error: 79611.28540197342


In [28]:
# Error rate

ErrorRate1 = (RMSE1/mean)*100
print(ErrorRate1)

18.21248549200708


In [29]:
# Feature scaling for the xUnblinded
# Feature Scaling for the overall X dataset
xUnblindedFS = sc.fit_transform(xUnblinded)
xUnblindedFS

array([[  -0.64,   -1.27,   -1.14, ...,   -0.19,   -0.27,   -0.27],
       [  -0.64,   -1.27,   -1.14, ...,   -0.19,   -0.27,   -0.27],
       [  -0.64,   -1.27,   -1.14, ...,   -0.19,   -0.27,   -0.27],
       ...,
       [   0.54,    1.32,   -0.63, ...,   -0.19,   -0.27,    3.76],
       [   0.55,    1.32,    0.39, ...,   -0.19,   -0.27,    3.76],
       [-350.47,   -5.15,   -1.48, ...,   -0.19,   -0.27,   -0.27]])

In [30]:
# We will create our constant to the X column
xUnblindedFS1 = sm.add_constant(xUnblindedFS)
xUnblindedFS1

array([[   1.  ,   -0.64,   -1.27, ...,   -0.19,   -0.27,   -0.27],
       [   1.  ,   -0.64,   -1.27, ...,   -0.19,   -0.27,   -0.27],
       [   1.  ,   -0.64,   -1.27, ...,   -0.19,   -0.27,   -0.27],
       ...,
       [   1.  ,    0.54,    1.32, ...,   -0.19,   -0.27,    3.76],
       [   1.  ,    0.55,    1.32, ...,   -0.19,   -0.27,    3.76],
       [   1.  , -350.47,   -5.15, ...,   -0.19,   -0.27,   -0.27]])

In [31]:
# We assumed the name of town does not influence the price of the HDB.
model = sm.OLS(y,xUnblindedFS1).fit()

# Run the predictions
predictions2 = model.predict(xUnblindedFS1)

# Add the variables name 
print(model.summary2(xname = ['Constant',
                             'Year',
                             'Room Number',
                             'Storey',
                             'floor area',
                             'Remaining Lease',
                             'Distance To MRT/LRT',
                             'Distance To CBD',
                             'Ang Mo Kio',
                             'Bedok',
                             'Bishan',
                             'Bukit Batok',
                             'Bukit Merah',
                             'Bukit Panjang',
                             'Bukit Timah',
                             'Central Area',
                             'Choa Chu Kang',
                             'Clementi',
                             'Geylang',
                             'Hougang',
                             'Jurong East',
                             'Jurong West',
                             'Kallang/Whampoa',
                             'Marine Parade',
                             'Pasir Ris',
                             'Punggol',
                             'QueenTown',
                             'Sembawang',
                             'Sengkang',
                             'Serangoon',
                             'Tampines',
                             'Toa Payoh',
                             'Woodlands',
                             'Yishun'],
                             yname = 'resale_price'))


                             Results: Ordinary least squares
Model:                      OLS                    Adj. R-squared:           0.828       
Dependent Variable:         resale_price           AIC:                      3480746.8431
Date:                       2022-01-03 09:01       BIC:                      3481081.7321
No. Observations:           140039                 Log-Likelihood:           -1.7403e+06 
Df Model:                   33                     F-statistic:              2.038e+04   
Df Residuals:               140005                 Prob (F-statistic):       0.00        
R-squared:                  0.828                  Scale:                    3.6480e+09  
-----------------------------------------------------------------------------------------
                        Coef.      Std.Err.      t     P>|t|      [0.025        0.975]   
-----------------------------------------------------------------------------------------
Constant              437124.7705   161

In [32]:
# Regression error for X modified dataset
from sklearn import metrics

MAE2 = metrics.mean_absolute_error(y,predictions2)
MSE2 = metrics.mean_squared_error(y,predictions2)
RMSE2 = np.sqrt(metrics.mean_squared_error(y,predictions2))

print('Mean Absolute Error:', MAE2)
print('Mean Squared Error:', MSE2)
print('Root Mean Squared Error:', RMSE2)

Mean Absolute Error: 46334.98214794251
Mean Squared Error: 3647072974.417175
Root Mean Squared Error: 60391.00077343622


In [33]:
# Error rate

mean = np.mean(y)

ErrorRate2 = (RMSE2/mean)*100
print(ErrorRate2)

13.815506430784644
