In [1]:
import pandas as pd
import requests
import numpy as np
import statsmodels as sm
import statsmodels.api as smf  
from statsmodels.formula.api import ols #formula guide: https://www.statsmodels.org/dev/examples/notebooks/generated/formulas.html

To know whether flat sizes have gotten smaller, we need to look specifically at how the sizes of new flats, rather than resale flats, have changed over time.
Resale records can still be used for this: each record carries the flat's lease_commence_date, so we can compare the sizes of flats across their lease_commence_date while controlling for flat type.

In [3]:
#Location of the resale dataset CSV (path is relative to the working directory)
Resale_fp = "Relevant_datasets/RSF90onw_wAddress.csv"

In [4]:
#Load the resale price dataset; the first CSV column is used as the row index.
#low_memory=False turns off pandas' chunked parsing, which was raising a
#message on this file (presumably the mixed-dtype warning -- TODO confirm).
RSF = pd.read_csv(
    Resale_fp,
    index_col=0,
    low_memory=False,
)

In [5]:
#Encode flat_type as a single ordinal number (flat_type_n) so it can be used as
#a numeric control in the regression. This assumes a linear relationship between
#the flat-type rank and the outcome being modelled.
#Ordering assumption: EXECUTIVE > MULTI-GENERATION > 5 ROOM in general (originally
#argued in terms of resale price; this notebook uses the code as a control when
#regressing floor_area_sqm).
#Both spellings of the multi-generation label are mapped to the same rank.
FLAT_TYPE_RANK = {
    '1 ROOM': 1,
    '2 ROOM': 2,
    '3 ROOM': 3,
    '4 ROOM': 4,
    '5 ROOM': 5,
    'MULTI-GENERATION': 6,
    'MULTI GENERATION': 6,
    'EXECUTIVE': 7,
}

#Labels missing from the lookup (including NaN) fall back to 0, which is the
#sentinel counted below; int64 matches the dtype of a column initialised with 0.
RSF['flat_type_n'] = RSF['flat_type'].map(FLAT_TYPE_RANK).fillna(0).astype('int64')

#Sanity check: every row should have received a rank, so this should print 0.
print("no. of unchanged rows:", int((RSF['flat_type_n'] == 0).sum()))

no. of unchanged rows: 0


In [6]:
#List the column labels of RSF (includes the flat_type_n column added above)
RSF.columns

Index(['month', 'town', 'flat_type', 'block', 'street_name_x', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price',
       'remaining_lease', 'street_name_y', 'Lat', 'Lon', 'street_names_oldver',
       'Coordinates', 'rLat', 'rLon', 'Cl_MRT', 'Distance_MRT_km', 'Year_x',
       'Quarter_x', 'Yr_Qt', 'Year_y', 'Quarter_y', 'Index',
       'Real_resale_price', 'flat_type_n'],
      dtype='object')

In [7]:
#To make the coefficients and intercept of the regression table more readable,
#lease commencement date (LCD) is converted into the flat's age as of 2024:
#Flat_Age_2024 = 2024 - lease_commence_date   (larger value = older flat)
#Note: this is NOT the "years after 1966" variant (Yr_aft_LCD = lease_commence_date - 1966,
#1966 being the LCD of the oldest flat); that variant would flip the sign of the coefficient.
RSF['Flat_Age_2024'] = 2024 - RSF['lease_commence_date']


In [8]:
#Regress floor area on flat age (as of 2024), with the numeric flat-type code as a control.
#The specification is built first and fitted second; `model` holds the fitted results.
area_spec = ols(formula="floor_area_sqm ~ Flat_Age_2024 + flat_type_n", data=RSF)
model = area_spec.fit()

In [9]:
#Show the fitted regression's summary table as plain text.
summary_table = model.summary()
print(summary_table)

                            OLS Regression Results                            
Dep. Variable:         floor_area_sqm   R-squared:                       0.875
Model:                            OLS   Adj. R-squared:                  0.875
Method:                 Least Squares   F-statistic:                 3.244e+06
Date:                Mon, 17 Jun 2024   Prob (F-statistic):               0.00
Time:                        01:10:34   Log-Likelihood:            -3.3657e+06
No. Observations:              927061   AIC:                         6.731e+06
Df Residuals:                  927058   BIC:                         6.732e+06
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        13.8141      0.060    231.350

According to the results of the regression, there is a statistically significant (at the 1% level) positive correlation between the age of a flat (as of 2024) and the size of the home (floor area in sq m), after controlling for flat type, which could otherwise confound the comparison of flat sizes. In other words, older flats tend to be larger than newer flats of the same type. Hence, there is truth in the statement that newer flats are getting smaller.