### 3.C.2
Two regressions will be conducted in this notebook.

##### Data manipulation
1. Downtown Line stage 2 (DT2) MRT data and resale flat data will be imported and merged
2. Flat type will be encoded as an ordinal number
3. Remaining years on lease will be calculated
4. Using a list of DT2 MRT station names, create a dummy column for rows where the closest MRT is a DT2 station (for the angle 2 analysis)
5. Restrict data to the years 2015 and 2016 (inclusive)
6. Conduct the regression for angle 1 (flats in towns where DT2 stations were built)
7. Conduct the regression for angle 2 (flats whose closest MRT is a DT2 station)



In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as smf  
from statsmodels.formula.api import ols
from statsmodels.stats.meta_analysis import effectsize_smd
from sklearn.preprocessing import StandardScaler
from causalinference import CausalModel


In [4]:
# Global values: input file locations (relative to the notebook's working directory)

# Downtown Line stage 2 (DT2) dummy-variable dataset
DT2_fp = "Relevant_datasets/3_C_1_Address_DT2.csv"

# Resale flat dataset
Resale_fp = "Relevant_datasets/RSF90onw_wAddress.csv"

In [5]:
# Load both inputs; in each file the first CSV column is used as the row index.

# Resale flat dataset (low_memory=False parses the file in one pass instead of
# in chunks, which avoids mixed-type inference within a column)
RSF = pd.read_csv(Resale_fp, low_memory=False, index_col=0)

# Downtown Line stage 2 (DT2) dummy-variable dataset
DT2 = pd.read_csv(DT2_fp, index_col=0)

In [6]:
# Attach the Downtown Line stage 2 (DT2) dummy to every resale transaction,
# matching on street name, to mark flats in towns where DT2 stations were built.
# A left join keeps every resale row even if its street is absent from the DT2 file.
RSF_DT = RSF.merge(DT2, on='street_name_x', how='left')

# Sanity check: every row must carry a DT2 value of 0 or 1. A null means the
# street was not found in the DT2 file, so fail loudly instead of relying on
# someone eyeballing an empty table.
unmatched_rows = RSF_DT.loc[RSF_DT['DT2'].isnull()]
assert unmatched_rows.empty, (
    f"{len(unmatched_rows)} resale rows have no DT2 value after the merge"
)
unmatched_rows

Unnamed: 0,month,town_x,flat_type,block,street_name_x,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,...,Year_x,Quarter_x,Yr_Qt,Year_y,Quarter_y,Index,Real_resale_price,town_y,DT2,town_oldver


In [7]:
# Ordinal encoding of flat type so it can enter the regression as one numeric
# regressor. This assumes a linear relationship between flat type and resale
# price, with EXECUTIVE > MULTI-GENERATION > 5 ROOM in general.
# Both spellings of multi-generation are mapped to the same code.
FLAT_TYPE_CODES = {
    '1 ROOM': 1,
    '2 ROOM': 2,
    '3 ROOM': 3,
    '4 ROOM': 4,
    '5 ROOM': 5,
    'MULTI-GENERATION': 6,
    'MULTI GENERATION': 6,
    'EXECUTIVE': 7,
}

# Flat types that are not in the mapping (including missing values) are coded 0
RSF_DT['flat_type_n'] = (
    RSF_DT['flat_type'].map(FLAT_TYPE_CODES).fillna(0).astype('int64')
)
# Should print 0: a non-zero count means some flat type was left unencoded
print("no. of unchanged rows:", (RSF_DT['flat_type_n'] == 0).sum())

# Remaining lease in years. Some rows lack remaining-lease data, so it is
# re-created from lease_commence_date, assuming every flat starts with a
# 99-year lease at lease commencement.
LEASE_TENURE_YEARS = 99
RSF_DT['Lease_rem_years'] = LEASE_TENURE_YEARS - (
    RSF_DT['Year_x'] - RSF_DT['lease_commence_date']
)





no. of unchanged rows: 0


In [8]:
# According to Wikipedia, the Downtown Line stage 2 (DT2) stations opened on
# 27 December 2015. The intervention is therefore assumed to start on
# 01-01-2016 (a few days after the official opening date).
# link: https://en.wikipedia.org/wiki/Downtown_MRT_line

# Further analysis: what is the impact of the DT2 line on the flats (not towns)
# it is closest to, i.e. flats whose closest MRT is one of the DT2 stations?
# Station names as they are expected to appear in the Cl_MRT column.
MRT_DT2 = [
    'BUKIT PANJANG LRT STATION',
    'CASHEW MRT STATION',
    'HILLVIEW MRT STATION',
    # NOTE(review): Beauty World (DT5) also opened with stage 2 and was missing
    # from this list; the label follows the naming pattern of the other
    # entries - confirm it against the values present in Cl_MRT.
    'BEAUTY WORLD MRT STATION',
    'KING ALBERT PARK MRT STATION',
    'SIXTH AVENUE MRT STATION',
    'TAN KAH KEE MRT STATION',
    'BOTANIC GARDENS MRT STATION',
    'STEVENS MRT STATION',
    'NEWTON MRT STATION',
    'LITTLE INDIA MRT STATION',
    'ROCHOR MRT STATION',
]

# 1 when the flat's closest MRT is a DT2 station, else 0. The vectorised isin
# gives the same 0/1 values as a row-wise apply without building a Series per row.
RSF_DT['CL_MRT_DT2'] = RSF_DT['Cl_MRT'].isin(MRT_DT2).astype('int64')

# Sanity check: list the DT2 stations that were actually matched. Stations in
# MRT_DT2 that are absent here should be checked against the Cl_MRT values in
# the resale dataset to rule out a naming mismatch.
RSF_DT.loc[RSF_DT['CL_MRT_DT2'] == 1, 'Cl_MRT'].unique()

array(['HILLVIEW MRT STATION', 'ROCHOR MRT STATION',
       'LITTLE INDIA MRT STATION', 'CASHEW MRT STATION',
       'BUKIT PANJANG LRT STATION'], dtype=object)

In [14]:
# Narrow the analysis to the window around the DT2 opening: sales in 2015
# (the year before the intervention) and 2016 (the year after), inclusive.
in_window = (RSF_DT['Year_x'] >= 2015) & (RSF_DT['Year_x'] < 2017)
# .copy() so the column added below is written to an independent frame rather
# than to a slice of the full dataset (avoids SettingWithCopyWarning).
RSF_DT = RSF_DT.loc[in_window].copy()

# Dummy for the intervention period: 1 for sales from 2016 onwards, 0 before
RSF_DT['Post'] = (RSF_DT['Year_x'] >= 2016).astype('int64')

# Prepare the dataset for regression: reset_index() stores the old row labels
# in an 'index' column (kept as a row identifier for PSM), then only the
# columns needed for the models are retained.
# NOTE: this cell replaces RSF_DT with the reduced frame, so it cannot be
# re-run on its own; re-run the notebook from the merge cell instead.
RSF_DT = RSF_DT.reset_index()[[
    'index',
    'floor_area_sqm',
    'flat_type_n',
    'Lease_rem_years',
    'DT2',
    'Post',
    'Real_resale_price',
    'CL_MRT_DT2',
]]


In [19]:
#Propensity score matching (PSM) was not run: the PSM package kept running overtime (the system ran out of memory), so the DiD below is conducted without PSM.
#The snippet is kept inside a string literal, so it is never executed; it records how PSM would be conducted. The plan was to follow it with
#another round of comparison using a summary statistics table, to check that the treatment and control groups are comparable.
#NOTE(review): PsmPy is not imported in the import cell, and psm.logistic_ps(balance = True) appears twice in the snippet - confirm before enabling it.
'''
psm = PsmPy(RSF_DT, treatment='DT2', indx='index', exclude = ['Post'])
psm.logistic_ps(balance = True)
psm.predicted_data
psm.logistic_ps(balance = True)
psm.knn_matched(matcher='propensity_score', replacement=False, caliper=None, drop_unmatched=True)
psm.matched_ids
'''

"\npsm = PsmPy(RSF_DT, treatment='DT2', indx='index', exclude = ['Post'])\npsm.logistic_ps(balance = True)\npsm.predicted_data\npsm.logistic_ps(balance = True)\npsm.knn_matched(matcher='propensity_score', replacement=False, caliper=None, drop_unmatched=True)\npsm.matched_ids\n"

In [22]:
# Difference-in-differences (DiD) on the town-level DT2 dummy, controlling for
# the usual determinants of resale price (floor area, flat type, remaining
# lease years). In the formula language DT2*Post expands to the two main
# effects plus the DT2:Post interaction, whose coefficient is the DiD estimate.
did_formula = 'Real_resale_price ~ Post + DT2 + DT2*Post + floor_area_sqm + flat_type_n +Lease_rem_years'
did_spec = ols(formula=did_formula, data=RSF_DT)
model = did_spec.fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      Real_resale_price   R-squared:                       0.451
Model:                            OLS   Adj. R-squared:                  0.451
Method:                 Least Squares   F-statistic:                     5637.
Date:                Sun, 16 Jun 2024   Prob (F-statistic):               0.00
Time:                        16:17:38   Log-Likelihood:            -5.4794e+05
No. Observations:               41193   AIC:                         1.096e+06
Df Residuals:                   41186   BIC:                         1.096e+06
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept       -3.383e+04   5076.276     

### Further analysis 
What is the impact of the DT2 line on the flats closest to it (flats whose closest MRT is one of the DT2 stations)?

In [27]:
# Difference-in-differences (DiD) on the closest-MRT dummy (CL_MRT_DT2),
# controlling for the usual determinants of resale price (floor area, flat
# type, remaining lease years). CL_MRT_DT2*Post expands to the two main effects
# plus the CL_MRT_DT2:Post interaction, whose coefficient is the DiD estimate.
closest_mrt_formula = 'Real_resale_price ~ Post + CL_MRT_DT2 + CL_MRT_DT2*Post + floor_area_sqm + flat_type_n +Lease_rem_years'
closest_mrt_spec = ols(formula=closest_mrt_formula, data=RSF_DT)
model = closest_mrt_spec.fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      Real_resale_price   R-squared:                       0.441
Model:                            OLS   Adj. R-squared:                  0.441
Method:                 Least Squares   F-statistic:                     5418.
Date:                Sun, 16 Jun 2024   Prob (F-statistic):               0.00
Time:                        16:17:49   Log-Likelihood:            -5.4831e+05
No. Observations:               41193   AIC:                         1.097e+06
Df Residuals:                   41186   BIC:                         1.097e+06
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept       -2.164e+04   5075.157     