In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import requests
import json
import re
from datetime import datetime

In [60]:
df = pd.read_csv("../data/URA_data.csv")
df

Unnamed: 0,areaSqm,leaseDate,propertyType,district,areaSqft,noOfBedRoom,rent,street,x,y,project,refPeriod,psf75,median,psf25
0,120-130,721,Non-landed Properties,15,1300-1400,3,4200.0,ELLIOT ROAD,38865.17182,32621.92134,ELLIOT AT THE EAST COAST,2021Q3,3.09,2.78,2.62
1,90-100,721,Non-landed Properties,15,900-1000,2,3200.0,ELLIOT ROAD,38865.17182,32621.92134,ELLIOT AT THE EAST COAST,2021Q3,3.09,2.78,2.62
2,190-200,721,Non-landed Properties,15,2100-2200,3,5000.0,ELLIOT ROAD,38865.17182,32621.92134,ELLIOT AT THE EAST COAST,2021Q3,3.09,2.78,2.62
3,120-130,721,Non-landed Properties,15,1300-1400,3,3500.0,ELLIOT ROAD,38865.17182,32621.92134,ELLIOT AT THE EAST COAST,2021Q3,3.09,2.78,2.62
4,240-250,921,Non-landed Properties,15,2600-2700,4,5500.0,ELLIOT ROAD,38865.17182,32621.92134,ELLIOT AT THE EAST COAST,2021Q3,3.09,2.78,2.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281778,60-70,1223,Non-landed Properties,8,600-700,3,5500.0,PERUMAL ROAD,30477.87296,32796.58779,UPTOWN @ FARRER,2023Q4,7.43,7.15,6.49
281779,40-50,1023,Non-landed Properties,8,500-600,2,4000.0,PERUMAL ROAD,30477.87296,32796.58779,UPTOWN @ FARRER,2023Q4,7.43,7.15,6.49
281780,70-80,1023,Non-landed Properties,8,700-800,3,5300.0,PERUMAL ROAD,30477.87296,32796.58779,UPTOWN @ FARRER,2023Q4,7.43,7.15,6.49
281781,40-50,1123,Non-landed Properties,8,500-600,2,4000.0,PERUMAL ROAD,30477.87296,32796.58779,UPTOWN @ FARRER,2023Q4,7.43,7.15,6.49


# Transform

In [61]:
# IQR is NaN for properties with less than 10 rental contracts for the reference period
# It is a significant amount of data. Will not drop them.

# areaSqft are numerical intervals that are consecutive and mostly equal
# convert these to a single number (the midpoint of the interval) and use this as a quantitative variable
# there are some unbounded intervals e.g. <=1000, >3000, >8000 which will be replaced with the boundary itself
def format_area(s):
    try:
        lower, upper = s.split("-")
        lower = int(lower)
        upper = int(upper)
        return (lower + upper) / 2
    except ValueError:
        pattern = r"(^<|>=|<=|>)(\d+)"
        match = re.match(pattern, s)
        return float(match.group(2))
df["areaSqft_formatted"] = df["areaSqft"].apply(format_area)

# change leaseDate to datetime and extract year, quarter, month as new features
df["leaseDate"] = df.apply(lambda row: datetime(int(row["refPeriod"][:4]), row["leaseDate"] // 100, int(row["refPeriod"][-1])), axis=1)
df["leaseYear"] = df["leaseDate"].dt.year
df["leaseQuarter"] = df["leaseDate"].dt.quarter
df["leaseMonth"] = df["leaseDate"].dt.month

# Since the IQR is in per square feet, we will not use areaSqm. 
# Original areaSqft will also not be used
df = df.drop(columns=["areaSqft", "areaSqm"])

# df.to_csv("../data/rental_data_for_BI.csv", index=False)

# one hot encoding of categorical features district and propertyType
df['district'] = df['district'].astype('category')
df['propertyType'] = df['propertyType'].astype('category')
df = pd.get_dummies(df, columns=["district", "propertyType"], drop_first=True)

# df.to_csv("../data/URA_data_transformed.csv", index=False)


In [62]:
df

Unnamed: 0,leaseDate,noOfBedRoom,rent,street,x,y,project,refPeriod,psf75,median,...,district_22,district_23,district_25,district_26,district_27,district_28,propertyType_Executive Condominium,propertyType_Non-landed Properties,propertyType_Semi-Detached House,propertyType_Terrace House
0,2021-07-03,3,4200.0,ELLIOT ROAD,38865.17182,32621.92134,ELLIOT AT THE EAST COAST,2021Q3,3.09,2.78,...,False,False,False,False,False,False,False,True,False,False
1,2021-07-03,2,3200.0,ELLIOT ROAD,38865.17182,32621.92134,ELLIOT AT THE EAST COAST,2021Q3,3.09,2.78,...,False,False,False,False,False,False,False,True,False,False
2,2021-07-03,3,5000.0,ELLIOT ROAD,38865.17182,32621.92134,ELLIOT AT THE EAST COAST,2021Q3,3.09,2.78,...,False,False,False,False,False,False,False,True,False,False
3,2021-07-03,3,3500.0,ELLIOT ROAD,38865.17182,32621.92134,ELLIOT AT THE EAST COAST,2021Q3,3.09,2.78,...,False,False,False,False,False,False,False,True,False,False
4,2021-09-03,4,5500.0,ELLIOT ROAD,38865.17182,32621.92134,ELLIOT AT THE EAST COAST,2021Q3,3.09,2.78,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281778,2023-12-04,3,5500.0,PERUMAL ROAD,30477.87296,32796.58779,UPTOWN @ FARRER,2023Q4,7.43,7.15,...,False,False,False,False,False,False,False,True,False,False
281779,2023-10-04,2,4000.0,PERUMAL ROAD,30477.87296,32796.58779,UPTOWN @ FARRER,2023Q4,7.43,7.15,...,False,False,False,False,False,False,False,True,False,False
281780,2023-10-04,3,5300.0,PERUMAL ROAD,30477.87296,32796.58779,UPTOWN @ FARRER,2023Q4,7.43,7.15,...,False,False,False,False,False,False,False,True,False,False
281781,2023-11-04,2,4000.0,PERUMAL ROAD,30477.87296,32796.58779,UPTOWN @ FARRER,2023Q4,7.43,7.15,...,False,False,False,False,False,False,False,True,False,False
