In [13]:
import numpy as np
import pandas as pd

In [2]:
# Read data from csv
df = pd.read_csv("data/rental_prices_singapore.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5360 entries, 0 to 5359
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5360 non-null   object 
 1   address                        5011 non-null   object 
 2   price                          5360 non-null   object 
 3   size                           5360 non-null   object 
 4   bedrooms                       5360 non-null   object 
 5   bathrooms                      4941 non-null   float64
 6   property_type_furnishing_year  5360 non-null   object 
 7   mrt_distance                   4641 non-null   object 
 8   agent_description              5360 non-null   object 
dtypes: float64(1), object(8)
memory usage: 377.0+ KB


In [4]:
df[:10]

Unnamed: 0,name,address,price,size,bedrooms,bathrooms,property_type_furnishing_year,mrt_distance,agent_description
0,"Brand new Attic Studio, in a Peranakan Conserv...",Lorong 34 Geylang,3000,400 sqft,1,1.0,\nApartment\nFully Furnished\n,,One and only attic studio! Beautifully done up...
1,Astor,51C Lengkong Empat,2000,1130 sqft,Room,,\nApartment\nFully Furnished\n,11 mins (810 m) to DT28 Kaki Bukit MRT,Comes with In House Maid
2,Springhill Terrace,Sunrise avenue,7400,3800 sqft,5,4.0,\nApartment\nFully Furnished\n,,"Close to MRT and short drive to French, Austra..."
3,704 Yishun Avenue 5,704 Yishun Avenue 5,1000,120 sqft,Room,,\nApartment\nFully Furnished\n,9 mins (700 m) to NS13 Yishun MRT,Room for 1 or 2 single ladies
4,Espada,48 Saint Thomas Walk,4300,689 sqft,1,1.0,\nApartment\nFully Furnished\n,6 mins (420 m) to NS23 Somerset MRT,All units virtual online viewing available! An...
5,Elizabeth Heights,57 Cairnhill Road,10800,2646 sqft,4,4.0,\nApartment\nFully Furnished\n,8 mins (600 m) to NS22 Orchard MRT,"Exclusive Penthouse unit, $400k reno, unblocke..."
6,38B Eunos Road 2,38B Eunos Road 2,4800,1001 sqft,3,2.0,\nApartment\nFully Furnished\n,4 mins (310 m) to EW7 Eunos MRT,Newly built HDB flat for rent - Walking distan...
7,Vacanza @ East,48 Lengkong Tujoh,4600,1023 sqft,3,2.0,\nApartment\nFully Furnished\n,10 mins (770 m) to DT29 Bedok North MRT,Super rare 3 bedrooms 2 bathrooms pool facing ...
8,Blossoms @ Woodleigh,12 Woodleigh Close,8000,1410 sqft,4,3.0,\nApartment\nFully Furnished\n,5 mins (370 m) to NE11 Woodleigh MRT,Exclusive 4 bed for rent
9,Barker Road Semi-Detached,Barker Road,15000,"4500 sqft (floor), 5853 sqft (land)",5,5.0,\nApartment\nFully Furnished\n,12 mins (870 m) to NS21 Newton MRT,Barker Road District 11 Super Rare Gem Semi De...


In [6]:
# Diagnose duplicates
df.duplicated().value_counts()

False    5082
True      278
dtype: int64

In [10]:
# Remove duplicates
df = df.drop_duplicates().copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5082 entries, 0 to 5359
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5082 non-null   object 
 1   address                        4754 non-null   object 
 2   price                          5082 non-null   object 
 3   size                           5082 non-null   object 
 4   bedrooms                       5082 non-null   object 
 5   bathrooms                      4680 non-null   float64
 6   property_type_furnishing_year  5082 non-null   object 
 7   mrt_distance                   4418 non-null   object 
 8   agent_description              5082 non-null   object 
 9   property_type                  5082 non-null   object 
dtypes: float64(1), object(9)
memory usage: 436.7+ KB


In [53]:
# Feature extraction: Property type 
df["property_type"] = df["property_type_furnishing_year"].str.split("\n").str[1]

In [49]:
# Feature extraction: Property type 
def extract_type(string):
    if "Condominium" in string:
        return "Condominium"
    elif "Apartment" in string:
        return "Apartment"
    elif "HDB Flat" in string:
        return "HDB Flat"
    elif "Semi-Detached House" in string:
        return "Semi-Detached House"
    elif "Good Class Bungalow" in string:
        return "Good Class Bungalow"
    elif "Corner Terrace" in string:
        return "Corner Terrace"
    elif "Detached House" in string:
        return "Detached House"
    elif "Executive Condominium" in string:
        return "Executive Condominium"
    elif "Terraced House" in string:
        return "Terraced House"
    elif "Bungalow House" in string:
        return "Bungalow House"
    elif "Cluster House" in string:
        return "Cluster House"
    else:
        return np.nan
        
df["property_type"] = df["property_type_furnishing_year"].apply(extract_type)

In [54]:
# Frequencies of property types
df["property_type"].value_counts()

Condominium              2069
Apartment                 865
HDB Flat                  763
New Project: 2023         367
New Project: 2024         314
Semi-Detached House       188
Good Class Bungalow       174
Corner Terrace            103
Detached House             79
Executive Condominium      60
Terraced House             40
Bungalow House             40
Cluster House              20
Name: property_type, dtype: int64

In [52]:
# Frequencies of property types
df["property_type"].value_counts()

Condominium            2770
Apartment               905
HDB Flat                763
Semi-Detached House     188
Good Class Bungalow     174
Corner Terrace          103
Detached House           79
Terraced House           40
Bungalow House           40
Cluster House            20
Name: property_type, dtype: int64

In [37]:
# Feature extraction: Furnishing 
def extract_furnishing(string):
    if "Fully Furnished" in string:
        return "Fully Furnished"
    elif "Partially Furnished" in string:
        return "Partially Furnished"
    elif "Unfurnished" in string:
        return "Unfurnished"
    else:
        return np.nan
        
df["furnishing"] = df["property_type_furnishing_year"].apply(extract_furnishing)

In [43]:
# Frequencies of furnishing
df["furnishing"].value_counts()

Partially Furnished    3058
Fully Furnished        1600
Unfurnished              84
Name: furnishing, dtype: int64

In [9]:
# Feature extraction: Built year 
df["year"] = df["property_type_furnishing_year"].str.split("\n").str[3]

In [10]:
# Frequencies of built year
df["year"].value_counts()

                       1029
Partially Furnished     480
Built: 2013             323
Built: 2017             300
Built: 2010             266
Built: 2011             220
Built: 2007             182
Built: 2009             146
Built: 2016             140
Built: 2015             140
Built: 2021             140
Built: 1997             123
Built: 2014             120
Built: 2012             120
Fully Furnished         120
Built: 2022             102
Built: 2008             100
Built: 2005             100
Built: 1999              80
Built: 1995              80
Built: 2019              80
Built: 2000              63
Built: 2001              63
Built: 1994              60
Built: 1993              60
Built: 2004              60
Built: 1984              60
Built: 1985              60
Built: 1996              40
Built: 1998              40
Built: 1987              40
Built: 2018              40
Completion: 2023         40
Built: 1981              40
Built: 1975              40
Built: 2003         