In [1]:
import numpy as np
import pandas as pd
import re

# Read data

In [2]:
# Read the rental price data from csv into a dataframe
# (the path is relative to the notebook's working directory)
df = pd.read_csv("data/rental_prices_singapore.csv")

In [3]:
# Show dataframe info: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5360 entries, 0 to 5359
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5360 non-null   object 
 1   address                        5011 non-null   object 
 2   price                          5360 non-null   object 
 3   size                           5360 non-null   object 
 4   bedrooms                       5360 non-null   object 
 5   bathrooms                      4941 non-null   float64
 6   property_type_furnishing_year  5360 non-null   object 
 7   mrt_distance                   4641 non-null   object 
 8   agent_description              5360 non-null   object 
dtypes: float64(1), object(8)
memory usage: 377.0+ KB


In [4]:
# Display the dataframe; pandas truncates a long frame to its first and last
# rows, with "..." marking the rows left out
df

Unnamed: 0,name,address,price,size,bedrooms,bathrooms,property_type_furnishing_year,mrt_distance,agent_description
0,"Brand new Attic Studio, in a Peranakan Conserv...",Lorong 34 Geylang,3000,400 sqft,1,1.0,\nApartment\nFully Furnished\n,,One and only attic studio! Beautifully done up...
1,Astor,51C Lengkong Empat,2000,1130 sqft,Room,,\nApartment\nFully Furnished\n,11 mins (810 m) to DT28 Kaki Bukit MRT,Comes with In House Maid
2,Springhill Terrace,Sunrise avenue,7400,3800 sqft,5,4.0,\nApartment\nFully Furnished\n,,"Close to MRT and short drive to French, Austra..."
3,704 Yishun Avenue 5,704 Yishun Avenue 5,1000,120 sqft,Room,,\nApartment\nFully Furnished\n,9 mins (700 m) to NS13 Yishun MRT,Room for 1 or 2 single ladies
4,Espada,48 Saint Thomas Walk,4300,689 sqft,1,1.0,\nApartment\nFully Furnished\n,6 mins (420 m) to NS23 Somerset MRT,All units virtual online viewing available! An...
...,...,...,...,...,...,...,...,...,...
5355,Avenue South Residence,13 Silat Avenue,7500,947 sqft,3,2.0,\nBungalow House\nPartially Furnished\n,,Newly TOP! Be the first to stay near town!
5356,453B Fernvale Road,453B Fernvale Road,999,150 sqft,Room,,\nBungalow House\nPartially Furnished\n,4 mins (290 m) to SW5 Fernvale LRT,Close to fernvale LRT
5357,Riviere,1 Jiak Kim Street,8000,840 sqft,2,2.0,\nBungalow House\nPartially Furnished\n,5 mins (370 m) to TE16 Havelock MRT,Brand new pool facing unit!
5358,Bukit Villas,1 Rasok Drive,7100,"3025 sqft (floor), 2256 sqft (land)",5,4.0,\nBungalow House\nPartially Furnished\n,,"Tranquil surroundings, facilities included! Po..."


# Duplicates

In [5]:
# Diagnose duplicates: True = row repeats an earlier row in every column,
# False = first occurrence of a row
df.duplicated().value_counts()

False    5082
True      278
dtype: int64

In [None]:
# Remove duplicates, keeping the first occurrence of each row.
# The index is not reset, so the original row labels are kept (with gaps).
# NOTE(review): .copy() is presumably here so that columns added later are
# written to an independent frame -- confirm.
df = df.drop_duplicates().copy()

In [6]:
# Show dataframe info again to check row and non-null counts after deduplication
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5082 entries, 0 to 5359
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5082 non-null   object 
 1   address                        4754 non-null   object 
 2   price                          5082 non-null   object 
 3   size                           5082 non-null   object 
 4   bedrooms                       5082 non-null   object 
 5   bathrooms                      4680 non-null   float64
 6   property_type_furnishing_year  5082 non-null   object 
 7   mrt_distance                   4418 non-null   object 
 8   agent_description              5082 non-null   object 
dtypes: float64(1), object(8)
memory usage: 397.0+ KB


# Feature extraction

## Property type

In [7]:
# Create function to extract property type 
def extract_type(string):
    """Return the property type named in a raw listing string.

    The raw ``property_type_furnishing_year`` field packs several pieces of
    information into one newline-separated string (for example
    ``"\\nApartment\\nFully Furnished\\n"``). This function looks for one of
    the known property type labels inside that string.

    Parameters
    ----------
    string : str
        Raw text to search.

    Returns
    -------
    str or float
        The first matching label in priority order, or ``np.nan`` when no
        known label occurs in ``string``.
    """
    # Order matters: a label that contains another label as a substring must
    # be tested first, otherwise the shorter one shadows it.
    #   "Executive Condominium" contains "Condominium"
    #   "Semi-Detached House"   contains "Detached House"
    property_types = (
        "Executive Condominium",
        "Condominium",
        "Apartment",
        "HDB Flat",
        "Semi-Detached House",
        "Good Class Bungalow",
        "Corner Terrace",
        "Detached House",
        "Terraced House",
        "Bungalow House",
        "Cluster House",
    )
    for property_type in property_types:
        if property_type in string:
            return property_type
    return np.nan

In [15]:
# Apply extract_type to every raw string to create the property type column
df["property_type"] = df["property_type_furnishing_year"].apply(extract_type)

In [8]:
# Frequencies of property types, most frequent first
# (value_counts() leaves out missing values by default)
df["property_type"].value_counts()

Condominium            2770
Apartment               905
HDB Flat                763
Semi-Detached House     188
Good Class Bungalow     174
Corner Terrace          103
Detached House           79
Terraced House           40
Bungalow House           40
Cluster House            20
Name: property_type, dtype: int64

## Furnishing

In [16]:
# Create function to extract information about furnishing  
def extract_furnishing(string):
    """Return the furnishing label found in a raw listing string.

    Parameters
    ----------
    string : str
        Raw text to search.

    Returns
    -------
    str or float
        "Fully Furnished", "Partially Furnished" or "Unfurnished" (checked in
        that order), or ``np.nan`` when none of them occurs in ``string``.
    """
    furnishing_labels = ("Fully Furnished", "Partially Furnished", "Unfurnished")
    for label in furnishing_labels:
        if label in string:
            return label
    return np.nan

In [17]:
# Apply extract_furnishing to every raw string to create the furnishing column
df["furnishing"] = df["property_type_furnishing_year"].apply(extract_furnishing)

In [18]:
# Frequencies of furnishing, most frequent first
# (value_counts() leaves out missing values by default)
df["furnishing"].value_counts()

Partially Furnished    3058
Fully Furnished        1600
Unfurnished              84
Name: furnishing, dtype: int64

## Built year

In [19]:
# Create function to extract built year 
def extract_year(string):
    """Return the first standalone four-digit number in a string as an int.

    Parameters
    ----------
    string : str
        Raw text to search.

    Returns
    -------
    int or float
        The first run of exactly four digits delimited by word boundaries,
        converted to ``int``, or ``np.nan`` when there is no such run.
    """
    found = re.search(r"\b\d{4}\b", string)
    if found is None:
        return np.nan
    return int(found.group())

In [20]:
# Apply extract_year to every raw string to create the built year column.
# Rows without a year get NaN, which is why the column is stored as float64.
df["year"] = df["property_type_furnishing_year"].apply(extract_year)

In [21]:
# Frequencies of built year, most frequent first
# (value_counts() leaves out missing values by default)
df["year"].value_counts()

2023.0    391
2024.0    314
2013.0    297
2017.0    293
2010.0    248
2011.0    239
2014.0    178
2007.0    167
2016.0    160
2009.0    145
2022.0    142
2021.0    140
2015.0    140
2012.0    112
2008.0     99
1997.0     87
2019.0     80
2005.0     80
1995.0     80
1999.0     80
2000.0     63
2001.0     62
1985.0     60
1994.0     60
2004.0     60
1984.0     60
1993.0     60
1986.0     43
1996.0     40
1998.0     40
1987.0     40
2018.0     40
1981.0     40
2003.0     40
1975.0     38
1992.0     20
1971.0     20
1991.0     20
2006.0     20
1983.0     20
1974.0     20
1979.0     20
2002.0     20
1977.0     20
Name: year, dtype: int64

In [13]:
# Show dataframe info again to check the three extracted columns
# (property_type, furnishing, year) and their non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5082 entries, 0 to 5359
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5082 non-null   object 
 1   address                        4754 non-null   object 
 2   price                          5082 non-null   object 
 3   size                           5082 non-null   object 
 4   bedrooms                       5082 non-null   object 
 5   bathrooms                      4680 non-null   float64
 6   property_type_furnishing_year  5082 non-null   object 
 7   mrt_distance                   4418 non-null   object 
 8   agent_description              5082 non-null   object 
 9   property_type                  5082 non-null   object 
 10  furnishing                     4742 non-null   object 
 11  year                           4398 non-null   float64
dtypes: float64(2), object(10)
memory usage: 516.1+ KB

In [14]:
# Display the dataframe with the extracted columns; pandas truncates a long
# frame to its first and last rows, with "..." marking the rows left out
df

Unnamed: 0,name,address,price,size,bedrooms,bathrooms,property_type_furnishing_year,mrt_distance,agent_description,property_type,furnishing,year
0,"Brand new Attic Studio, in a Peranakan Conserv...",Lorong 34 Geylang,3000,400 sqft,1,1.0,\nApartment\nFully Furnished\n,,One and only attic studio! Beautifully done up...,Apartment,Fully Furnished,
1,Astor,51C Lengkong Empat,2000,1130 sqft,Room,,\nApartment\nFully Furnished\n,11 mins (810 m) to DT28 Kaki Bukit MRT,Comes with In House Maid,Apartment,Fully Furnished,
2,Springhill Terrace,Sunrise avenue,7400,3800 sqft,5,4.0,\nApartment\nFully Furnished\n,,"Close to MRT and short drive to French, Austra...",Apartment,Fully Furnished,
3,704 Yishun Avenue 5,704 Yishun Avenue 5,1000,120 sqft,Room,,\nApartment\nFully Furnished\n,9 mins (700 m) to NS13 Yishun MRT,Room for 1 or 2 single ladies,Apartment,Fully Furnished,
4,Espada,48 Saint Thomas Walk,4300,689 sqft,1,1.0,\nApartment\nFully Furnished\n,6 mins (420 m) to NS23 Somerset MRT,All units virtual online viewing available! An...,Apartment,Fully Furnished,
...,...,...,...,...,...,...,...,...,...,...,...,...
5355,Avenue South Residence,13 Silat Avenue,7500,947 sqft,3,2.0,\nBungalow House\nPartially Furnished\n,,Newly TOP! Be the first to stay near town!,Bungalow House,Partially Furnished,
5356,453B Fernvale Road,453B Fernvale Road,999,150 sqft,Room,,\nBungalow House\nPartially Furnished\n,4 mins (290 m) to SW5 Fernvale LRT,Close to fernvale LRT,Bungalow House,Partially Furnished,
5357,Riviere,1 Jiak Kim Street,8000,840 sqft,2,2.0,\nBungalow House\nPartially Furnished\n,5 mins (370 m) to TE16 Havelock MRT,Brand new pool facing unit!,Bungalow House,Partially Furnished,
5358,Bukit Villas,1 Rasok Drive,7100,"3025 sqft (floor), 2256 sqft (land)",5,4.0,\nBungalow House\nPartially Furnished\n,,"Tranquil surroundings, facilities included! Po...",Bungalow House,Partially Furnished,
