In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import requests
from dotenv import load_dotenv
import os

# Load data

In [2]:
# Read data from csv
df = pd.read_csv("data/rental_prices_singapore.csv")

In [3]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5360 entries, 0 to 5359
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5360 non-null   object 
 1   address                        5011 non-null   object 
 2   price                          5360 non-null   object 
 3   size                           5360 non-null   object 
 4   bedrooms                       5360 non-null   object 
 5   bathrooms                      4941 non-null   float64
 6   property_type_furnishing_year  5360 non-null   object 
 7   mrt_distance                   4641 non-null   object 
 8   agent_description              5360 non-null   object 
dtypes: float64(1), object(8)
memory usage: 377.0+ KB


In [4]:
# Show top five rows
df.head()

Unnamed: 0,name,address,price,size,bedrooms,bathrooms,property_type_furnishing_year,mrt_distance,agent_description
0,"Brand new Attic Studio, in a Peranakan Conserv...",Lorong 34 Geylang,3000,400 sqft,1,1.0,\nApartment\nFully Furnished\n,,One and only attic studio! Beautifully done up...
1,Astor,51C Lengkong Empat,2000,1130 sqft,Room,,\nApartment\nFully Furnished\n,11 mins (810 m) to DT28 Kaki Bukit MRT,Comes with In House Maid
2,Springhill Terrace,Sunrise avenue,7400,3800 sqft,5,4.0,\nApartment\nFully Furnished\n,,"Close to MRT and short drive to French, Austra..."
3,704 Yishun Avenue 5,704 Yishun Avenue 5,1000,120 sqft,Room,,\nApartment\nFully Furnished\n,9 mins (700 m) to NS13 Yishun MRT,Room for 1 or 2 single ladies
4,Espada,48 Saint Thomas Walk,4300,689 sqft,1,1.0,\nApartment\nFully Furnished\n,6 mins (420 m) to NS23 Somerset MRT,All units virtual online viewing available! An...


# Remove duplicates

In [6]:
# Diagnose duplicates
df.duplicated(subset=["name", "price", "size"]).value_counts()

True     3655
False    1705
dtype: int64

In [7]:
# Remove duplicates
df = df.drop_duplicates(subset=["name", "price", "size"]).copy()

In [8]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1705 entries, 0 to 5019
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           1705 non-null   object 
 1   address                        1577 non-null   object 
 2   price                          1705 non-null   object 
 3   size                           1705 non-null   object 
 4   bedrooms                       1705 non-null   object 
 5   bathrooms                      1536 non-null   float64
 6   property_type_furnishing_year  1705 non-null   object 
 7   mrt_distance                   1489 non-null   object 
 8   agent_description              1705 non-null   object 
dtypes: float64(1), object(8)
memory usage: 133.2+ KB


# Data enrichment: Fill in missing addresses

In [11]:
# Load environment variables from .env file
load_dotenv()

True

In [12]:
# Get Google Maps API key from .env 
google_maps_api_key = os.getenv("google_maps_api_key")

In [13]:
# Create function to get missing address based on the property name via Google Maps API
def get_missing_address(row):
    """Return the address of a listing, looking it up by name when it is missing.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys "address" and "name".

    Returns
    -------
    The existing address when one is present. Otherwise the
    "formatted_address" of the first candidate returned by the Google Maps
    Find Place API for "<name>, Singapore", or the original (missing) value
    when the API response status is not "OK".
    """
    existing_address = row["address"]

    # An address is already present: nothing to look up
    if not pd.isna(existing_address):
        return existing_address

    # Query the Google Maps Find Place API with the property name
    find_place_url = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json"
    query = dict(
        input=f"{row['name']}, Singapore",
        inputtype="textquery",
        fields="formatted_address",
        key=google_maps_api_key,
    )
    payload = requests.get(find_place_url, params=query).json()

    # Lookup failed: notify and keep the original value (i.e. np.nan)
    if payload["status"] != "OK":
        print(f"No address found for {row['name']}")
        return existing_address

    # Lookup succeeded: use the address of the first candidate
    return payload["candidates"][0]["formatted_address"]

In [14]:
# Apply function to get missing addresses and store them in "address_new" column
# Cost: 2.18$. More precisely, 0.017$ per API call for 128 missing addresses.
# df["address_new"] = df.apply(get_missing_address, axis=1)

No address found for Belmont/Morley Tropical Resort Style Good Class Bungalow with Pool
No address found for Renovated Detached within 1km of Nanyang Primary
No address found for ⭐ Spacious Unit For Rent In Orchard, 4 Ensuite Bedrooms! ⭐
No address found for 现代最新3个卧室顶层豪房出租Brand New 3 Bedroom duplex penthouse for Rent
No address found for Hill Top Good Class Bungalow Botanic Gardens/Chee Hoon
No address found for Colonial Cosy 2 Bedroom Landed for rent
No address found for Newly Renovated House for Rent in Fernhill Enclave
No address found for Flexible Monthly Rental. Serviced Apartment Near Telok Ayer Station (4 Mins Away)
No address found for Monthly Flexible  Rental Apartment Near Maxwell Station in Chinatown
No address found for Monthly Flexible  Rental Apartment Near Maxwell Station in Chinatown
No address found for See Video In Listing ! Goodman  Wilkinson- Lovely 6 Bedrms Semi detached Near Canadian School.
No address found for ❤️Embassy Class GCB Mansion Near Dempsey Village❤️
N

In [18]:
# Save enriched dataframe as csv
# df.to_csv("data/preprocessing/rental_prices_singapore_preprocessing_1.csv", index=False)

In [19]:
# Load enriched data
df = pd.read_csv("data/preprocessing/rental_prices_singapore_preprocessing_1.csv")

In [20]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1705 entries, 0 to 1704
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           1705 non-null   object 
 1   address                        1577 non-null   object 
 2   price                          1705 non-null   object 
 3   size                           1705 non-null   object 
 4   bedrooms                       1705 non-null   object 
 5   bathrooms                      1536 non-null   float64
 6   property_type_furnishing_year  1705 non-null   object 
 7   mrt_distance                   1489 non-null   object 
 8   agent_description              1705 non-null   object 
 9   address_new                    1682 non-null   object 
dtypes: float64(1), object(9)
memory usage: 133.3+ KB


In [21]:
# Percent missing addresses before
print(f'Percent missing addresses before: {100 * pd.isna(df["address"]).sum() / pd.isna(df["address"]).count():.1f}%')

Percent missing addresses before: 7.5%


In [22]:
# Percent missing addresses after 
print(f'Percent missing addresses after: {100 * pd.isna(df["address_new"]).sum() / pd.isna(df["address_new"]).count():.1f}%')

Percent missing addresses after: 1.3%


In [24]:
# Drop the 23 rows whose address is still missing after the enrichment
df.dropna(subset=["address_new"], inplace=True)

In [25]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1682 entries, 0 to 1704
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           1682 non-null   object 
 1   address                        1577 non-null   object 
 2   price                          1682 non-null   object 
 3   size                           1682 non-null   object 
 4   bedrooms                       1682 non-null   object 
 5   bathrooms                      1514 non-null   float64
 6   property_type_furnishing_year  1682 non-null   object 
 7   mrt_distance                   1473 non-null   object 
 8   agent_description              1682 non-null   object 
 9   address_new                    1682 non-null   object 
dtypes: float64(1), object(9)
memory usage: 144.5+ KB


# Feature engineering

## Latitude and longitude

In [27]:
# Create function to get latitude and longitude from an address 
def get_latitude_longitude(address):
    """Geocode an address with the Google Maps Geocoding API.

    Parameters
    ----------
    address : str
        Address to geocode; ", Singapore" is appended before the request.

    Returns
    -------
    tuple
        (latitude, longitude) of the first geocoding result, or
        (np.nan, np.nan) when the response status is not "OK".
    """
    # Send the Geocoding API request and decode the JSON response
    geocoding_url = "https://maps.googleapis.com/maps/api/geocode/json"
    query = {"address": f"{address}, Singapore", "key": google_maps_api_key}
    payload = requests.get(geocoding_url, params=query).json()

    # Request failed: report it and return missing values
    if payload["status"] != "OK":
        print(f"Geocoding request failed for {address}")
        return (np.nan, np.nan)

    # Request succeeded: read the coordinates of the first result
    coordinates = payload["results"][0]["geometry"]["location"]
    return (coordinates["lat"], coordinates["lng"])

In [28]:
# Apply function to create latitude and longitude column 
# Cost: 8.41$. More precisely, 0.005$ per API call for 1682 addresses.
# df[["latitude", "longitude"]] = df["address_new"].apply(get_latitude_longitude).apply(pd.Series)

Geocoding request failed for 111 Tampines Road
Geocoding request failed for 28 Leonie Hill


In [29]:
# Save enriched dataframe as csv
# df.to_csv("data/preprocessing/rental_prices_singapore_preprocessing_2.csv", index=False)

In [30]:
# Load enriched data
df = pd.read_csv("data/preprocessing/rental_prices_singapore_preprocessing_2.csv")

In [31]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           1682 non-null   object 
 1   address                        1577 non-null   object 
 2   price                          1682 non-null   object 
 3   size                           1682 non-null   object 
 4   bedrooms                       1682 non-null   object 
 5   bathrooms                      1514 non-null   float64
 6   property_type_furnishing_year  1682 non-null   object 
 7   mrt_distance                   1473 non-null   object 
 8   agent_description              1682 non-null   object 
 9   address_new                    1682 non-null   object 
 10  latitude                       1680 non-null   float64
 11  longitude                      1680 non-null   float64
dtypes: float64(3), object(9)
memory usage: 157.8+ KB

## Meters to central business district

In [33]:
# Create function to get meters to central business district 
def get_meters_to_cbd(row):
    """Return the distance in meters from a property to the central business district.

    The distance is requested from the Google Maps Distance Matrix API, with
    the property coordinates as origin and Raffles Place as destination.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "latitude" and "longitude"; "address_new" is only used
        in the message printed when a coordinate is missing.

    Returns
    -------
    The "value" of the distance reported by the API, or np.nan when a
    coordinate is missing, the request fails, or the response contains no
    distance for the origin/destination pair.
    """
    # Get latitude and longitude of the property
    property_latitude = row["latitude"]
    property_longitude = row["longitude"]

    # Return a missing value if latitude or longitude is missing
    if pd.isna(property_latitude) or pd.isna(property_longitude):
        print(f"Property latitude or longitude missing for {row['address_new']}")
        return np.nan

    # Latitude and longitude of central business district (i.e. Raffles Place)
    cbd_latitude = 1.284184
    cbd_longitude = 103.85151

    # Base URL for the Google Maps Distance Matrix API
    base_url = "https://maps.googleapis.com/maps/api/distancematrix/json"

    # Parameters for the Distance Matrix API request
    params = {
        "origins": f"{property_latitude},{property_longitude}",
        "destinations": f"{cbd_latitude},{cbd_longitude}",
        "key": google_maps_api_key
    }

    # Send the Distance Matrix API request and store the response.
    # The timeout keeps one stalled connection from blocking the whole apply;
    # a failed request yields a missing value instead of aborting the run.
    try:
        response = requests.get(base_url, params=params, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError) as error:
        print(f"Distance Matrix request failed: {error}")
        return np.nan

    # Process the response to get the distance. An element can be present
    # without a "distance" entry (e.g. status "ZERO_RESULTS"), so every level
    # is checked before it is indexed.
    rows = data.get("rows") or []
    elements = (rows[0].get("elements") or []) if rows else []
    distance = elements[0].get("distance") if elements else None
    if not distance or "value" not in distance:
        print("No distance information available.")
        return np.nan

    meters_to_cbd = distance["value"]
    print(f"Distance between property and CBD: {meters_to_cbd} meters")
    return meters_to_cbd

In [34]:
# Apply function to create the "meters_to_cbd" column 
# Cost: 8.4$. More precisely, 0.005$ per distance for 1680 distances.
# df["meters_to_cbd"] = df.apply(get_meters_to_cbd, axis=1)

Distance between property and CBD: 6744 meters
Distance between property and CBD: 14317 meters
Distance between property and CBD: 15497 meters
Distance between property and CBD: 22825 meters
Distance between property and CBD: 3070 meters
Distance between property and CBD: 4366 meters
Distance between property and CBD: 13990 meters
Distance between property and CBD: 14464 meters
Distance between property and CBD: 11045 meters
Distance between property and CBD: 6559 meters
Distance between property and CBD: 7735 meters
Distance between property and CBD: 1393 meters
Distance between property and CBD: 13491 meters
Distance between property and CBD: 23762 meters
Distance between property and CBD: 17661 meters
Distance between property and CBD: 23438 meters
Distance between property and CBD: 20684 meters
Distance between property and CBD: 15839 meters
Distance between property and CBD: 3646 meters
Distance between property and CBD: 16588 meters
Distance between property and CBD: 8424 meters


Distance between property and CBD: 4890 meters
Distance between property and CBD: 5525 meters
Distance between property and CBD: 2466 meters
Distance between property and CBD: 9753 meters
Distance between property and CBD: 17709 meters
Distance between property and CBD: 4077 meters
Distance between property and CBD: 4288 meters
Distance between property and CBD: 20081 meters
Distance between property and CBD: 4451 meters
Distance between property and CBD: 15143 meters
Distance between property and CBD: 15243 meters
Distance between property and CBD: 8357 meters
Distance between property and CBD: 8810 meters
Distance between property and CBD: 15856 meters
Distance between property and CBD: 15751 meters
Distance between property and CBD: 7334 meters
Distance between property and CBD: 16525 meters
Distance between property and CBD: 28097 meters
Distance between property and CBD: 8586 meters
Distance between property and CBD: 7200 meters
Distance between property and CBD: 5229 meters
Dista

Distance between property and CBD: 13041 meters
Distance between property and CBD: 5255 meters
Distance between property and CBD: 2008 meters
Distance between property and CBD: 9144 meters
Distance between property and CBD: 8502 meters
Distance between property and CBD: 24818 meters
Distance between property and CBD: 13955 meters
Distance between property and CBD: 6223 meters
Distance between property and CBD: 7934 meters
Distance between property and CBD: 14952 meters
Distance between property and CBD: 6355 meters
Distance between property and CBD: 15143 meters
Distance between property and CBD: 11448 meters
Distance between property and CBD: 11568 meters
Distance between property and CBD: 8357 meters
Distance between property and CBD: 14781 meters
Distance between property and CBD: 5150 meters
Distance between property and CBD: 6929 meters
Distance between property and CBD: 11913 meters
Distance between property and CBD: 2087 meters
Distance between property and CBD: 4288 meters
Dist

Distance between property and CBD: 10075 meters
Distance between property and CBD: 16466 meters
Distance between property and CBD: 11691 meters
Distance between property and CBD: 5408 meters
Distance between property and CBD: 5408 meters
Distance between property and CBD: 5408 meters
Distance between property and CBD: 5408 meters
Distance between property and CBD: 4890 meters
Distance between property and CBD: 4890 meters
Distance between property and CBD: 4890 meters
Distance between property and CBD: 13434 meters
Distance between property and CBD: 5707 meters
Distance between property and CBD: 13251 meters
Distance between property and CBD: 18117 meters
Distance between property and CBD: 12025 meters
Distance between property and CBD: 4282 meters
Distance between property and CBD: 6972 meters
Distance between property and CBD: 19715 meters
Distance between property and CBD: 4702 meters
Distance between property and CBD: 13491 meters
Distance between property and CBD: 1308 meters
Dist

Distance between property and CBD: 13251 meters
Distance between property and CBD: 9833 meters
Distance between property and CBD: 20254 meters
Distance between property and CBD: 13253 meters
Distance between property and CBD: 11751 meters
Distance between property and CBD: 12856 meters
Distance between property and CBD: 21614 meters
Distance between property and CBD: 5626 meters
Distance between property and CBD: 19073 meters
Distance between property and CBD: 2415 meters
Distance between property and CBD: 2539 meters
Distance between property and CBD: 2008 meters
Distance between property and CBD: 1204 meters
Distance between property and CBD: 3034 meters
Distance between property and CBD: 7122 meters
Distance between property and CBD: 3261 meters
Distance between property and CBD: 16842 meters
Distance between property and CBD: 16559 meters
Distance between property and CBD: 17706 meters
Distance between property and CBD: 5616 meters
Distance between property and CBD: 3034 meters
Dis

Distance between property and CBD: 11288 meters
Distance between property and CBD: 5410 meters
Distance between property and CBD: 21449 meters
Distance between property and CBD: 11416 meters
Distance between property and CBD: 9123 meters
Distance between property and CBD: 12791 meters
Distance between property and CBD: 17391 meters
Distance between property and CBD: 14253 meters
Distance between property and CBD: 23458 meters
Distance between property and CBD: 1354 meters
Distance between property and CBD: 16127 meters
Distance between property and CBD: 2519 meters
Distance between property and CBD: 13625 meters
Distance between property and CBD: 15039 meters
Distance between property and CBD: 6637 meters
Distance between property and CBD: 12190 meters
Distance between property and CBD: 16877 meters
Distance between property and CBD: 8268 meters
Distance between property and CBD: 7294 meters
Distance between property and CBD: 6567 meters
Distance between property and CBD: 20712 meters


Distance between property and CBD: 2997 meters
Distance between property and CBD: 15839 meters
Distance between property and CBD: 10042 meters
Distance between property and CBD: 6044 meters
Distance between property and CBD: 9657 meters
Distance between property and CBD: 11462 meters
Distance between property and CBD: 4210 meters
Distance between property and CBD: 2383 meters
Distance between property and CBD: 3865 meters
Distance between property and CBD: 11695 meters
Distance between property and CBD: 3010 meters
Distance between property and CBD: 11826 meters
Distance between property and CBD: 2867 meters
Distance between property and CBD: 2867 meters
Distance between property and CBD: 23090 meters
Distance between property and CBD: 2901 meters
Distance between property and CBD: 18748 meters
Distance between property and CBD: 15852 meters
Distance between property and CBD: 7832 meters
Distance between property and CBD: 1204 meters
Distance between property and CBD: 6488 meters
Dista

Distance between property and CBD: 2185 meters
Distance between property and CBD: 2473 meters
Distance between property and CBD: 2927 meters
Distance between property and CBD: 13133 meters
Distance between property and CBD: 13251 meters
Distance between property and CBD: 13491 meters
Distance between property and CBD: 6720 meters
Distance between property and CBD: 13733 meters
Distance between property and CBD: 13251 meters
Distance between property and CBD: 13974 meters
Distance between property and CBD: 2927 meters
Distance between property and CBD: 15218 meters
Distance between property and CBD: 13129 meters
Distance between property and CBD: 7463 meters
Distance between property and CBD: 19522 meters
Distance between property and CBD: 5739 meters
Distance between property and CBD: 2482 meters
Distance between property and CBD: 3246 meters
Distance between property and CBD: 23450 meters
Distance between property and CBD: 3756 meters
Distance between property and CBD: 3750 meters
Dis

Distance between property and CBD: 13251 meters
Distance between property and CBD: 12469 meters
Distance between property and CBD: 3010 meters
Distance between property and CBD: 8359 meters
Distance between property and CBD: 12617 meters
Distance between property and CBD: 13960 meters
Distance between property and CBD: 10407 meters
Distance between property and CBD: 20229 meters
Distance between property and CBD: 11526 meters
Distance between property and CBD: 2314 meters
Distance between property and CBD: 16010 meters
Distance between property and CBD: 2663 meters
Distance between property and CBD: 8177 meters
Distance between property and CBD: 13703 meters
Distance between property and CBD: 13048 meters
Distance between property and CBD: 3405 meters
Distance between property and CBD: 3395 meters
Distance between property and CBD: 13251 meters
Distance between property and CBD: 23822 meters
Distance between property and CBD: 15039 meters
Distance between property and CBD: 19007 meters

Distance between property and CBD: 7212 meters
Distance between property and CBD: 21584 meters
Distance between property and CBD: 14745 meters
Distance between property and CBD: 18739 meters
Distance between property and CBD: 20134 meters
Distance between property and CBD: 7878 meters
Distance between property and CBD: 9556 meters
Distance between property and CBD: 5963 meters
Distance between property and CBD: 8261 meters
Distance between property and CBD: 3735 meters
Distance between property and CBD: 12731 meters
Distance between property and CBD: 1393 meters
Distance between property and CBD: 2773 meters
Distance between property and CBD: 14112 meters
Distance between property and CBD: 12190 meters
Distance between property and CBD: 16032 meters
Distance between property and CBD: 6393 meters
Distance between property and CBD: 11595 meters
Distance between property and CBD: 8531 meters
Distance between property and CBD: 3937 meters
Distance between property and CBD: 13491 meters
Dis

In [35]:
# Save enriched dataframe as csv
# df.to_csv("data/preprocessing/rental_prices_singapore_preprocessing_3.csv", index=False)

In [36]:
# Load enriched data
df = pd.read_csv("data/preprocessing/rental_prices_singapore_preprocessing_3.csv")

In [37]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           1682 non-null   object 
 1   address                        1577 non-null   object 
 2   price                          1682 non-null   object 
 3   size                           1682 non-null   object 
 4   bedrooms                       1682 non-null   object 
 5   bathrooms                      1514 non-null   float64
 6   property_type_furnishing_year  1682 non-null   object 
 7   mrt_distance                   1473 non-null   object 
 8   agent_description              1682 non-null   object 
 9   address_new                    1682 non-null   object 
 10  latitude                       1680 non-null   float64
 11  longitude                      1680 non-null   float64
 12  meters_to_cbd                  1680 non-null   f

## Meters to school

In [None]:
# Create function to get latitude and longitude of the closest school 
def _straight_line_meters(latitude_1, longitude_1, latitude_2, longitude_2):
    """Return the great-circle (haversine) distance in meters between two points."""
    earth_radius_meters = 6371000
    phi_1, lambda_1, phi_2, lambda_2 = np.radians(
        [latitude_1, longitude_1, latitude_2, longitude_2]
    )
    haversine = (
        np.sin((phi_2 - phi_1) / 2) ** 2
        + np.cos(phi_1) * np.cos(phi_2) * np.sin((lambda_2 - lambda_1) / 2) ** 2
    )
    return 2 * earth_radius_meters * np.arcsin(np.sqrt(haversine))


def get_school_location(row):
    """Return the coordinates of the closest school near a property.

    Schools within a 1000 m radius are requested from the Google Maps Places
    Nearby Search API. The returned list is not assumed to be sorted by
    distance: the school with the smallest straight-line distance to the
    property is selected among the returned results.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "latitude" and "longitude" of the property.

    Returns
    -------
    tuple
        (school_latitude, school_longitude), or (np.nan, np.nan) when a
        property coordinate is missing, the request fails, or no school is
        returned.
    """
    # Get latitude and longitude of the property
    property_latitude = row["latitude"]
    property_longitude = row["longitude"]

    # Skip the (paid) request if the property could not be geocoded
    if pd.isna(property_latitude) or pd.isna(property_longitude):
        print("Property latitude or longitude missing.")
        return (np.nan, np.nan)

    # Base URL for the Google Maps Places Nearby Search API
    base_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

    # Parameters for the Nearby Search API request
    params = {
        "location": f"{property_latitude},{property_longitude}",
        "radius": 1000,  # Search radius in meters
        "type": "school",
        "key": google_maps_api_key
    }

    # Send the Nearby Search API request and store the response.
    # A failed request yields missing values instead of aborting the run.
    try:
        response = requests.get(base_url, params=params, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError) as error:
        print(f"Nearby Search request failed: {error}")
        return (np.nan, np.nan)

    # Keep only the results that carry coordinates
    schools = [
        school for school in (data.get("results") or [])
        if "location" in school.get("geometry", {})
    ]
    if not schools:
        print("No schools found nearby.")
        return (np.nan, np.nan)

    # Pick the school that is actually nearest to the property
    closest_school = min(
        schools,
        key=lambda school: _straight_line_meters(
            property_latitude,
            property_longitude,
            school["geometry"]["location"]["lat"],
            school["geometry"]["location"]["lng"],
        ),
    )

    # Extract latitude and longitude of the closest school
    school_name = closest_school["name"]
    school_location = closest_school["geometry"]["location"]
    school_latitude = school_location["lat"]
    school_longitude = school_location["lng"]
    print(f"Closest school: {school_name}")
    print(f"Latitude: {school_latitude}, Longitude: {school_longitude}")
    return (school_latitude, school_longitude)

In [None]:
# Apply function to create the "school_location" column (cost: ~160$)
# df["school_location"] = df.apply(get_school_location, axis=1)

In [None]:
# Save enriched dataframe as csv
# df.to_csv("data/rental_prices_singapore_3.csv", index=False)

In [None]:
# Load enriched data
df = pd.read_csv("data/rental_prices_singapore_3.csv")

In [None]:
# Show dataframe info
df.info()

In [None]:
# Create function to get meters to the closest school 
def get_meters_to_school(row):
    """Return the distance in meters from a property to its closest school.

    The distance is requested from the Google Maps Distance Matrix API, with
    the property coordinates as origin and the school coordinates as
    destination.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "latitude", "longitude" and "school_location". The
        school location may be a string of the form "(lat, lng)" (as read
        back from a CSV file) or a (lat, lng) tuple/list.

    Returns
    -------
    The "value" of the distance reported by the API, or np.nan when any
    coordinate is missing, the request fails, or the response contains no
    distance for the origin/destination pair.

    Raises
    ------
    ValueError
        If a string school location cannot be parsed into two numbers.
    """
    # Get latitude and longitude of the property
    property_latitude = row["latitude"]
    property_longitude = row["longitude"]

    # Get latitude and longitude of the school
    school_location = row["school_location"]
    if isinstance(school_location, str):
        # Remove parentheses and split the string by comma
        latitude_str, longitude_str = school_location.strip("()").split(",")
        # Convert the strings to float values
        school_latitude = float(latitude_str)
        school_longitude = float(longitude_str)
    elif isinstance(school_location, (tuple, list)) and len(school_location) == 2:
        school_latitude, school_longitude = school_location
    else:
        # Missing school location (e.g. np.nan)
        return np.nan

    # Skip the (paid) request if any coordinate is missing, e.g. for a
    # stored "(nan, nan)" school location
    coordinates = (property_latitude, property_longitude, school_latitude, school_longitude)
    if any(pd.isna(coordinate) for coordinate in coordinates):
        return np.nan

    # Base URL for the Google Maps Distance Matrix API
    base_url = "https://maps.googleapis.com/maps/api/distancematrix/json"

    # Parameters for the Distance Matrix API request
    params = {
        "origins": f"{property_latitude},{property_longitude}",
        "destinations": f"{school_latitude},{school_longitude}",
        "key": google_maps_api_key
    }

    # Send the Distance Matrix API request and store the response.
    # A failed request yields a missing value instead of aborting the run.
    try:
        response = requests.get(base_url, params=params, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError) as error:
        print(f"Distance Matrix request failed: {error}")
        return np.nan

    # Process the response to get the distance. An element can be present
    # without a "distance" entry (e.g. status "ZERO_RESULTS"), so every level
    # is checked before it is indexed.
    rows = data.get("rows") or []
    elements = (rows[0].get("elements") or []) if rows else []
    distance = elements[0].get("distance") if elements else None
    if not distance or "value" not in distance:
        print("No distance information available.")
        return np.nan

    meters_to_school = distance["value"]
    print(f"Distance between property and closest school: {meters_to_school} meters")
    return meters_to_school

In [None]:
# Apply function to create the "meters_to_school" column (cost: ~25$)
# df["meters_to_school"] = df.apply(get_meters_to_school, axis=1)

In [None]:
# Save enriched dataframe as csv
# df.to_csv("data/rental_prices_singapore_4.csv", index=False)

In [None]:
# Load enriched data
df = pd.read_csv("data/rental_prices_singapore_4.csv")

In [None]:
# Show dataframe info
df.info()

## Good restaurants nearby

In [None]:
# Create function to get the average Google Maps rating of nearby restaurants 
def get_restaurants_rating(row):
    """Return the average Google Maps rating of restaurants near a property.

    Restaurants within a 1000 m radius of the property are requested from the
    Google Maps Places Nearby Search API; only the results contained in this
    single response are averaged.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "latitude" and "longitude" of the property.

    Returns
    -------
    The mean of the available "rating" values (restaurants without a rating
    are ignored), or np.nan when the response contains no results.
    """
    # Send the Nearby Search API request and decode the JSON response
    nearby_search_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    query = {
        "location": f"{row['latitude']},{row['longitude']}",
        "radius": 1000,  # Search radius in meters
        "type": "restaurant",
        "key": google_maps_api_key,
    }
    payload = requests.get(nearby_search_url, params=query).json()

    # No results: report it and return a missing value
    if not ("results" in payload and payload["results"]):
        print("No restaurants found nearby.")
        return np.nan

    # Collect one rating per restaurant, using np.nan where it is missing
    ratings = []
    for restaurant in payload.get("results"):
        ratings.append(restaurant.get("rating", np.nan))

    # Average the ratings, ignoring the missing ones
    mean_rating = np.nanmean(ratings)
    rated_count = sum(1 for rating in ratings if not np.isnan(rating))
    print(f"Number of restaurants: {len(ratings)}")
    print(f"Number of ratings: {rated_count}")
    print(f"Average rating: {mean_rating:.2f}")
    return mean_rating

In [None]:
# Apply function to create the "restaurants_rating" column (cost: ~160$)
# df["restaurants_rating"] = df.apply(get_restaurants_rating, axis=1)

In [None]:
# Save enriched dataframe as csv
# df.to_csv("data/rental_prices_singapore_7.csv", index=False)

In [None]:
# Load enriched data
# df = pd.read_csv("data/rental_prices_singapore_7.csv")

In [None]:
# Show dataframe info
df.info()

# Feature extraction

## Property type

In [None]:
# Create function to extract property type 
def extract_type(string):
    """Extract the property type from a combined type/furnishing/year string.

    Parameters
    ----------
    string : str
        Text that may contain one of the known property type labels.

    Returns
    -------
    The first matching label in the priority order below, or np.nan when the
    input is not a string or contains none of the labels.
    """
    # Missing or non-text input has no property type
    if not isinstance(string, str):
        return np.nan

    # Labels are tested in order, so a label that contains another label as a
    # substring must come first
    property_types = (
        "Executive Condominium",  # before "Condominium"
        "Condominium",
        "Apartment",
        "HDB Flat",
        "Semi-Detached House",  # before "Detached House"
        "Good Class Bungalow",
        "Corner Terrace",
        "Detached House",
        "Terraced House",
        "Bungalow House",
        "Cluster House",
    )
    for property_type in property_types:
        if property_type in string:
            return property_type
    return np.nan

In [None]:
# Apply function to create property type column
df["property_type"] = df["property_type_furnishing_year"].apply(extract_type)

In [None]:
# Frequencies of property types
df["property_type"].value_counts()

## Furnishing

In [None]:
# Create function to extract information about furnishing  
def extract_furnishing(string):
    """Extract the furnishing level from a combined type/furnishing/year string.

    Returns "Fully Furnished", "Partially Furnished" or "Unfurnished" (tested
    in that order), or np.nan when none of these labels occurs in the string.
    """
    for furnishing_level in ("Fully Furnished", "Partially Furnished", "Unfurnished"):
        if furnishing_level in string:
            return furnishing_level
    return np.nan

In [None]:
# Apply function to create furnishing column
df["furnishing"] = df["property_type_furnishing_year"].apply(extract_furnishing)

In [None]:
# Frequencies of furnishing
df["furnishing"].value_counts()

## Built year

In [None]:
# Create function to extract built year
def extract_year(string):
    """Return the first standalone four-digit number in ``string``.

    The match is returned as text (not converted to a number). ``np.nan`` is
    returned when ``string`` contains no four-digit number delimited by word
    boundaries.
    """
    match = re.search(r"\b\d{4}\b", string)
    return match.group() if match else np.nan

In [None]:
# Derive the built year column: map extract_year over each info string,
# then cast to a nullable integer so rows without a year stay missing
df["year"] = (
    df["property_type_furnishing_year"]
    .map(extract_year)
    .astype("Int32")
)

In [None]:
# Descriptive statistics of built year
df["year"].describe()

## Meters to MRT

In [None]:
# Extract MRT distance in meters, e.g. "11 mins (810 m) to ..." -> 810
df["meters_to_mrt"] = (
    df["mrt_distance"]
    .str.split(r"m\)").str.get(0)   # keep the text before "m)"
    .str.split(r"\(").str.get(1)    # keep the text after "("
    .astype("Int32")                # nullable integer, missing distances stay missing
)

## Agent description

Identify features to be extracted from the agent description by using a word cloud to visualize the most frequent words.

In [None]:
from wordcloud import WordCloud

In [None]:
# Concatenate every agent description into one space-separated string
text = " ".join(df["agent_description"])

# Build the word cloud from the combined text (fixed random_state for a reproducible layout)
wordcloud = WordCloud(width=800, height=400, background_color="white", random_state=7)
wordcloud.generate(text)

# Render the word cloud without axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()

In [None]:
# Save the word cloud as an image
# wordcloud.to_file("images/wordcloud.png")

Extract the following features: (1) high floor, (2) new, (3) renovated, (4) view, and (5) penthouse.

## High floor

In [None]:
# Flag listings whose description mentions "high floor" (case-insensitive)
df["high_floor"] = ["high floor" in description.lower() for description in df["agent_description"]]

In [None]:
# Frequency of high floor
df["high_floor"].value_counts()

## New

In [None]:
# Flag listings whose description mentions "brand new" or "new unit" (case-insensitive)
df["new"] = [
    any(phrase in description for phrase in ("brand new", "new unit"))
    for description in (raw.lower() for raw in df["agent_description"])
]

In [None]:
# Frequency of new
df["new"].value_counts()

## Renovated

In [None]:
# Flag listings whose description mentions "renovated" or "renovation" (case-insensitive)
df["renovated"] = [
    any(phrase in description for phrase in ("renovated", "renovation"))
    for description in (raw.lower() for raw in df["agent_description"])
]

In [None]:
# Frequency of renovated
df["renovated"].value_counts()

## View

In [None]:
# Flag listings whose description advertises a view: True when any of the
# phrases below occurs in the lower-cased description
df["view"] = [
    any(
        phrase in description
        for phrase in (
            "sea view", "seaview", "panoramic view", "unblocked view", "unblock view",
            "stunning view", "park view", "breathtaking view", "river view", "pool view",
            "spectacular view", "city view", "greenery view", "gorgeous view",
        )
    )
    for description in (raw.lower() for raw in df["agent_description"])
]

In [None]:
# Frequency of view
df["view"].value_counts()

## Penthouse

In [None]:
# Flag listings whose description mentions "penthouse" (case-insensitive)
df["penthouse"] = ["penthouse" in description.lower() for description in df["agent_description"]]

In [None]:
# Frequency of penthouse
df["penthouse"].value_counts()

# Convert data types

In [None]:
# Convert price from str to int: strip the thousands separators, then cast
df["price"] = (
    df["price"]
    .str.replace(",", "", regex=False)
    .astype("Int32")
)

In [None]:
# Convert size (sqft) from str to int: keep the text before "sqft", then cast
df["size"] = (
    df["size"]
    .str.split("sqft").str.get(0)
    .astype("Int32")
)

In [None]:
# Convert bathrooms from float to nullable int (Int32 keeps missing values as <NA>)
df["bathrooms"] = df["bathrooms"].astype("Int32")

In [None]:
# Convert meters_to_school from float to int
df["meters_to_school"] = df["meters_to_school"].astype("Int32")

# Handle missing values

## Bathrooms

In [None]:
# Mean number of bathrooms for each bedroom category, rounded to one decimal
df.groupby("bedrooms")["bathrooms"].mean().round(1)

In [None]:
# Assignments use df.loc[mask, column] so they modify df itself; chained
# indexing (df["bathrooms"][mask] = ...) can write to a temporary copy instead
# Assume 1 bathroom for a studio
df.loc[df["bedrooms"] == "Studio", "bathrooms"] = 1
# Assume 1 bathroom for a room in a shared flat
df.loc[df["bedrooms"] == "Room", "bathrooms"] = 1
# Else assume the same number as bedrooms; bedrooms holds text labels, so
# convert them to numbers before writing into the integer bathrooms column
bathrooms_missing = df["bathrooms"].isna()
df.loc[bathrooms_missing, "bathrooms"] = pd.to_numeric(df.loc[bathrooms_missing, "bedrooms"]).astype("Int32")

## Latitude and longitude

In [None]:
# Remove every row that lacks a latitude or a longitude (modifies df in place)
coordinate_columns = ["latitude", "longitude"]
df.dropna(subset=coordinate_columns, how="any", inplace=True)

## Meters to school

In [None]:
# Descriptive statistics of meters to school
df["meters_to_school"].describe()

In [None]:
# Impute the maximum value if meters to school are missing
# NOTE(review): presumably a missing distance means no school was found nearby,
# which is why the largest observed distance is used — confirm against the
# cell that creates meters_to_school
max_meters_to_school = df["meters_to_school"].max()
df["meters_to_school"] = df["meters_to_school"].fillna(max_meters_to_school)

## Meters to MRT

In [None]:
# Descriptive statistics of meters to MRT
df["meters_to_mrt"].describe()

In [None]:
# Impute the median if meters to MRT are missing
median_meters_to_mrt = df["meters_to_mrt"].median()
# The median of an even number of values can end in .5, which the nullable
# integer column cannot store, so round it to a whole number before filling.
# Skip the fill when the median itself is missing (no observed distances).
if pd.notna(median_meters_to_mrt):
    median_meters_to_mrt = int(round(median_meters_to_mrt))
    df["meters_to_mrt"] = df["meters_to_mrt"].fillna(median_meters_to_mrt)

## Furnishing

In [None]:
# Frequencies of furnishing
df["furnishing"].value_counts()

In [None]:
# Impute the mode if furnishing information is missing
# mode() returns a Series (there can be ties); [0] takes its first entry
mode_furnishing = df["furnishing"].mode()[0]
df["furnishing"] = df["furnishing"].fillna(mode_furnishing)

## Built year

In [None]:
# Descriptive statistics of built year
df["year"].describe()

In [None]:
# Impute the median if built year is missing
median_year = df["year"].median()
# The median of an even number of values can end in .5, which the nullable
# integer column cannot store, so round it to a whole year before filling.
# Skip the fill when the median itself is missing (no observed years).
if pd.notna(median_year):
    median_year = int(round(median_year))
    df["year"] = df["year"].fillna(median_year)

# Exploratory data analysis

## Numerical columns

In [None]:
# Store numerical columns
numerical_columns = ["price", "size", "bathrooms", "meters_to_mrt", "meters_to_school", "year"]

### Table: Descriptive statistics 

In [None]:
# Overview of descriptive statistics of all numerical columns
df[numerical_columns].describe().transpose()

### Price

In [None]:
# Descriptive statistics
df["price"].describe()

In [None]:
# Histogram of the monthly rental price with 50 bins
ax = sns.histplot(df["price"], bins=50)

# Label the axes and add a title
ax.set_xlabel("Price (in S$)")
ax.set_ylabel("Frequency")
ax.set_title("Monthly Rental Price")

# Optional: restrict the x-axis
# ax.set_xlim(0, 15000)

# Show the plot
plt.show()

#### Price correlations

In [None]:
# Correlation of price with every numerical and dichotomous column
# (the "price" column of the full correlation matrix)
price_corr_columns = [
    "price", "size", "bathrooms", "meters_to_mrt", "meters_to_school", "year",
    "high_floor", "new", "renovated", "view", "penthouse",
]
df[price_corr_columns].corr()["price"]

#### Price scatterplots

In [None]:
# One figure holding all scatter plots
fig = plt.figure(figsize=(12, 12))

# One panel per numerical column, laid out on a grid of 3 rows x 2 columns
# (five panels, so the last grid position stays empty)
for position, feature in enumerate(["size", "bathrooms", "meters_to_mrt", "meters_to_school", "year"], start=1):
    ax = fig.add_subplot(3, 2, position)
    # Scatter plot of price against the current column
    sns.scatterplot(data=df, x=feature, y="price", ax=ax)
    # Axis labels and title
    ax.set_ylabel("Price in S$")
    ax.set_xlabel(f"{feature}")
    ax.set_title(f"Price by {feature}")

# Adjust subplot layout
fig.tight_layout()

# Show the plot
plt.show()

#### Price by bedrooms

In [None]:
# Group prices by bedroom category and summarise each group
price_by_bedrooms = df.groupby("bedrooms")["price"]
price_by_bedrooms.describe()

In [None]:
# Median price per bedroom category, arranged from smallest to largest unit
bedroom_order = ["Room", "Studio", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
median_price_by_bedrooms = price_by_bedrooms.median().reindex(bedroom_order)

In [None]:
# Bar plot of the median price for each bedroom category
ax = sns.barplot(x=median_price_by_bedrooms.index, y=median_price_by_bedrooms.values, palette="colorblind")

# Label the axes and add a title
ax.set_xlabel("Bedrooms")
ax.set_ylabel("Price in S$")
ax.set_title("Median price by bedrooms")

# Show the plot
plt.show()

#### Price by property type

In [None]:
# Group prices by property type and summarise each group
price_by_type = df.groupby("property_type")["price"]
price_by_type.describe()

In [None]:
# Store median price by property type in the desired order
preferred_type_order = ["HDB Flat", "Apartment", "Condominium", "Corner Terrace",
                        "Semi-Detached House", "Good Class Bungalow", "Cluster House",
                        "Terraced House", "Detached House"]
median_price_by_type = price_by_type.median()
# reindex keeps only the labels it is given, so property types that occur in
# the data but are not in the preferred order are appended at the end instead
# of being silently dropped
remaining_types = [property_type for property_type in median_price_by_type.index
                   if property_type not in preferred_type_order]
median_price_by_type = median_price_by_type.reindex(preferred_type_order + remaining_types)
median_price_by_type

In [None]:
# Bar plot of the median price for each property type
ax = sns.barplot(x=median_price_by_type.index, y=median_price_by_type.values, palette="colorblind")

# Label the axes and add a title
ax.set_xlabel("Property type")
ax.set_ylabel("Price in S$")
ax.set_title("Median price by property type")

# Rotate x-axis tick labels by 45 degrees
plt.xticks(rotation=45)

# Show the plot
plt.show()

#### Price by furnishing

In [None]:
# Group prices by furnishing level and summarise each group
price_by_furnishing = df.groupby("furnishing")["price"]
price_by_furnishing.describe()

In [None]:
# Store median price by furnishing
median_price_by_furnishing = price_by_furnishing.median()

In [None]:
# Bar plot of the median price for each furnishing level
ax = sns.barplot(x=median_price_by_furnishing.index, y=median_price_by_furnishing.values, palette="colorblind")

# Label the y-axis, leave the x-axis unlabelled and add a title
ax.set_xlabel("")
ax.set_ylabel("Price in S$")
ax.set_title("Median price by furnishing")

# Show the plot
plt.show()

#### Price by high floor, new, renovated, view and penthouse

In [None]:
# Rounded descriptive statistics of price for each True/False feature,
# printed one table per feature with a separator line in between
for flag_column in ["high_floor", "new", "renovated", "view", "penthouse"]:
    price_summary = df.groupby(flag_column)["price"].describe().round()
    print(f"Price by {flag_column}:", price_summary, "-" * 80, sep="\n")

In [None]:
# Bar plots of median price by high floor, new, renovated, view and penthouse
fig = plt.figure(figsize=(12, 6))

# One panel per dichotomous column on a grid of 2 rows x 3 columns
for position, flag_column in enumerate(["high_floor", "new", "renovated", "view", "penthouse"], start=1):
    ax = fig.add_subplot(2, 3, position)
    # Median price for False vs. True, without error bars
    sns.barplot(data=df, x=flag_column, y="price", estimator=np.median, ci=None, ax=ax)
    # Axis labels and title
    ax.set_ylabel("Price in S$")
    ax.set_xlabel("")
    ax.set_title(f"Median price by {flag_column}")
    # Same y-axis range in every panel so the bars are comparable
    ax.set_ylim(0, 13000)

# Adjust subplot layout
fig.tight_layout()

# Show the plot
plt.show()

### Size

In [None]:
# Descriptive statistics
df["size"].describe()

In [None]:
# Histogram of the property size with 30 bins
ax = sns.histplot(df["size"], bins=30)

# Label the axes and add a title
ax.set_xlabel("Size (in sqft)")
ax.set_ylabel("Frequency")
ax.set_title("Property Size")

# Optional: restrict the x-axis
# ax.set_xlim(0, 3900)

# Show the plot
plt.show()

### Bathrooms

In [None]:
# Descriptive statistics
df["bathrooms"].describe()

In [None]:
# Histogram of the number of bathrooms
ax = sns.histplot(df["bathrooms"])

# Label the axes and add a title
ax.set_xlabel("Number of bathrooms")
ax.set_ylabel("Frequency")
ax.set_title("Bathrooms")

# Show the plot
plt.show()

### Meters to MRT

In [None]:
# Descriptive statistics
df["meters_to_mrt"].describe()

In [None]:
# Histogram of the distance to the MRT station
ax = sns.histplot(df["meters_to_mrt"])

# Label the axes and add a title
ax.set_xlabel("Meters to MRT")
ax.set_ylabel("Frequency")
ax.set_title("Meters to MRT")

# Show the plot
plt.show()

### Meters to school

In [None]:
# Descriptive statistics
df["meters_to_school"].describe()

In [None]:
# Histogram of the distance to school
ax = sns.histplot(df["meters_to_school"])

# Label the axes and add a title
ax.set_xlabel("Meters to school")
ax.set_ylabel("Frequency")
ax.set_title("Meters to school")

# Show the plot
plt.show()

### Built year

In [None]:
# Descriptive statistics
df["year"].describe()

In [None]:
# Histogram of the built year
ax = sns.histplot(df["year"])

# Place an x-axis tick every 5 years from 1975 to 2025
ax.set_xticks(range(1975, 2026, 5))

# Label the axes and add a title
ax.set_xlabel("Built year")
ax.set_ylabel("Frequency")
ax.set_title("Built year")

# Show the plot
plt.show()

## Categorical columns

In [None]:
# Store categorical columns
categorical_columns = ["bedrooms", "property_type", "furnishing", "high_floor", "new", "renovated", "view", 
                       "penthouse"]

### Bedrooms

In [None]:
# Calculate frequencies
bedrooms_freq = df["bedrooms"].value_counts()

# Reorder categories
bedrooms_freq = bedrooms_freq.reindex(["Room", "Studio", "1", "2", "3", "4", "5", "6", "7", "8", "9"])

# Display frequencies
print("Absolute frequencies:")
print(bedrooms_freq)

# Share of each category in percent; Series.items() is used because
# Series.iteritems() no longer exists in pandas 2.0 and later
print("\nRelative frequencies:")
bedrooms_total = bedrooms_freq.sum()
for category, frequency in bedrooms_freq.items():
    print(f"{category}: {100 * frequency/bedrooms_total:.1f}%")

In [None]:
# Bar plot of the number of listings per bedroom category
ax = sns.barplot(x=bedrooms_freq.index, y=bedrooms_freq.values, palette="colorblind")

# Label the y-axis and add a title
ax.set_ylabel("Frequency")
ax.set_title("Bedrooms")

# Show the plot
plt.show()

### Property type

In [None]:
# Calculate frequencies
property_type_freq = df["property_type"].value_counts()

# Display frequencies
print("Absolute frequencies:")
print(property_type_freq)

# Share of each category in percent; Series.items() is used because
# Series.iteritems() no longer exists in pandas 2.0 and later
print("\nRelative frequencies:")
property_type_total = property_type_freq.sum()
for category, frequency in property_type_freq.items():
    print(f"{category}: {100 * frequency/property_type_total:.1f}%")

In [None]:
# Bar plot of the number of listings per property type
ax = sns.barplot(x=property_type_freq.index, y=property_type_freq.values, palette="colorblind")

# Label the y-axis and add a title
ax.set_ylabel("Frequency")
ax.set_title("Property type")

# Rotate x-axis tick labels by 45 degrees
plt.xticks(rotation=45)

# Show the plot
plt.show()

### Furnishing

In [None]:
# Calculate frequencies
furnishing_freq = df["furnishing"].value_counts()

# Display frequencies
print("Absolute frequencies:")
print(furnishing_freq)

# Share of each category in percent; Series.items() is used because
# Series.iteritems() no longer exists in pandas 2.0 and later
print("\nRelative frequencies:")
furnishing_total = furnishing_freq.sum()
for category, frequency in furnishing_freq.items():
    print(f"{category}: {100 * frequency/furnishing_total:.1f}%")

In [None]:
# Bar plot of the number of listings per furnishing level
ax = sns.barplot(x=furnishing_freq.index, y=furnishing_freq.values, palette="colorblind")

# Label the y-axis and add a title
ax.set_ylabel("Frequency")
ax.set_title("Furnishing")

# Show the plot
plt.show()

### High floor

In [None]:
# Frequencies (counted once and reused below)
high_floor_counts = df["high_floor"].value_counts()
print("Absolute frequencies:")
print(high_floor_counts)

# Relative frequencies in percent; .get() reports a category that never
# occurs as 0% instead of raising a KeyError
high_floor_total = high_floor_counts.sum()
print("\nRelative frequencies:")
for flag in (False, True):
    print(f'{flag}: {np.round(100 * high_floor_counts.get(flag, 0) / high_floor_total, 1)}%')

In [None]:
# Bar plot of how many listings are / are not on a high floor
high_floor_counts = df["high_floor"].value_counts()
ax = sns.barplot(x=high_floor_counts.index, y=high_floor_counts.values)

# Label the y-axis and add a title
ax.set_ylabel("Frequency")
ax.set_title("High floor")

# Show the plot
plt.show()

### New

In [None]:
# Frequencies (counted once and reused below)
new_counts = df["new"].value_counts()
print("Absolute frequencies:")
print(new_counts)

# Relative frequencies in percent; .get() reports a category that never
# occurs as 0% instead of raising a KeyError
new_total = new_counts.sum()
print("\nRelative frequencies:")
for flag in (False, True):
    print(f'{flag}: {np.round(100 * new_counts.get(flag, 0) / new_total, 1)}%')

In [None]:
# Bar plot of how many listings are / are not described as new
new_counts = df["new"].value_counts()
ax = sns.barplot(x=new_counts.index, y=new_counts.values)

# Label the y-axis and add a title
ax.set_ylabel("Frequency")
ax.set_title("New")

# Show the plot
plt.show()

### Renovated

In [None]:
# Frequencies (counted once and reused below)
renovated_counts = df["renovated"].value_counts()
print("Absolute frequencies:")
print(renovated_counts)

# Relative frequencies in percent; .get() reports a category that never
# occurs as 0% instead of raising a KeyError
renovated_total = renovated_counts.sum()
print("\nRelative frequencies:")
for flag in (False, True):
    print(f'{flag}: {np.round(100 * renovated_counts.get(flag, 0) / renovated_total, 1)}%')

In [None]:
# Bar plot of how many listings are / are not described as renovated
renovated_counts = df["renovated"].value_counts()
ax = sns.barplot(x=renovated_counts.index, y=renovated_counts.values)

# Label the y-axis and add a title
ax.set_ylabel("Frequency")
ax.set_title("Renovated")

# Show the plot
plt.show()

### View

In [None]:
# Frequencies (counted once and reused below)
view_counts = df["view"].value_counts()
print("Absolute frequencies:")
print(view_counts)

# Relative frequencies in percent; .get() reports a category that never
# occurs as 0% instead of raising a KeyError
view_total = view_counts.sum()
print("\nRelative frequencies:")
for flag in (False, True):
    print(f'{flag}: {np.round(100 * view_counts.get(flag, 0) / view_total, 1)}%')

In [None]:
# Bar plot of how many listings do / do not advertise a view
view_counts = df["view"].value_counts()
ax = sns.barplot(x=view_counts.index, y=view_counts.values)

# Label the y-axis and add a title
ax.set_ylabel("Frequency")
ax.set_title("View")

# Show the plot
plt.show()

### Penthouse

In [None]:
# Frequencies (counted once and reused below)
penthouse_counts = df["penthouse"].value_counts()
print("Absolute frequencies:")
print(penthouse_counts)

# Relative frequencies in percent; .get() reports a category that never
# occurs as 0% instead of raising a KeyError
penthouse_total = penthouse_counts.sum()
print("\nRelative frequencies:")
for flag in (False, True):
    print(f'{flag}: {np.round(100 * penthouse_counts.get(flag, 0) / penthouse_total, 1)}%')

In [None]:
# Bar plot of how many listings are / are not described as a penthouse
penthouse_counts = df["penthouse"].value_counts()
ax = sns.barplot(x=penthouse_counts.index, y=penthouse_counts.values)

# Label the y-axis and add a title
ax.set_ylabel("Frequency")
ax.set_title("Penthouse")

# Show the plot
plt.show()

## Correlations

In [None]:
# Pairwise correlations of all numerical and dichotomous columns, rounded to two decimals
corr_columns = [
    "price", "size", "bathrooms", "meters_to_mrt", "meters_to_school", "year",
    "high_floor", "new", "renovated", "view", "penthouse",
]
corr_matrix = df[corr_columns].corr().round(2)

# Blank out the upper triangle (above the diagonal) so each pair is shown once
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr_matrix = corr_matrix.mask(upper_triangle)

# Show correlation matrix
corr_matrix

In [None]:
# Correlation heatmap
# Set the figure size
plt.figure(figsize=(8, 6))

# Create heatmap with the correlation printed in every cell
sns.heatmap(corr_matrix, cmap="viridis", annot=True, linewidth=0.5)

# Add title
plt.title("Correlation heatmap")

# Save the heatmap as png; the target folder is created first because
# savefig does not create missing directories
os.makedirs("images", exist_ok=True)
plt.savefig("images/correlation_heatmap.png", bbox_inches="tight")

# Show the plot
plt.show()

## Property locations on the map

In [None]:
import folium

In [None]:
# Create a map centered around Singapore
# NOTE: the variable name `map` shadows Python's built-in map(); it is kept
# because the commented-out save cell below refers to it by this name
map = folium.Map(location=[1.3521, 103.8198], zoom_start=12)

# Add one marker per property; hovering over it shows the property name
for property_name, latitude, longitude in zip(df["name"], df["latitude"], df["longitude"]):
    marker = folium.Marker(location=[latitude, longitude], tooltip=property_name)
    marker.add_to(map)

# Show map
map

In [None]:
# Save the map as an HTML file
# map.save("images/map.html")  

# Save preprocessed data

In [None]:
# Remove the raw text and helper columns that are no longer needed (modifies df in place)
obsolete_columns = [
    "name",
    "address",
    "property_type_furnishing_year",
    "mrt_distance",
    "agent_description",
    "school_location",
    "address_new",
]
df.drop(columns=obsolete_columns, inplace=True)

In [None]:
# Save preprocessed data as csv
# df.to_csv("data/rental_prices_singapore_preprocessed.csv", index=False)