In [1]:
import numpy as np
import pandas as pd
import re
import requests
from dotenv import load_dotenv
import os

# Read data

In [2]:
# Read data from csv
df = pd.read_csv("data/rental_prices_singapore.csv")

In [3]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5360 entries, 0 to 5359
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5360 non-null   object 
 1   address                        5011 non-null   object 
 2   price                          5360 non-null   object 
 3   size                           5360 non-null   object 
 4   bedrooms                       5360 non-null   object 
 5   bathrooms                      4941 non-null   float64
 6   property_type_furnishing_year  5360 non-null   object 
 7   mrt_distance                   4641 non-null   object 
 8   agent_description              5360 non-null   object 
dtypes: float64(1), object(8)
memory usage: 377.0+ KB


In [4]:
# Show top five rows
df.head()

Unnamed: 0,name,address,price,size,bedrooms,bathrooms,property_type_furnishing_year,mrt_distance,agent_description
0,"Brand new Attic Studio, in a Peranakan Conserv...",Lorong 34 Geylang,3000,400 sqft,1,1.0,\nApartment\nFully Furnished\n,,One and only attic studio! Beautifully done up...
1,Astor,51C Lengkong Empat,2000,1130 sqft,Room,,\nApartment\nFully Furnished\n,11 mins (810 m) to DT28 Kaki Bukit MRT,Comes with In House Maid
2,Springhill Terrace,Sunrise avenue,7400,3800 sqft,5,4.0,\nApartment\nFully Furnished\n,,"Close to MRT and short drive to French, Austra..."
3,704 Yishun Avenue 5,704 Yishun Avenue 5,1000,120 sqft,Room,,\nApartment\nFully Furnished\n,9 mins (700 m) to NS13 Yishun MRT,Room for 1 or 2 single ladies
4,Espada,48 Saint Thomas Walk,4300,689 sqft,1,1.0,\nApartment\nFully Furnished\n,6 mins (420 m) to NS23 Somerset MRT,All units virtual online viewing available! An...


# Remove duplicates

In [5]:
# Diagnose duplicates
df.duplicated().value_counts()

False    5082
True      278
dtype: int64

In [6]:
# Remove duplicates
df = df.drop_duplicates().copy()

In [7]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5082 entries, 0 to 5359
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5082 non-null   object 
 1   address                        4754 non-null   object 
 2   price                          5082 non-null   object 
 3   size                           5082 non-null   object 
 4   bedrooms                       5082 non-null   object 
 5   bathrooms                      4680 non-null   float64
 6   property_type_furnishing_year  5082 non-null   object 
 7   mrt_distance                   4418 non-null   object 
 8   agent_description              5082 non-null   object 
dtypes: float64(1), object(8)
memory usage: 397.0+ KB


# Feature engineering

## Latitude and longitude

In [8]:
# Load environment variables from .env file
load_dotenv()

True

In [9]:
# Get Google Maps API key from .env 
google_maps_api_key = os.getenv("google_maps_api_key")

In [10]:
# Create function to get latitude and longitude from an address 
def get_latitude_longitude(address):
    """Geocode a Singapore address with the Google Maps Geocoding API.

    Parameters
    ----------
    address : str or missing value
        Address without the country; ", Singapore" is appended for the lookup.

    Returns
    -------
    tuple
        (latitude, longitude) of the first geocoding result, or
        (np.nan, np.nan) when the address is missing or the lookup fails.
    """
    # A missing address would otherwise be formatted into the query as
    # "nan, Singapore" and geocoded as if it were a real place.
    if pd.isna(address) or not str(address).strip():
        return (np.nan, np.nan)

    # Base URL for the Google Maps Geocoding API
    base_url = "https://maps.googleapis.com/maps/api/geocode/json"

    # Parameters for the Geocoding API request
    params = {
        "address": f"{address}, Singapore",
        "key": google_maps_api_key
    }

    # Seconds to wait for the API; without a timeout one stalled request
    # would block the whole DataFrame.apply run indefinitely.
    timeout_seconds = 30

    # Send Geocoding API request; treat network/JSON errors as a failed lookup
    # so that one bad request does not abort the batch.
    try:
        response = requests.get(base_url, params=params, timeout=timeout_seconds)
        data = response.json()
    except (requests.RequestException, ValueError):
        data = {}

    # Extract latitude and longitude if the request was successful
    if data.get("status") == "OK" and data.get("results"):
        location = data["results"][0]["geometry"]["location"]
        return (location["lat"], location["lng"])

    # Assign missing values and print error message if the request failed
    print(f"Geocoding request failed for {address}")
    return (np.nan, np.nan)

In [11]:
# Apply function to create latitude and longitude column (cost: ~25$)
# df[["latitude", "longitude"]] = df["address"].apply(get_latitude_longitude).apply(pd.Series)

In [12]:
# Save enriched dataframe as csv
# df.to_csv("data/rental_prices_singapore_2.csv", index=False)

In [13]:
# Load enriched data
df = pd.read_csv("data/rental_prices_singapore_2.csv")

In [14]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5082 entries, 0 to 5081
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5082 non-null   object 
 1   address                        4754 non-null   object 
 2   price                          5082 non-null   object 
 3   size                           5082 non-null   object 
 4   bedrooms                       5082 non-null   object 
 5   bathrooms                      4680 non-null   float64
 6   property_type_furnishing_year  5082 non-null   object 
 7   mrt_distance                   4418 non-null   object 
 8   agent_description              5082 non-null   object 
 9   latitude                       5082 non-null   float64
 10  longitude                      5082 non-null   float64
dtypes: float64(3), object(8)
memory usage: 436.9+ KB


## Meters to school

In [15]:
# Create function to get latitude and longitude of the closest school 
def get_school_location(row):
    """Find the school nearest to a property via the Places Nearby Search API.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "latitude" and "longitude" of the property.

    Returns
    -------
    tuple
        (school_latitude, school_longitude) of the nearest school, or
        (np.nan, np.nan) when the property has no coordinates, the request
        fails, or the nearest school is farther away than 1000 m.
    """
    # Maximum straight-line distance (meters) for a school to count as nearby
    max_distance_m = 1000
    # Seconds to wait for the API before giving up on this row
    timeout_seconds = 30

    # Get latitude and longitude of the property
    property_latitude = row["latitude"]
    property_longitude = row["longitude"]

    # Without coordinates the request would be sent for "nan,nan"
    if pd.isna(property_latitude) or pd.isna(property_longitude):
        print("No schools found nearby.")
        return (np.nan, np.nan)

    # Base URL for the Google Maps Places Nearby Search API
    base_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

    # Parameters for the Nearby Search API request
    params = {
        "location": f"{property_latitude},{property_longitude}",
        # Rank by distance so that the first result is the nearest school;
        # a radius search orders results by prominence instead. "radius" must
        # not be combined with rankby=distance, so the 1000 m limit is
        # enforced below.
        "rankby": "distance",
        "type": "school",
        "key": google_maps_api_key
    }

    # Send the Nearby Search API request; a network/JSON error counts as
    # "nothing found" so that one bad request does not abort the batch.
    try:
        response = requests.get(base_url, params=params, timeout=timeout_seconds)
        data = response.json()
    except (requests.RequestException, ValueError):
        data = {}

    # Extract latitude and longitude of the closest school from the response
    results = data.get("results") or []
    if results:
        closest_school = results[0]
        school_name = closest_school["name"]
        school_location = closest_school["geometry"]["location"]
        school_latitude = school_location["lat"]
        school_longitude = school_location["lng"]
        distance_m = _great_circle_meters(
            property_latitude, property_longitude, school_latitude, school_longitude
        )
        if distance_m <= max_distance_m:
            print(f"Closest school: {school_name}")
            print(f"Latitude: {school_latitude}, Longitude: {school_longitude}")
            return (school_latitude, school_longitude)

    print("No schools found nearby.")
    return (np.nan, np.nan)


def _great_circle_meters(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance in meters between two points given in degrees."""
    earth_radius_m = 6371000
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    half_d_phi = (phi2 - phi1) / 2
    half_d_lambda = np.radians(lon2 - lon1) / 2
    a = np.sin(half_d_phi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_d_lambda) ** 2
    return 2 * earth_radius_m * np.arcsin(np.sqrt(a))

In [16]:
# Apply function to create the "school_location" column (cost: ~160$)
# df["school_location"] = df.apply(get_school_location, axis=1)

In [17]:
# Save enriched dataframe as csv
# df.to_csv("data/rental_prices_singapore_3.csv", index=False)

In [18]:
# Load enriched data
df = pd.read_csv("data/rental_prices_singapore_3.csv")

In [19]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5082 entries, 0 to 5081
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5082 non-null   object 
 1   address                        4754 non-null   object 
 2   price                          5082 non-null   object 
 3   size                           5082 non-null   object 
 4   bedrooms                       5082 non-null   object 
 5   bathrooms                      4680 non-null   float64
 6   property_type_furnishing_year  5082 non-null   object 
 7   mrt_distance                   4418 non-null   object 
 8   agent_description              5082 non-null   object 
 9   latitude                       5082 non-null   float64
 10  longitude                      5082 non-null   float64
 11  school_location                5065 non-null   object 
dtypes: float64(3), object(9)
memory usage: 476.6+ KB

In [20]:
# Create function to get meters to the closest school 
def get_meters_to_school(row):
    """Get the distance in meters from a property to its closest school.

    Uses the Google Maps Distance Matrix API. No travel mode is passed, so
    the API default applies (NOTE(review): per the API documentation this is
    driving — confirm that a road distance is what is wanted here).

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "latitude", "longitude" and "school_location". The
        school location may be a "(lat, lng)" string (as loaded from CSV) or
        a (lat, lng) tuple/list.

    Returns
    -------
    int or float
        Distance in meters, or np.nan when any coordinate is missing or the
        API returns no distance.
    """
    # Get latitude and longitude of the property
    property_latitude = row["latitude"]
    property_longitude = row["longitude"]

    # Get latitude and longitude of the school
    school_location = row["school_location"]
    if isinstance(school_location, str):
        # Remove parentheses and split the string by comma
        parts = school_location.strip("()").split(",")
    elif isinstance(school_location, (tuple, list)):
        parts = list(school_location)
    else:
        # Missing value (NaN/None) or an unexpected type
        return np.nan
    if len(parts) != 2:
        return np.nan
    try:
        school_latitude = float(parts[0])
        school_longitude = float(parts[1])
    except (TypeError, ValueError):
        return np.nan

    # Skip the request when any coordinate is missing, e.g. "(nan, nan)"
    coordinates = [property_latitude, property_longitude, school_latitude, school_longitude]
    if pd.isna(coordinates).any():
        return np.nan

    # Base URL for the Google Maps Distance Matrix API
    base_url = "https://maps.googleapis.com/maps/api/distancematrix/json"

    # Parameters for the Distance Matrix API request
    params = {
        "origins": f"{property_latitude},{property_longitude}",
        "destinations": f"{school_latitude},{school_longitude}",
        "key": google_maps_api_key
    }

    # Seconds to wait for the API before giving up on this row
    timeout_seconds = 30

    # Send the Distance Matrix API request; a network/JSON error counts as
    # "no distance" so that one bad request does not abort the batch.
    try:
        response = requests.get(base_url, params=params, timeout=timeout_seconds)
        data = response.json()
    except (requests.RequestException, ValueError):
        data = {}

    # Process the response to get the distance. An element without a route
    # carries no "distance" entry, so every lookup step is guarded.
    try:
        meters_to_school = data["rows"][0]["elements"][0]["distance"]["value"]
    except (KeyError, IndexError, TypeError):
        print("No distance information available.")
        return np.nan
    print(f"Distance between property and closest school: {meters_to_school} meters")
    return meters_to_school

In [21]:
# Apply function to create the "meters_to_school" column (cost: ~25$)
# df["meters_to_school"] = df.apply(get_meters_to_school, axis=1)

In [22]:
# Save enriched dataframe as csv
# df.to_csv("data/rental_prices_singapore_4.csv", index=False)

In [23]:
# Load enriched data
df = pd.read_csv("data/rental_prices_singapore_4.csv")

In [24]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5082 entries, 0 to 5081
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5082 non-null   object 
 1   address                        4754 non-null   object 
 2   price                          5082 non-null   object 
 3   size                           5082 non-null   object 
 4   bedrooms                       5082 non-null   object 
 5   bathrooms                      4680 non-null   float64
 6   property_type_furnishing_year  5082 non-null   object 
 7   mrt_distance                   4418 non-null   object 
 8   agent_description              5082 non-null   object 
 9   latitude                       5082 non-null   float64
 10  longitude                      5082 non-null   float64
 11  school_location                5065 non-null   object 
 12  meters_to_school               5065 non-null   f

### Find missing addresses

In [25]:
# Create function to get missing address based on the property name via Google Maps API
def get_missing_address(row):
    """Return the row's address, looking it up by property name if it is missing.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "address" and "name".

    Returns
    -------
    str or missing value
        The existing address when present; otherwise the formatted address of
        the first Find Place candidate for "<name>, Singapore"; otherwise the
        original (missing) value when nothing was found or the request failed.
    """
    # If an address is present, use that address
    if not pd.isna(row["address"]):
        return row["address"]

    # Base URL for the Google Maps Find Place API
    base_url = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json"

    # Parameters for the Find Place API request
    params = {
        "input": f"{row['name']}, Singapore",
        "inputtype": "textquery",
        "fields": "formatted_address",
        "key": google_maps_api_key
    }

    # Seconds to wait for the API; without a timeout one stalled request
    # would block the whole DataFrame.apply run indefinitely.
    timeout_seconds = 30

    # Send Find Place API request; a network/JSON error is handled like
    # "no address found" so that one bad request does not abort the batch.
    try:
        response = requests.get(base_url, params=params, timeout=timeout_seconds)
        data = response.json()
    except (requests.RequestException, ValueError):
        data = {}

    # Extract address from the response if the request was successful
    if data.get("status") == "OK" and data.get("candidates"):
        return data["candidates"][0]["formatted_address"]

    # If no address was found, give notification and use the original value (i.e. np.nan)
    print(f"No address found for {row['name']}")
    return row["address"]

In [26]:
# Apply function to get missing addresses and store them in "address_new" column (cost: ~2$)
# df["address_new"] = df.apply(get_missing_address, axis=1)

In [27]:
# Save enriched dataframe as csv
# df.to_csv("data/rental_prices_singapore_5.csv", index=False)

In [28]:
# Load enriched data
df = pd.read_csv("data/rental_prices_singapore_5.csv")

In [29]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5082 entries, 0 to 5081
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5082 non-null   object 
 1   address                        4754 non-null   object 
 2   price                          5082 non-null   object 
 3   size                           5082 non-null   object 
 4   bedrooms                       5082 non-null   object 
 5   bathrooms                      4680 non-null   float64
 6   property_type_furnishing_year  5082 non-null   object 
 7   mrt_distance                   4418 non-null   object 
 8   agent_description              5082 non-null   object 
 9   latitude                       5082 non-null   float64
 10  longitude                      5082 non-null   float64
 11  school_location                5065 non-null   object 
 12  meters_to_school               5065 non-null   f

In [30]:
# Share of listings without an address in the original column
print(f'Percent missing addresses before: {100 * df["address"].isna().sum() / len(df["address"]):.1f}%')

Percent missing addresses before: 6.5%


In [31]:
# Share of listings still without an address after the name-based lookup
print(f'Percent missing addresses after: {100 * df["address_new"].isna().sum() / len(df["address_new"]):.1f}%')

Percent missing addresses after: 1.4%


Rerun code below on 01-09-2023 to update info for new addresses

In [32]:
# Function to update latitude, longitude, school_location and meters_to_school for new addresses
def new_address_update(row):
    """Refresh the location features of a row whose original address was missing.

    Rows that already had an address are returned unchanged. For the others,
    latitude, longitude, school_location and meters_to_school are recomputed
    from "address_new"; they are set to missing when no address was recovered
    or a lookup step yields no coordinates.

    Parameters
    ----------
    row : pandas.Series
        Must provide "address", "address_new", "latitude", "longitude",
        "school_location" and "meters_to_school".

    Returns
    -------
    pandas.Series
        The original row, or an updated copy of it.
    """
    # Only rows whose original address was missing need refreshing
    if not pd.isna(row["address"]):
        return row

    # Work on a copy: the function passed to DataFrame.apply should not
    # mutate the row object it receives.
    updated = row.copy()

    # Start from "nothing known" so that every early exit leaves the row
    # consistent instead of keeping values computed without an address.
    updated["latitude"] = np.nan
    updated["longitude"] = np.nan
    updated["school_location"] = np.nan
    updated["meters_to_school"] = np.nan

    # No address was recovered for this row, so there is nothing to geocode
    if pd.isna(updated["address_new"]):
        return updated

    latitude_new, longitude_new = get_latitude_longitude(updated["address_new"])
    if pd.isna(latitude_new) or pd.isna(longitude_new):
        return updated
    updated["latitude"] = latitude_new
    updated["longitude"] = longitude_new

    school_latitude, school_longitude = get_school_location(updated)
    if pd.isna(school_latitude) or pd.isna(school_longitude):
        return updated

    # Store the location as "(lat, lng)" text, i.e. the same form the column
    # has after loading from CSV and the form get_meters_to_school parses.
    updated["school_location"] = str((school_latitude, school_longitude))
    updated["meters_to_school"] = get_meters_to_school(updated)
    return updated

In [33]:
# Apply function to update latitude, longitude, school_location and meters_to_school for new addresses
# df = df.apply(new_address_update, axis=1).copy()

In [34]:
# Save enriched dataframe as csv
# df.to_csv("data/rental_prices_singapore_6.csv", index=False)

In [35]:
# Load enriched data
df = pd.read_csv("data/rental_prices_singapore_6.csv")

In [36]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5082 entries, 0 to 5081
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5082 non-null   object 
 1   address                        4754 non-null   object 
 2   price                          5082 non-null   object 
 3   size                           5082 non-null   object 
 4   bedrooms                       5082 non-null   object 
 5   bathrooms                      4680 non-null   float64
 6   property_type_furnishing_year  5082 non-null   object 
 7   mrt_distance                   4418 non-null   object 
 8   agent_description              5082 non-null   object 
 9   latitude                       4754 non-null   float64
 10  longitude                      4754 non-null   float64
 11  school_location                4737 non-null   object 
 12  meters_to_school               4737 non-null   f

## Good restaurants nearby

In [37]:
# Create function to get the average Google Maps rating of nearby restaurants 
def get_restaurants_rating(row):
    """Get the average Google Maps rating of restaurants within 1000 m of a property.

    Only the restaurants contained in the single Nearby Search response that
    is requested here are considered.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "latitude" and "longitude" of the property.

    Returns
    -------
    float
        Mean of the available ratings, or np.nan when the property has no
        coordinates, the request fails, no restaurant is found, or none of
        the restaurants has a rating.
    """
    # Get latitude and longitude of the property
    property_latitude = row["latitude"]
    property_longitude = row["longitude"]

    # Without coordinates the request would be sent for "nan,nan"
    if pd.isna(property_latitude) or pd.isna(property_longitude):
        print("No restaurants found nearby.")
        return np.nan

    # Base URL for the Google Maps Places Nearby Search API
    base_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

    # Parameters for the Nearby Search API request
    params = {
        "location": f"{property_latitude},{property_longitude}",
        "radius": 1000,  # Search radius in meters
        "type": "restaurant",
        "key": google_maps_api_key
    }

    # Seconds to wait for the API before giving up on this row
    timeout_seconds = 30

    # Send the Nearby Search API request; a network/JSON error counts as
    # "nothing found" so that one bad request does not abort the batch.
    try:
        response = requests.get(base_url, params=params, timeout=timeout_seconds)
        data = response.json()
    except (requests.RequestException, ValueError):
        data = {}

    # Process the response to get the average restaurant rating
    results = data.get("results") or []
    if not results:
        print("No restaurants found nearby.")
        return np.nan

    # Extract restaurant ratings as a list, assigning np.nan for missing ratings
    rating_list = [restaurant.get("rating", np.nan) for restaurant in results]
    number_of_ratings = len([rating for rating in rating_list if not pd.isna(rating)])
    print(f"Number of restaurants: {len(rating_list)}")
    print(f"Number of ratings: {number_of_ratings}")

    # np.nanmean warns ("Mean of empty slice") when every rating is missing
    if number_of_ratings == 0:
        return np.nan

    # Calculate average rating, ignoring np.nan values
    average_rating = np.nanmean(rating_list)
    print(f"Average rating: {average_rating:.2f}")
    return average_rating

In [38]:
# Apply function to create the "restaurants_rating" column (cost: ~160$)
# df["restaurants_rating"] = df.apply(get_restaurants_rating, axis=1)

In [39]:
# Save enriched dataframe as csv
# df.to_csv("data/rental_prices_singapore_7.csv", index=False)

In [40]:
# Load enriched data
# df = pd.read_csv("data/rental_prices_singapore_7.csv")

In [41]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5082 entries, 0 to 5081
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5082 non-null   object 
 1   address                        4754 non-null   object 
 2   price                          5082 non-null   object 
 3   size                           5082 non-null   object 
 4   bedrooms                       5082 non-null   object 
 5   bathrooms                      4680 non-null   float64
 6   property_type_furnishing_year  5082 non-null   object 
 7   mrt_distance                   4418 non-null   object 
 8   agent_description              5082 non-null   object 
 9   latitude                       4754 non-null   float64
 10  longitude                      4754 non-null   float64
 11  school_location                4737 non-null   object 
 12  meters_to_school               4737 non-null   f

## Meters to central business district

In [42]:
# Create function to get meters to central business district 
def get_meters_to_cbd(row, cbd_latitude=1.284184, cbd_longitude=103.85151):
    """Get the distance in meters from a property to the central business district.

    Uses the Google Maps Distance Matrix API. No travel mode is passed, so
    the API default applies (NOTE(review): per the API documentation this is
    driving — confirm that a road distance is what is wanted here).

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "latitude" and "longitude" of the property.
    cbd_latitude, cbd_longitude : float, optional
        Coordinates of the reference point; the defaults are the values used
        for the central business district (i.e. Raffles Place).

    Returns
    -------
    int or float
        Distance in meters, or np.nan when the property has no coordinates
        or the API returns no distance.
    """
    # Get latitude and longitude of the property
    property_latitude = row["latitude"]
    property_longitude = row["longitude"]

    # Without coordinates the request would be sent for "nan,nan"
    if pd.isna(property_latitude) or pd.isna(property_longitude):
        print("No distance information available.")
        return np.nan

    # Base URL for the Google Maps Distance Matrix API
    base_url = "https://maps.googleapis.com/maps/api/distancematrix/json"

    # Parameters for the Distance Matrix API request
    params = {
        "origins": f"{property_latitude},{property_longitude}",
        "destinations": f"{cbd_latitude},{cbd_longitude}",
        "key": google_maps_api_key
    }

    # Seconds to wait for the API before giving up on this row
    timeout_seconds = 30

    # Send the Distance Matrix API request; a network/JSON error counts as
    # "no distance" so that one bad request does not abort the batch.
    try:
        response = requests.get(base_url, params=params, timeout=timeout_seconds)
        data = response.json()
    except (requests.RequestException, ValueError):
        data = {}

    # Process the response to get the distance. An element without a route
    # carries no "distance" entry, so every lookup step is guarded.
    try:
        meters_to_cbd = data["rows"][0]["elements"][0]["distance"]["value"]
    except (KeyError, IndexError, TypeError):
        print("No distance information available.")
        return np.nan
    print(f"Distance between property and CBD: {meters_to_cbd} meters")
    return meters_to_cbd

In [43]:
# Apply function to create the "meters_to_cbd" column (cost: ~25$)
# df["meters_to_cbd"] = df.apply(get_meters_to_cbd, axis=1)

In [44]:
# Save enriched dataframe as csv
# df.to_csv("data/rental_prices_singapore_8.csv", index=False)

In [45]:
# Load enriched data
# df = pd.read_csv("data/rental_prices_singapore_8.csv")

In [46]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5082 entries, 0 to 5081
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5082 non-null   object 
 1   address                        4754 non-null   object 
 2   price                          5082 non-null   object 
 3   size                           5082 non-null   object 
 4   bedrooms                       5082 non-null   object 
 5   bathrooms                      4680 non-null   float64
 6   property_type_furnishing_year  5082 non-null   object 
 7   mrt_distance                   4418 non-null   object 
 8   agent_description              5082 non-null   object 
 9   latitude                       4754 non-null   float64
 10  longitude                      4754 non-null   float64
 11  school_location                4737 non-null   object 
 12  meters_to_school               4737 non-null   f

# Feature extraction

## Property type

In [47]:
# Create function to extract property type 
def extract_type(string):
    """Extract the property type from the combined type/furnishing/year text.

    Parameters
    ----------
    string : str
        Text that may contain one of the known property type names.

    Returns
    -------
    str or float
        The first matching property type name, or np.nan when none matches
        or the input is not a string (e.g. a missing value).
    """
    if not isinstance(string, str):
        return np.nan

    # Checked in order. A name that contains another name as a substring
    # must come before it, otherwise the shorter name always wins:
    # "Executive Condominium" before "Condominium", and
    # "Semi-Detached House" before "Detached House".
    property_types = (
        "Executive Condominium",
        "Condominium",
        "Apartment",
        "HDB Flat",
        "Semi-Detached House",
        "Good Class Bungalow",
        "Corner Terrace",
        "Detached House",
        "Terraced House",
        "Bungalow House",
        "Cluster House",
    )
    for property_type in property_types:
        if property_type in string:
            return property_type
    return np.nan

In [48]:
# Apply function to create property type column
df["property_type"] = df["property_type_furnishing_year"].apply(extract_type)

In [49]:
# Frequencies of property types
df["property_type"].value_counts()

Condominium            2770
Apartment               905
HDB Flat                763
Semi-Detached House     188
Good Class Bungalow     174
Corner Terrace          103
Detached House           79
Terraced House           40
Bungalow House           40
Cluster House            20
Name: property_type, dtype: int64

## Furnishing

In [50]:
# Create function to extract information about furnishing  
def extract_furnishing(string):
    """Return the furnishing level named in the text, or np.nan if none is named."""
    # Checked in this order; the first level found in the text is returned.
    for furnishing_level in ("Fully Furnished", "Partially Furnished", "Unfurnished"):
        if furnishing_level in string:
            return furnishing_level
    return np.nan

In [51]:
# Apply function to create furnishing column
df["furnishing"] = df["property_type_furnishing_year"].apply(extract_furnishing)

In [52]:
# Frequencies of furnishing
df["furnishing"].value_counts()

Partially Furnished    3058
Fully Furnished        1600
Unfurnished              84
Name: furnishing, dtype: int64

## Built year

In [53]:
# Create function to extract built year 
def extract_year(string):
    """Return the first standalone four-digit number in the text as a string, or np.nan if there is none."""
    four_digits = re.search(r"\b\d{4}\b", string)
    return four_digits.group() if four_digits else np.nan

In [54]:
# Apply function to create built year column
df["year"] = df["property_type_furnishing_year"].apply(extract_year).astype("Int32")

In [55]:
# Frequencies of built year
df["year"].value_counts()

2023    391
2024    314
2013    297
2017    293
2010    248
2011    239
2014    178
2007    167
2016    160
2009    145
2022    142
2015    140
2021    140
2012    112
2008     99
1997     87
2019     80
2005     80
1995     80
1999     80
2000     63
2001     62
1984     60
1993     60
1985     60
2004     60
1994     60
1986     43
1996     40
1998     40
1987     40
2018     40
1981     40
2003     40
1975     38
1992     20
1971     20
1991     20
2006     20
1983     20
1974     20
1979     20
2002     20
1977     20
Name: year, dtype: Int64

## Meters to MRT

In [56]:
# Extract MRT distance in meters: keep the text before "m)", then the part
# after "(", e.g. "11 mins (810 m) to DT28 Kaki Bukit MRT" -> 810
df["meters_to_mrt"] = (
    df["mrt_distance"]
    .str.split(r"m\)").str.get(0)
    .str.split(r"\(").str.get(1)
    .astype("Int32")
)

## Renovated

In [57]:
# Create function to extract information about renovation  
def extract_renovated(string):
    """Return True when the text contains "renovated" or "renovation", ignoring case."""
    lowered = string.lower()
    return any(keyword in lowered for keyword in ("renovated", "renovation"))

In [58]:
# Apply function to create renovated column
df["renovated"] = df["agent_description"].apply(extract_renovated)

## Pool

In [59]:
# Create function to extract information about pool  
def extract_pool(string):
    """Return True when the text contains "pool", ignoring case."""
    return "pool" in string.lower()

In [60]:
# Apply function to create pool column
df["pool"] = df["agent_description"].apply(extract_pool)

## Sea view

In [61]:
# Create function to extract information about sea view  
def extract_sea_view(string):
    """Return True when the text contains "seaview" or "sea view", ignoring case."""
    lowered = string.lower()
    return any(keyword in lowered for keyword in ("seaview", "sea view"))

In [62]:
# Apply function to create sea view column
df["sea_view"] = df["agent_description"].apply(extract_sea_view)

## Penthouse

In [63]:
# Create function to extract penthouse information 
def extract_penthouse(string):
    """Return True when the text contains "penthouse", ignoring case."""
    return "penthouse" in string.lower()

In [64]:
# Apply function to create penthouse column
df["penthouse"] = df["agent_description"].apply(extract_penthouse)

# Convert data types

In [65]:
# Price: drop the thousands separators, then store as nullable integer
df["price"] = (
    df["price"]
    .str.replace(",", "")
    .astype("Int32")
)

In [66]:
# Size: keep the text in front of the "sqft" unit, then store as nullable integer
df["size"] = (
    df["size"]
    .str.split("sqft").str.get(0)
    .astype("Int32")
)

In [67]:
# Convert bathrooms from float to nullable int (the column is loaded as
# float64 because it contains missing values)
df["bathrooms"] = df["bathrooms"].astype("Int32")

In [68]:
# meters_to_school: float -> nullable integer, all other columns untouched
df = df.astype({"meters_to_school": "Int32"})

# Handle missing values

## Bathrooms

In [69]:
# Build the imputed column in one pass and assign it once. Chained indexing
# (df["bathrooms"][mask] = ...) may write to a temporary copy and is not
# guaranteed to update df.
bathrooms_imputed = df["bathrooms"].astype("Int32")

# Assume 1 bathroom for a studio and for a room in a shared flat.
# NOTE(review): as in the original cell, this sets the count for every
# studio/room listing, not only for those with a missing value — confirm
# that this is intended.
is_studio_or_room = df["bedrooms"].isin(["Studio", "Room"])
bathrooms_imputed = bathrooms_imputed.mask(is_studio_or_room, 1)

# Else assume the same number as bedrooms. The bedrooms column holds text,
# so it is converted to numbers first; non-numeric entries stay missing.
bedrooms_as_number = pd.to_numeric(df["bedrooms"], errors="coerce").round().astype("Int32")
df["bathrooms"] = bathrooms_imputed.fillna(bedrooms_as_number)

## Latitude and longitude

In [70]:
# Latitude and Longitude: Delete rows

## Meters to school

In [71]:
# Meters to school: Mean imputation

## MRT distance

In [72]:
# Mean imputation

## Furnishing

In [73]:
# Mode imputation

## Built year

In [74]:
# Mean imputation

# Check data

In [75]:
# Show dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5082 entries, 0 to 5081
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           5082 non-null   object 
 1   address                        4754 non-null   object 
 2   price                          5082 non-null   Int32  
 3   size                           5082 non-null   Int32  
 4   bedrooms                       5082 non-null   object 
 5   bathrooms                      5082 non-null   Int32  
 6   property_type_furnishing_year  5082 non-null   object 
 7   mrt_distance                   4418 non-null   object 
 8   agent_description              5082 non-null   object 
 9   latitude                       4754 non-null   float64
 10  longitude                      4754 non-null   float64
 11  school_location                4737 non-null   object 
 12  meters_to_school               4737 non-null   I

In [76]:
# Show top and bottom five rows
df

Unnamed: 0,name,address,price,size,bedrooms,bathrooms,property_type_furnishing_year,mrt_distance,agent_description,latitude,...,meters_to_school,address_new,property_type,furnishing,year,meters_to_mrt,renovated,pool,sea_view,penthouse
0,"Brand new Attic Studio, in a Peranakan Conserv...",Lorong 34 Geylang,3000,400,1,1,\nApartment\nFully Furnished\n,,One and only attic studio! Beautifully done up...,1.312952,...,422,Lorong 34 Geylang,Apartment,Fully Furnished,,,False,False,False,False
1,Astor,51C Lengkong Empat,2000,1130,Room,1,\nApartment\nFully Furnished\n,11 mins (810 m) to DT28 Kaki Bukit MRT,Comes with In House Maid,1.328820,...,3573,51C Lengkong Empat,Apartment,Fully Furnished,,810,False,False,False,False
2,Springhill Terrace,Sunrise avenue,7400,3800,5,4,\nApartment\nFully Furnished\n,,"Close to MRT and short drive to French, Austra...",1.389444,...,568,Sunrise avenue,Apartment,Fully Furnished,,,False,False,False,False
3,704 Yishun Avenue 5,704 Yishun Avenue 5,1000,120,Room,1,\nApartment\nFully Furnished\n,9 mins (700 m) to NS13 Yishun MRT,Room for 1 or 2 single ladies,1.429261,...,1090,704 Yishun Avenue 5,Apartment,Fully Furnished,,700,False,False,False,False
4,Espada,48 Saint Thomas Walk,4300,689,1,1,\nApartment\nFully Furnished\n,6 mins (420 m) to NS23 Somerset MRT,All units virtual online viewing available! An...,1.297356,...,1262,48 Saint Thomas Walk,Apartment,Fully Furnished,,420,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5077,Avenue South Residence,13 Silat Avenue,7500,947,3,2,\nBungalow House\nPartially Furnished\n,,Newly TOP! Be the first to stay near town!,1.276469,...,797,13 Silat Avenue,Bungalow House,Partially Furnished,,,False,False,False,False
5078,453B Fernvale Road,453B Fernvale Road,999,150,Room,1,\nBungalow House\nPartially Furnished\n,4 mins (290 m) to SW5 Fernvale LRT,Close to fernvale LRT,1.393888,...,1510,453B Fernvale Road,Bungalow House,Partially Furnished,,290,False,False,False,False
5079,Riviere,1 Jiak Kim Street,8000,840,2,2,\nBungalow House\nPartially Furnished\n,5 mins (370 m) to TE16 Havelock MRT,Brand new pool facing unit!,1.291419,...,1422,1 Jiak Kim Street,Bungalow House,Partially Furnished,,370,False,True,False,False
5080,Bukit Villas,1 Rasok Drive,7100,3025,5,4,\nBungalow House\nPartially Furnished\n,,"Tranquil surroundings, facilities included! Po...",1.416662,...,,1 Rasok Drive,Bungalow House,Partially Furnished,,,False,True,False,False


# Exploratory data analysis

# Delete columns

In [None]:
# Delete columns that are no longer needed.
# Reassigning df (instead of inplace=True) and ignoring columns that are
# already gone lets this cell be re-run without raising a KeyError.
# Note that errors="ignore" also skips a misspelled column name silently.
columns_to_drop = [
    "name",
    "address",
    "property_type_furnishing_year",
    "mrt_distance",
    "agent_description",
    "school_location",
    "address_new",
]
df = df.drop(columns=columns_to_drop, errors="ignore")

# To Do
- Exploratory data analysis