In [1]:
import requests
import csv
from pathlib import Path

### configs

In [2]:
data_dir = Path("data")
data_dir.mkdir(parents=True, exist_ok=True)

### BTO data

In [4]:
# --------------------------
# Download BTO Dataset (Apr 2008 – Mar 2023)
# --------------------------
bto_dataset_id = "d_2d493bdcc1d9a44828b6e71cb095b88d"
bto_url = f"https://data.gov.sg/api/action/datastore_search?resource_id={bto_dataset_id}&limit=50000"

print("Downloading BTO dataset...")
try:
    bto_response = requests.get(bto_url).json()
    
    if bto_response.get("success") and "result" in bto_response:
        bto_records = bto_response["result"]["records"]
        if bto_records:
            bto_filename = data_dir / "BTO_prices_Apr2008_Mar2023.csv"
            with open(bto_filename, mode="w", newline="", encoding="utf-8") as file:
                writer = csv.DictWriter(file, fieldnames=bto_records[0].keys())
                writer.writeheader()
                writer.writerows(bto_records)
            print(f"✅ Saved {bto_filename}")
        else:
            print("❌ No BTO records found.")
    else:
        print("❌ Failed to fetch BTO dataset.")
except Exception as e:
    print(f"❌ Error downloading BTO dataset: {e}")

Downloading BTO dataset...
✅ Saved data/BTO_prices_Apr2008_Mar2023.csv


### Resale data

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed


collection_id = "189"
url_metadata = f"https://api-production.data.gov.sg/v2/public/api/collections/{collection_id}/metadata"

def fetch_dataset(dataset_id):
    print(f"⬇️ Downloading dataset: {dataset_id}")
    all_records = []
    offset = 0
    limit = 5000
    base_url = "https://data.gov.sg/api/action/datastore_search"

    while True:
        url = f"{base_url}?resource_id={dataset_id}&limit={limit}&offset={offset}"
        response = requests.get(url)
        data = response.json()

        if not data.get('success'):
            print(f"❌ Failed to fetch {dataset_id}: {data.get('error', {})}")
            return

        records = data['result']['records']
        if not records:
            break

        all_records.extend(records)
        offset += limit

    if not all_records:
        print(f"⚠️ No records found for {dataset_id}")
        return

    filename = data_dir / f"resale_{dataset_id}.csv"
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=all_records[0].keys())
        writer.writeheader()
        writer.writerows(all_records)

    print(f"✅ Saved {filename} with {len(all_records)} rows")


try:
    response = requests.get(url_metadata)
    collection_data = response.json()
    child_datasets = collection_data['data']['collectionMetadata']['childDatasets']

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(fetch_dataset, ds): ds for ds in child_datasets}
        for future in as_completed(futures):
            ds = futures[future]
            try:
                future.result()
            except Exception as e:
                print(f"❌ Error with dataset {ds}: {e}")

except Exception as e:
    print(f"❌ Error fetching metadata: {e}")


⬇️ Downloading dataset: d_8b84c4ee58e3cfc0ece0d773c8ca6abc
⬇️ Downloading dataset: d_43f493c6c50d54243cc1eab0df142d6a
⬇️ Downloading dataset: d_2d5ff9ea31397b66239f245f57751537
⬇️ Downloading dataset: d_ebc5ab87086db484f88045b47411ebc5
⬇️ Downloading dataset: d_ea9ed51da2787afaf8e51f827c304208


KeyboardInterrupt: 

### Adding new column for resale prices

In [3]:
from dotenv import load_dotenv
import os
import re
import requests

load_dotenv()

True

In [2]:
## function to get latitude and longitude given text inputs using onemap api: https://www.onemap.gov.sg/apidocs/search
def get_lat_long(search_val, page_num=1):
    """    
    Parameters:
    search_val (str): The search text (address, postal code, etc.)
    page_num (int): Page number for results (default: 1)
    
    Returns:
    list: List of dictionaries containing lat, long, and address info for each result
    """
    url = "https://www.onemap.gov.sg/api/common/elastic/search"
    params = {
        "searchVal": search_val,
        "returnGeom": "Y",
        "getAddrDetails": "Y",
        "pageNum": page_num
    }
    api_key = os.getenv('ONE_MAP_API_KEY')
    
    if not api_key:
        raise ValueError("API key not found in environment variables")
    
    headers = {"Authorization": f"Bearer {api_key}"}   
    response = requests.get(url, headers=headers, params=params)
    data = response.json()
    results = []
    if 'results' in data and data['results']:
        for result in data['results']:
            results.append({
                'latitude': result.get('LATITUDE', 'N/A'),
                'longitude': result.get('LONGITUDE', 'N/A'),
                'address': result.get('ADDRESS', 'N/A'),
                'postal': result.get('POSTAL', 'N/A')
            })
    return results


results = get_lat_long("WOODLANDS DR 60")
for i, result in enumerate(results[:3]):  # First 3 results
    print(f"Result {i+1}:")
    print(f"  Address: {result['address']}")
    print(f"  Postal: {result['postal']}")
    print(f"  Latitude: {result['latitude']}")
    print(f"  Longitude: {result['longitude']}")
    print()

Result 1:
  Address: WOODLANDS DRIVE 60
  Postal: NIL
  Latitude: 1.4470300695077
  Longitude: 103.799839531114



#### methods for computing nearest public transport
- bus and lrt stops categorised together

In [3]:
import os
import requests
import math
import re

def get_nearest_mrt_stations(latitude, longitude, radius_in_m):
    """    
    Parameters:
        latitude (float): Latitude coordinate
        longitude (float): Longitude coordinate
        radius_in_m (int): Search radius in meters
    Returns:
        list: List of dictionaries containing station info
    """
    api_key = os.getenv('ONE_MAP_API_KEY')
    if not api_key:
        raise ValueError("API key not found in environment variables")

    headers = {"Authorization": f"Bearer {api_key}"}
    url = (
        f"https://www.onemap.gov.sg/api/public/nearbysvc/getNearestMrtStops?"
        f"latitude={latitude}&longitude={longitude}&radius_in_meters={radius_in_m}"
    )

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"API request failed with status code {response.status_code}")

    try:
        data = response.json()
        # print(data)
        return data
    except Exception as e:
        print(f"Error parsing JSON response: {e}")
        return []
    

def get_nearest_bus_stops(latitude, longitude, radius_in_m):
    """    
    Parameters:
        latitude (float): Latitude coordinate
        longitude (float): Longitude coordinate
        radius_in_m (int): Search radius in meters
    
    Returns:
        list: List of dictionaries containing bus stop info
    """
    api_key = os.getenv('ONE_MAP_API_KEY')
    if not api_key:
        raise ValueError("API key not found in environment variables")

    headers = {"Authorization": f"Bearer {api_key}"}
    url = (
        f"https://www.onemap.gov.sg/api/public/nearbysvc/getNearestBusStops?"
        f"latitude={latitude}&longitude={longitude}&radius_in_meters={radius_in_m}"
    )

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"API request failed with status code {response.status_code}")

    try:
        data = response.json()
        # print(data)
        return data
    except Exception as e:
        print(f"Error parsing JSON response: {e}")
        return []


def count_mrt_stations(latitude, longitude, radius_in_m):
    """
    Parameters:
        latitude (float): Latitude coordinate
        longitude (float): Longitude coordinate
        radius_in_m (int): Search radius in meters
    
    Returns:
        int: Number of MRT stations within the radius
    """
    stations = get_nearest_mrt_stations(latitude, longitude, radius_in_m)
    
    # Filter only MRT stations (not LRT)
    mrt_count = 0
    for station in stations:
        station_name = station.get('name', '').upper()
        # Check if it's an MRT station (not LRT)
        if re.search(r'MRT\s+STATION', station_name) or re.search(r'MRT$', station_name) or ('MRT' in station_name and 'LRT' not in station_name):
            mrt_count += 1
    
    return mrt_count


def count_lrt_stations(latitude, longitude, radius_in_m):
    """
    Parameters:
        latitude (float): Latitude coordinate
        longitude (float): Longitude coordinate
        radius_in_m (int): Search radius in meters
    Returns:
        int: Number of LRT stations within the radius
    """
    stations = get_nearest_mrt_stations(latitude, longitude, radius_in_m)
    
    # Filter only LRT stations (not MRT)
    lrt_count = 0
    for station in stations:
        station_name = station.get('name', '').upper()
        # Check if it's an LRT station (not MRT)
        if re.search(r'LRT\s+STATION', station_name) or re.search(r'LRT$', station_name) or ('LRT' in station_name and 'MRT' not in station_name):
            lrt_count += 1
    
    return lrt_count


def count_bus_stops(latitude, longitude, radius_in_m):
    """    
    Parameters:
        latitude (float): Latitude coordinate
        longitude (float): Longitude coordinate
        radius_in_m (int): Search radius in meters
    Returns:
        int: Number of bus stops within the radius
    """
    bus_stops = get_nearest_bus_stops(latitude, longitude, radius_in_m)
    return len(bus_stops)


def haversine_distance(lat1, lon1, lat2, lon2):
    """
    using the Haversine formula.
    Parameters:
        lat1, lon1: First point (latitude, longitude)
        lat2, lon2: Second point (latitude, longitude)
    
    Returns:
        float: Distance in kilometers
    """
    R = 6371  # Earth radius in kilometers

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat / 2)**2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    return distance


def nearest_mrt_distance(latitude, longitude):
    """
    Parameters:
        latitude (float): Latitude coordinate
        longitude (float): Longitude coordinate
    
    Returns:
        tuple: (distance in km, nearest station info) or (None, None) if no MRT found
    """
    stations = get_nearest_mrt_stations(latitude, longitude, 2000)  # Search up to 2km
    if not stations:
        return None, None

    min_dist = float('inf')
    nearest_station = None

    for station in stations:
        station_name = station.get('name', '').upper()
        # Only consider MRT stations
        if re.search(r'MRT\s+STATION', station_name) or re.search(r'MRT$', station_name) or ('MRT' in station_name and 'LRT' not in station_name):
            station_lat = station['lat']
            station_lon = station['lon']
            dist = haversine_distance(latitude, longitude, station_lat, station_lon)
            if dist < min_dist:
                min_dist = dist
                nearest_station = station

    if nearest_station:
        return round(min_dist, 3), nearest_station
    else:
        return None, None


def nearest_lrt_distance(latitude, longitude):
    """
    Parameters:
        latitude (float): Latitude coordinate
        longitude (float): Longitude coordinate
    
    Returns:
        tuple: (distance in km, nearest station info) or (None, None) if no LRT found
    """
    stations = get_nearest_mrt_stations(latitude, longitude, 2000)  # Search up to 2km
    if not stations:
        return None, None

    min_dist = float('inf')
    nearest_station = None

    for station in stations:
        station_name = station.get('name', '').upper()
        # Only consider LRT stations
        if re.search(r'LRT\s+STATION', station_name) or re.search(r'LRT$', station_name) or ('LRT' in station_name and 'MRT' not in station_name):
            station_lat = station['lat']
            station_lon = station['lon']
            dist = haversine_distance(latitude, longitude, station_lat, station_lon)
            if dist < min_dist:
                min_dist = dist
                nearest_station = station

    if nearest_station:
        return round(min_dist, 3), nearest_station
    else:
        return None, None


def nearest_bus_stop_distance(latitude, longitude):
    """    
    Parameters:
        latitude (float): Latitude coordinate
        longitude (float): Longitude coordinate
    
    Returns:
        tuple: (distance in km, nearest bus stop info) or (None, None) if no bus stop found
    """
    bus_stops = get_nearest_bus_stops(latitude, longitude, 2000)  # Search up to 2km
    if not bus_stops:
        return None, None

    min_dist = float('inf')
    nearest_stop = None

    for stop in bus_stops:
        stop_lat = stop['lat']
        stop_lon = stop['lon']
        dist = haversine_distance(latitude, longitude, stop_lat, stop_lon)
        if dist < min_dist:
            min_dist = dist
            nearest_stop = stop

    if nearest_stop:
        return round(min_dist, 3), nearest_stop
    else:
        return None, None

In [4]:
lat = 1.4470300695077
lng = 103.799839531114

count_mrt_1km = count_mrt_stations(lat, lng, 1000)
count_lrt_1km = count_lrt_stations(lat, lng, 400)
count_bus_1km = count_bus_stops(lat, lng, 400)

print(f"MRT stations within 1km: {count_mrt_1km}")
print(f"LRT stations within 1km: {count_lrt_1km}")
print(f"Bus stops within 1km: {count_bus_1km}")

mrt_distance, mrt_station = nearest_mrt_distance(lat, lng)
lrt_distance, lrt_station = nearest_lrt_distance(lat, lng)
bus_distance, bus_station = nearest_bus_stop_distance(lat, lng)

if mrt_distance:
    print(f"Nearest MRT distance: {mrt_distance} km")
if lrt_distance:
    print(f"Nearest LRT distance: {lrt_distance} km")
if bus_distance:
    print(f"Nearest Bus distance: {bus_distance} km")


MRT stations within 1km: 1
LRT stations within 1km: 0
Bus stops within 1km: 11
Nearest MRT distance: 0.733 km
Nearest Bus distance: 0.13 km


In [5]:
def get_new_features(text_address):
    results = get_lat_long(text_address)[0]
    lat = float(results["latitude"])
    long = float(results["longitude"])
    result = {}
    
    result["num_mrt_within_1k"] = count_mrt_stations(lat, long, 1000)
    result["num_lrt_bs_within_400"] = count_lrt_stations(lat, long, 400) + count_bus_stops(lat, long, 400)
    
    # Handle nearest MRT distance
    nearest_mrt_dist, _ = nearest_mrt_distance(lat, long)
    result["nearest_mrt_distance"] = nearest_mrt_dist if nearest_mrt_dist is not None else 0
    
    # Handle nearest LRT/Bus stop distance
    lrt_dist, _ = nearest_lrt_distance(lat, long)
    bus_dist, _ = nearest_bus_stop_distance(lat, long)
    
    # Get minimum non-None distance, or 0 if both are None
    distances = [d for d in [lrt_dist, bus_dist] if d is not None]
    result["nearest_lrt_bs_distance"] = min(distances) if distances else 0
    
    return result

In [6]:
print(get_new_features("Punggol Field Walk"))

{'num_mrt_within_1k': 0, 'num_lrt_bs_within_400': 5, 'nearest_mrt_distance': 1.807, 'nearest_lrt_bs_distance': 0.176}


In [7]:
import polars as pl
import glob
import os
from tqdm import tqdm
import time

In [9]:
import pandas as pd
import glob
import os
from tqdm import tqdm

pattern = "data/resale_*.csv"
csv_files = glob.glob(pattern)

# First pass: count total rows for overall progress
total_rows = 0
for file_path in csv_files:
    df = pd.read_csv(file_path)
    total_rows += len(df)

# Process files with overall progress tracking
with tqdm(total=total_rows, desc="Overall Progress") as pbar:
    for file_path in csv_files:
        print(f"Processing {file_path}...")
        
        df = pd.read_csv(file_path)
        
        df['full_address'] = df['block'].astype(str) + ', ' + df['street_name']
        
        # Apply get_new_features to each row
        feature_data = []
        for addr in df['full_address']:
            try:
                features = get_new_features(addr)
                feature_data.append(features)
            except Exception as e:
                print(f"Error processing address '{addr}': {e}")
                feature_data.append({
                    "num_mrt_within_1k": 0,
                    "num_lrt_bs_within_400": 0,
                    "nearest_mrt_distance": 0,
                    "nearest_lrt_bs_distance": 0
                })
            # Update progress bar
            pbar.update(1)
        
        # Convert to DataFrame and merge
        features_df = pd.DataFrame(feature_data)
        df = pd.concat([df, features_df], axis=1)
        
        # Save back to file
        df.to_csv(f"{file_path}_updated", index=False)
        print(f"Updated: {file_path}")

Overall Progress:   0%|                                                                                                                                                                               | 0/960898 [00:00<?, ?it/s]

Processing data/resale_d_2d5ff9ea31397b66239f245f57751537.csv...


Overall Progress:   0%|                                                                                                                                                                   | 2/960898 [00:06<847:04:04,  3.17s/it]


KeyboardInterrupt: 

### Loading into DB

In [13]:
import sqlite3
import pandas as pd
from pathlib import Path
import os

# Setup paths
data_dir = Path("./data")
db_path = data_dir / "hdb_prices.db"

data_dir.mkdir(exist_ok=True)

# Remove existing DB if rewriting
if db_path.exists():
    os.remove(db_path)
    print("🗑️ Old database removed")

# Connect to database
try:
    conn = sqlite3.connect(db_path)
    print("✅ Connected successfully!")
except Exception as e:
    print(f"❌ Connection failed: {e}")
    raise

cursor = conn.cursor()

# Create tables (fresh schema)
cursor.execute("""
CREATE TABLE bto_prices (
    _id INTEGER PRIMARY KEY AUTOINCREMENT,
    financial_year TEXT,
    room_type TEXT,
    town TEXT,
    min_selling_price REAL,
    max_selling_price REAL,
    min_selling_price_less_ahg_shg REAL,
    max_selling_price_less_ahg_shg REAL
)
""")

cursor.execute("""
CREATE TABLE resale_prices (
    _id INTEGER PRIMARY KEY AUTOINCREMENT,
    month TEXT,
    town TEXT,
    flat_type TEXT,
    flat_model TEXT,
    block TEXT,
    street_name TEXT,
    storey_range TEXT,
    floor_area_sqm REAL,
    lease_commence_date TEXT,
    resale_price REAL
)
""")

conn.commit()

# Match schema exactly
resale_columns = [
    "month", "town", "flat_type", "flat_model", "block", "street_name",
    "storey_range", "floor_area_sqm", "lease_commence_date", "resale_price"
]

bto_columns = [
    "financial_year", "room_type", "town",
    "min_selling_price", "max_selling_price",
    "min_selling_price_less_ahg_shg", "max_selling_price_less_ahg_shg"
]

def load_csv_to_sqlite(csv_file, table_name, table_columns):
    df = pd.read_csv(csv_file)

    # Keep only valid columns
    available_columns = [col for col in table_columns if col in df.columns]
    df = df[available_columns]

    # Add missing columns as None
    for col in table_columns:
        if col not in df.columns:
            df[col] = None

    # Normalize: lowercase everything (ensure bto and resale consistent)
    df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

    # Special handling for flat_type (ensure bto and resale consistent)
    if "flat_type" in df.columns:
        df["flat_type"] = (
            df["flat_type"]
            .str.replace(r"(\d+)\s*room", r"\1-room", regex=True)
        )

    # Reorder
    df = df[table_columns]

    # Insert
    df.to_sql(table_name, conn, if_exists="append", index=False)
    print(f"✅ Loaded {len(df)} rows into {table_name}")


# Load BTO data
bto_file = data_dir / "BTO_prices_Apr2008_Mar2023.csv"
if bto_file.exists():
    load_csv_to_sqlite(bto_file, "bto_prices", bto_columns)

# Load Resale data
for resale_file in data_dir.glob("resale_*.csv"):
    load_csv_to_sqlite(resale_file, "resale_prices", resale_columns)

# Close connection
conn.close()
print("🔒 Connection closed")

🗑️ Old database removed
✅ Connected successfully!
✅ Loaded 333 rows into bto_prices


  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


✅ Loaded 52203 rows into resale_prices


  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


✅ Loaded 369651 rows into resale_prices


  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


✅ Loaded 214695 rows into resale_prices


  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


✅ Loaded 37153 rows into resale_prices


  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


✅ Loaded 287196 rows into resale_prices
🔒 Connection closed
