In [2]:
import os
from pathlib import Path
import duckdb
import platform

# Project paths: `data/` lives next to (i.e. under the parent of) the current
# working directory, and holds both the database file and the raw source files.
DATA_DIRECTORY = Path(os.path.abspath("")).parent / "data"
DATABASE_PATH = DATA_DIRECTORY / "stage_1.db"
SOURCE_DATA_DIRECTORY = DATA_DIRECTORY / "source_data"

# Only Windows hosts take the proxy from the environment; every other platform
# uses an empty proxy. `.get` falls back to the same empty value instead of
# raising KeyError when `http_proxy` is not set.
if platform.system() == "Windows":
    HTTP_PROXY = os.environ.get("http_proxy", "")
else:
    HTTP_PROXY = ""

In [None]:
# Start from a clean slate: remove any database file left by a previous run.
DATABASE_PATH.unlink(missing_ok=True)

# Connecting creates the database file. A single context-managed connection is
# used so the handle is closed on exit (no second, never-closed connection).
with duckdb.connect(str(DATABASE_PATH)) as connection:
    # Set the proxy first so it is in effect when the extension is installed.
    connection.sql(f"SET http_proxy TO '{HTTP_PROXY}'")
    connection.sql("INSTALL spatial")
    # NOTE(review): LOAD presumably only affects this connection; later cells
    # open new connections and may need to LOAD spatial again — confirm.
    connection.sql("LOAD spatial")
print(f"✅ Created a persistent database at: {DATABASE_PATH}")

## Load DCP PLUTO data

In [None]:
pluto_csv_path = SOURCE_DATA_DIRECTORY / "pluto.csv"

print("Creating PLUTO table from local csv ...")
with duckdb.connect(str(DATABASE_PATH)) as connection:
    # Show the schema DuckDB infers from the csv before importing it.
    connection.sql(f"DESCRIBE TABLE '{pluto_csv_path}'").show()
    # OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails with
    # "table already exists" when the cell is executed a second time.
    connection.sql(
        f"CREATE OR REPLACE TABLE pluto AS SELECT * FROM '{pluto_csv_path}'"
    )
    # Row count as a quick sanity check of the import.
    connection.sql("SELECT count(*) FROM pluto").show()

print("✅ Loaded PLUTO data")

## Load Airbnb NYC data

In [None]:
airbnb_nyc_listings_url = "https://data.insideairbnb.com/united-states/ny/new-york-city/2024-07-05/visualisations/listings.csv"
airbnb_nyc_detailed_listings_url = "https://data.insideairbnb.com/united-states/ny/new-york-city/2024-07-05/data/listings.csv.gz"

# One entry per table to create:
# (table name, source url, wording used in the progress message).
airbnb_sources = [
    (
        "airbnb_nyc_listings",
        airbnb_nyc_listings_url,
        "listings table from remote csv file",
    ),
    (
        "airbnb_nyc_detailed_listings",
        airbnb_nyc_detailed_listings_url,
        "detailed listings table from remote compressed csv file",
    ),
]

# Both tables are loaded the same way, so a single connection and a loop
# replace the two copy-pasted blocks; the proxy is set once for the connection.
with duckdb.connect(str(DATABASE_PATH)) as connection:
    connection.sql(f"SET http_proxy TO '{HTTP_PROXY}'")

    for table_name, source_url, source_description in airbnb_sources:
        print(f"Creating Airbnb NYC {source_description} ...")
        # Show the schema DuckDB infers from the remote file.
        connection.sql(f"DESCRIBE TABLE '{source_url}'").show()
        # OR REPLACE keeps the cell re-runnable against an existing database.
        connection.sql(
            f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM '{source_url}'"
        )
        # Row count as a quick sanity check of the import.
        connection.sql(f"SELECT count(*) FROM {table_name}").show()

print("✅ Loaded Airbnb data")

## Load TLC taxi data

In [None]:
yellow_cab_trips_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet"
green_cab_trips_url = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet"
)
for_hire_trips_url = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2024-01.parquet"
)
high_volume_for_hire_trips_url = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-01.parquet"
)
tlc_trip_urls = [
    yellow_cab_trips_url,
    green_cab_trips_url,
    for_hire_trips_url,
    high_volume_for_hire_trips_url,
]

# Build the SQL list literal explicitly instead of interpolating the Python
# list repr: repr switches to double quotes when a string contains a single
# quote, which would no longer be a SQL string literal. Quotes are doubled so
# each url stays a valid single-quoted SQL string.
tlc_trip_urls_sql = (
    "["
    + ", ".join("'" + url.replace("'", "''") + "'" for url in tlc_trip_urls)
    + "]"
)

print("Creating a TLC trips table from remote parquet files ...")
with duckdb.connect(str(DATABASE_PATH)) as connection:
    connection.sql(f"SET http_proxy TO '{HTTP_PROXY}'")

    # Show each file's schema separately before they are combined.
    for remote_table in tlc_trip_urls:
        print(f"Describing {remote_table} ...")
        connection.sql(f"DESCRIBE TABLE '{remote_table}'").show()

    print("Creating table 'tlc_trips' from all files ...")
    # union_by_name combines the files by column name; filename = true adds a
    # column recording which file each row came from. OR REPLACE keeps the
    # cell re-runnable against an existing database.
    connection.sql(
        f"CREATE OR REPLACE TABLE tlc_trips AS SELECT * FROM read_parquet({tlc_trip_urls_sql}, union_by_name = true, filename = true)"
    )
    connection.sql("SHOW ALL TABLES").show()

    print("Describing table 'tlc_trips' ...")
    connection.sql("DESCRIBE TABLE tlc_trips").show()

    # fetchone() returns the single result row as a tuple, so the scalar can
    # be read directly without materialising a pandas DataFrame.
    tlc_trips_length = connection.sql(
        "select count(*) as row_count from tlc_trips"
    ).fetchone()[0]
    print(f"The table 'tlc_trips' has {tlc_trips_length:,} rows")

print("✅ Loaded Taxi data")