In [40]:
import duckdb as ddb


In [41]:
# httpfs is the DuckDB extension used to download data over http/https connections,
# which is what reading from S3 (AWS) relies on.
# NOTE(review): ddb.sql is the module-level call, so this presumably runs on DuckDB's
# default connection rather than on a file-backed one — confirm httpfs is also
# available on the connection that reads from S3.
httpfs_setup = "INSTALL httpfs; LOAD httpfs"
ddb.sql(httpfs_setup)

In [42]:
# Connect to a DuckDB database. This will create the database file in our file structure.
con = ddb.connect("../air_quality.db")

# Extensions are loaded per database instance, so make sure httpfs is installed and
# loaded on this connection: it is the one used for the S3 reads further down.
con.install_extension("httpfs")
con.load_extension("httpfs")

# Last expression of the cell: display the connection object.
con

<duckdb.duckdb.DuckDBPyConnection at 0x10733e4f0>

In [43]:
# Add a schema called 'raw' to the database.
# IF NOT EXISTS makes this safe to run more than once.
# .execute runs the SQL statement directly on the connection.
schema_ddl = "CREATE schema IF NOT EXISTS raw"
con.execute(schema_ddl)

<duckdb.duckdb.DuckDBPyConnection at 0x10733e4f0>

In [44]:
# DuckDB looks for credentials whenever it talks to AWS S3.
# The bucket we read from is public, so the values are left empty,
# but the credential settings themselves still have to be set.
s3_settings = """ 
    SET s3_access_key_id='';
    SET s3_secret_access_key='';
    SET s3_region='';
"""
con.sql(s3_settings)


In [50]:
# Create the target table for the raw air-quality records.
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE fails on the
# second run because the table already exists in the database file.
# Note: the double-quoted names are identifiers; quoting distinguishes them from
# DuckDB keywords of the same spelling.
# ingestion_datetime defaults to the time the row is inserted.
con.execute("""
    CREATE TABLE IF NOT EXISTS raw.air_quality (
        location_id INTEGER,
        sensors_id INTEGER,
        "location" TEXT,
        "datetime" TIMESTAMP,
        lat DOUBLE,
        lon DOUBLE,
        "parameter" TEXT,
        units TEXT,
        "value" DOUBLE,
        "month" INTEGER,
        "year" INTEGER,
        ingestion_datetime TIMESTAMP DEFAULT current_timestamp
    );
""")


ConnectionException: Connection Error: Connection already closed!

In [None]:
# Load the CSV files for location 2009, January 2024, from the S3 archive into
# raw.air_quality.
# The target columns are listed explicitly so the insert does not silently depend
# on the order in which the table's columns were declared.
# Note: this statement appends; running the cell again inserts the same rows a
# second time.
con.execute("""
INSERT INTO raw.air_quality (
    location_id,
    sensors_id,
    "location",
    "datetime",
    lat,
    lon,
    "parameter",
    units,
    "value",
    "month",
    "year",
    ingestion_datetime
)
SELECT
    location_id,
    sensors_id,
    "location",
    "datetime",
    lat,
    lon,
    "parameter",
    units,
    "value",
    "month",
    "year",
    current_timestamp AS ingestion_datetime
FROM read_csv('s3://openaq-data-archive/records/csv.gz/locationid=2009/year=2024/month=01/*.csv.gz');
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x10733e4f0>

In [49]:
# Close the connection to DuckDB once all the statements above have run.
# Note: DuckDB does not allow concurrent writers to the db.
# Any cell executed on `con` after this one fails with
# "Connection Error: Connection already closed!", so run this cell last
# (or reconnect before re-running an earlier cell).
con.close()