In [14]:
#Install dependencies
%pip install -q sqlalchemy psycopg[binary] pandas python-dotenv #for...SQL engine, PostgreSQL driver, DataFrames, for loading .env files

Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: '#for...SQL': Expected package name at the start of dependency specifier
    #for...SQL
    ^


In [15]:
# Quick sanity check: confirm that each dependency can be imported in this kernel.
# An ImportError here means the package is missing from the kernel's environment
# (or the kernel needs a restart after installing).
import sqlalchemy
import psycopg
import pandas
import dotenv  # import name of the python-dotenv package

print("All imports OK")

All imports OK


In [16]:
# Load variables from a .env file into the process environment so the
# os.getenv(...) lookups in later cells can see them.
from dotenv import load_dotenv
# Bare last expression: the return value of load_dotenv() is shown as the cell output.
load_dotenv()

True

In [17]:
# Create an engine (the connection factory between Python and PostgreSQL) and test it.
import os
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# Connection settings are read from the environment; the second argument of each
# os.getenv call is only a fallback used when the variable is not set.
DB_USER = os.getenv("PGUSER", "env_user")
DB_PASS = os.getenv("PGPASSWORD", "env_pass")
DB_NAME = os.getenv("PGDATABASE", "envdb")
DB_HOST = os.getenv("PGHOST", "localhost")
DB_PORT = os.getenv("PGPORT", "5433")  # host-side port; use "5432" if you did not remap it

# Build the URL from its components rather than an f-string so that special
# characters in the user name or password (e.g. "@", ":", "/") are escaped
# correctly instead of corrupting the connection string.
db_url = URL.create(
    drivername="postgresql+psycopg",
    username=DB_USER,
    password=DB_PASS,
    host=DB_HOST,
    port=int(DB_PORT),  # raises ValueError early if PGPORT is not numeric
    database=DB_NAME,
)
engine = create_engine(db_url)

# Smoke test: open a connection and print the server's version string.
with engine.connect() as conn:
    print(conn.execute(text("SELECT version();")).scalar_one())

PostgreSQL 16.4 (Debian 16.4-1.pgdg110+2) on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit


In [23]:
# Enable the PostGIS extension; IF NOT EXISTS makes re-running this cell harmless.
from sqlalchemy import text

enable_postgis_stmt = text("CREATE EXTENSION IF NOT EXISTS postgis;")

# engine.begin() wraps the statement in a transaction that is committed on exit.
with engine.begin() as connection:
    connection.execute(enable_postgis_stmt)

In [None]:
# Confirm you're now talking to the Docker PostGIS container.
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
import os
from dotenv import load_dotenv

# override=True makes values from .env replace variables already set in the process.
load_dotenv(override=True)

# Fail early with a readable message: os.getenv() returns None for an unset
# variable, which would otherwise end up as the literal text "None" in the URL.
required_vars = ("PGUSER", "PGPASSWORD", "PGHOST", "PGPORT", "PGDATABASE")
missing_vars = [name for name in required_vars if os.getenv(name) is None]
if missing_vars:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(missing_vars)
    )

# URL.create escapes special characters in the credentials, unlike string formatting.
engine = create_engine(
    URL.create(
        drivername="postgresql+psycopg",
        username=os.environ["PGUSER"],
        password=os.environ["PGPASSWORD"],
        host=os.environ["PGHOST"],
        port=int(os.environ["PGPORT"]),
        database=os.environ["PGDATABASE"],
    )
)

with engine.connect() as conn:
    print("server_version:", conn.execute(text("SELECT version();")).scalar_one())
    # Port the server itself listens on (as reported by the server, not the client).
    print("server_port   :", conn.execute(text("SELECT inet_server_port();")).scalar_one())
    # Names of the PostGIS-related extensions that are available on this server.
    print("postgis avail :", conn.execute(text(
        "SELECT array_agg(name) FROM pg_available_extensions WHERE name LIKE 'postgis%';"
    )).scalar_one())

server_version: PostgreSQL 16.4 (Debian 16.4-1.pgdg110+2) on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
server_port   : 5432
postgis avail : ['postgis_topology-3', 'postgis_sfcgal-3', 'postgis', 'postgis-3', 'postgis_raster-3', 'postgis_tiger_geocoder-3', 'postgis_topology', 'postgis_sfcgal', 'postgis_raster', 'postgis_tiger_geocoder']


In [22]:
# Double-check which port Python will use: show the PGPORT value from the environment.
import os

pgport_from_env = os.getenv("PGPORT")
print("PGPORT from env:", pgport_from_env)

PGPORT from env: 5433


In [None]:
# Inside Docker:
# PostgreSQL (with PostGIS) listens on port 5432 inside the container.
# → That is why inet_server_port() returns 5432.

# On your host (Windows):
# Docker maps host port 5433 → container port 5432.
# → So from outside the container (Python, pgAdmin, etc.), you connect through localhost:5433.

# This lets you work in a custom environment provided by Docker, separate from your PC's native environment (its own collection of settings and applications).

In [25]:
# Create the water_quality table and load sample data.
#
# The table is dropped and recreated from the explicit DDL, and the rows are then
# APPENDED. Using to_sql(if_exists="replace") instead would drop the DDL-defined
# table and recreate it with pandas-inferred column types, losing the PRIMARY KEY
# and storing `date` as text.

from sqlalchemy import text
import pandas as pd

ddl = """
CREATE TABLE IF NOT EXISTS water_quality (
    sample_id   INTEGER PRIMARY KEY,
    site        TEXT,
    date        DATE,
    parameter   TEXT,
    value       DOUBLE PRECISION,
    units       TEXT
);
"""

# (sample_id, site, date, parameter, value, units)
data = [
    (1,  "Willow Creek",    "2025-08-12", "NO3_N", 2.3,  "mg/L"),
    (2,  "Willow Creek",    "2025-09-02", "NO3_N", 1.8,  "mg/L"),
    (3,  "Willow Creek",    "2025-09-28", "NO3_N", 2.9,  "mg/L"),
    (4,  "Bear River",      "2025-08-15", "NO3_N", 0.9,  "mg/L"),
    (5,  "Bear River",      "2025-09-04", "NO3_N", 1.2,  "mg/L"),
    (6,  "Bear River",      "2025-09-20", "NO3_N", 0.7,  "mg/L"),
    (7,  "Columbia Slough", "2025-08-10", "NO3_N", 3.1,  "mg/L"),
    (8,  "Columbia Slough", "2025-08-30", "NO3_N", 2.7,  "mg/L"),
    (9,  "Columbia Slough", "2025-09-21", "NO3_N", 3.8,  "mg/L"),
    (10, "Willow Creek",    "2025-08-12", "TP",    0.08, "mg/L"),
    (11, "Willow Creek",    "2025-09-02", "TP",    0.05, "mg/L"),
    (12, "Bear River",      "2025-08-15", "TP",    0.03, "mg/L"),
    (13, "Bear River",      "2025-09-20", "TP",    0.04, "mg/L"),
    (14, "Columbia Slough", "2025-08-10", "TP",    0.11, "mg/L"),
    (15, "Columbia Slough", "2025-09-21", "TP",    0.09, "mg/L"),
]
df = pd.DataFrame(data, columns=["sample_id","site","date","parameter","value","units"])

# Convert the ISO date strings to real date objects for the DATE column.
# `df` itself is left unchanged; only the frame being written is converted.
rows_to_load = df.assign(date=pd.to_datetime(df["date"]).dt.date)

# One transaction: either the table is rebuilt and fully loaded, or nothing changes.
with engine.begin() as conn:
    # Drop any table left by a previous run so the schema below always applies
    # and re-running the cell does not hit primary-key conflicts.
    conn.execute(text("DROP TABLE IF EXISTS water_quality;"))
    conn.execute(text(ddl))
    rows_to_load.to_sql("water_quality", conn, if_exists="append", index=False)

pd.read_sql_query("SELECT COUNT(*) AS n_rows FROM water_quality;", engine)

Unnamed: 0,n_rows
0,15


In [None]:
# Quick queries (same SQL as SQLite, just run through SQLAlchemy)

from sqlalchemy import text
import pandas as pd

def run_sql(sql: str, params: dict | None = None):
    """Run a SQL statement on the notebook's `engine` and return the rows as a DataFrame.

    Args:
        sql: SQL text; may contain named bind parameters such as ``:min_val``.
        params: Mapping of bind-parameter names to values. ``None`` (or an
            empty mapping) means the statement is executed without parameters.

    Returns:
        A pandas DataFrame holding every fetched row, with columns named
        after the result's keys.
    """
    bind_values = params if params else {}
    with engine.connect() as connection:
        result = connection.execute(text(sql), bind_values)
        fetched_rows = result.fetchall()
        column_names = result.keys()
        return pd.DataFrame(fetched_rows, columns=column_names)

# Nitrate (NO3_N) samples whose value exceeds a threshold, highest value first.
nitrate_above_threshold_sql = """
SELECT site, date, parameter, value
FROM water_quality
WHERE parameter = 'NO3_N' AND value > :min_val
ORDER BY value DESC;
"""
nitrate_threshold_params = {"min_val": 2.0}

run_sql(nitrate_above_threshold_sql, nitrate_threshold_params)


Unnamed: 0,site,date,parameter,value
0,Columbia Slough,2025-09-21,NO3_N,3.8
1,Columbia Slough,2025-08-10,NO3_N,3.1
2,Willow Creek,2025-09-28,NO3_N,2.9
3,Columbia Slough,2025-08-30,NO3_N,2.7
4,Willow Creek,2025-08-12,NO3_N,2.3


In [None]:

# Quick query: NO3_N samples within a date range (same SQL as SQLite, just run through SQLAlchemy)

# Nitrate (NO3_N) samples taken between two dates (inclusive), oldest first.
nitrate_date_range_sql = """
SELECT site, date, parameter, value
FROM water_quality
WHERE parameter = 'NO3_N' AND date BETWEEN :start AND :end
ORDER BY date;
"""
date_range_params = {"start": "2025-08-01", "end": "2025-09-15"}

run_sql(nitrate_date_range_sql, date_range_params)

Unnamed: 0,site,date,parameter,value
0,Columbia Slough,2025-08-10,NO3_N,3.1
1,Willow Creek,2025-08-12,NO3_N,2.3
2,Bear River,2025-08-15,NO3_N,0.9
3,Columbia Slough,2025-08-30,NO3_N,2.7
4,Willow Creek,2025-09-02,NO3_N,1.8
5,Bear River,2025-09-04,NO3_N,1.2
