In [None]:
#Postgres Quick Start in Jupyter (VS Code)
#Goal: Connect to PostgreSQL with SQLAlchemy/psycopg, load a small dataset, and run SELECT/WHERE/ORDER BY.
#Note: I created a DB called envdb, along with env_user and env_pass, in the console prior to this notebook

In [1]:
#Install/verify Python deps
#  sqlalchemy      - provides create_engine/text, imported in the connection cell
#  psycopg[binary] - PostgreSQL driver named by the "postgresql+psycopg" engine URL
#  pandas          - DataFrames used to load the data and display query results
#  python-dotenv   - installed here, but not imported in the cells shown in this notebook
#%pip (rather than !pip) installs into the environment the running kernel uses; -q keeps pip quiet
%pip install -q sqlalchemy psycopg[binary] pandas python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [None]:
#Connect to PostgreSQL
import os   #lets python read environment variables (user name, password, db name)
import pandas as pd #python's data analysis / tabular data library (query results -> DataFrames, export to csv)
from sqlalchemy import create_engine, text #database connector library; create_engine builds the link between python and our db, text wraps raw SQL
from sqlalchemy.engine import URL #builds a connection URL from separate parts and escapes special characters

#Read connection settings from the standard PG* environment variables.
#The fallbacks are the local demo user/database created before this notebook;
#set PGUSER/PGPASSWORD/etc. in the environment to use anything else.
DB_USER = os.getenv("PGUSER", "env_user")
DB_PASS = os.getenv("PGPASSWORD", "env_pass")
DB_NAME = os.getenv("PGDATABASE", "envdb")
DB_HOST = os.getenv("PGHOST", "localhost")
DB_PORT = os.getenv("PGPORT", "5432")

#Build the URL from its parts instead of an f-string: a user name or password
#containing characters such as "@", "/", ":" or "%" would otherwise produce a
#malformed URL. URL.create escapes each component for us.
db_url = URL.create(
    drivername="postgresql+psycopg",
    username=DB_USER,
    password=DB_PASS,
    host=DB_HOST,
    port=int(DB_PORT),  #environment variables are strings; the URL expects an integer port
    database=DB_NAME,
)
engine = create_engine(db_url) #connection factory used by every later cell (connections, queries, tables)
pd.set_option("display.precision", 3)

#smoke test: open/close a connection
with engine.connect() as conn:
    version = conn.execute(text("SELECT version();")).scalar_one()
#version() returns e.g. "PostgreSQL 18.0 on <platform>..."; keep only the part before " on "
print("Connected to:", version.split(" on ")[0])



Connected to: PostgreSQL 18.0


In [7]:
#Create a table (schema)

#Column layout for water_quality; IF NOT EXISTS lets this cell be re-run without error
ddl = """
CREATE TABLE IF NOT EXISTS water_quality (
    sample_id   INTEGER PRIMARY KEY,
    site        TEXT,
    date        DATE,
    parameter   TEXT,
    value       DOUBLE PRECISION,
    units       TEXT
);
"""

#engine.begin() opens a transaction that is committed when the block exits without an error
create_table_stmt = text(ddl)
with engine.begin() as txn:
    txn.execute(create_table_stmt)
print("Table ensured: water_quality")

Table ensured: water_quality


In [None]:
#Load example data

#Define data and column names in object
data = [
    (1,  "Willow Creek",    "2025-08-12", "NO3_N", 2.3,  "mg/L"),
    (2,  "Willow Creek",    "2025-09-02", "NO3_N", 1.8,  "mg/L"),
    (3,  "Willow Creek",    "2025-09-28", "NO3_N", 2.9,  "mg/L"),
    (4,  "Bear River",      "2025-08-15", "NO3_N", 0.9,  "mg/L"),
    (5,  "Bear River",      "2025-09-04", "NO3_N", 1.2,  "mg/L"),
    (6,  "Bear River",      "2025-09-20", "NO3_N", 0.7,  "mg/L"),
    (7,  "Columbia Slough", "2025-08-10", "NO3_N", 3.1,  "mg/L"),
    (8,  "Columbia Slough", "2025-08-30", "NO3_N", 2.7,  "mg/L"),
    (9,  "Columbia Slough", "2025-09-21", "NO3_N", 3.8,  "mg/L"),
    (10, "Willow Creek",    "2025-08-12", "TP",    0.08, "mg/L"),
    (11, "Willow Creek",    "2025-09-02", "TP",    0.05, "mg/L"),
    (12, "Bear River",      "2025-08-15", "TP",    0.03, "mg/L"),
    (13, "Bear River",      "2025-09-20", "TP",    0.04, "mg/L"),
    (14, "Columbia Slough", "2025-08-10", "TP",    0.11, "mg/L"),
    (15, "Columbia Slough", "2025-09-21", "TP",    0.09, "mg/L"),
]
cols = ["sample_id", "site", "date", "parameter", "value", "units"]

#Convert data/column names to a DataFrame, turning the ISO date strings into
#datetime.date objects so they are sent to Postgres as DATE values, not text
df = pd.DataFrame(data, columns=cols).assign(
    date=lambda frame: pd.to_datetime(frame["date"]).dt.date
)

#Push dataframe into Postgres, keeping the schema from the "Create a table" cell.
#to_sql(..., if_exists="replace") would drop that table and recreate it from
#pandas-inferred column types, losing the PRIMARY KEY and making date a TEXT column.
#Instead: drop any existing table, recreate it from the ddl string defined in the
#"Create a table" cell (run that cell first), and append the rows.
#All three steps share one transaction, so a failure rolls everything back.
with engine.begin() as conn:
    conn.execute(text("DROP TABLE IF EXISTS water_quality"))
    conn.execute(text(ddl))
    df.to_sql("water_quality", conn, if_exists="append", index=False)
len(df)


15

In [13]:
#Helper to run SQL (Postgres version) - makes repetitive tasks easier and cleaner (combines the code for connecting, executing, and formatting results, so our subsequent queries need less code)
#Note: this helper is written so we can keep using :placeholders in our queries, identical to SQLite. That way the placeholder convention does not need to change when writing queries for both SQLite and PostgreSQL

def run_sql(sql: str, params: dict | None = None): #sql queries written python string; params can key:value, or neither
    with engine.connect() as conn:
        result = conn.execute(text(sql), params or {})
        df = pd.DataFrame(result.fetchall(), columns=result.keys())
    return df

#Smoke test: count the rows currently in water_quality
row_count_sql = "SELECT COUNT(*) AS n_rows FROM water_quality"
run_sql(row_count_sql)

Unnamed: 0,n_rows
0,15


In [15]:
#Contrast queries (same SQL I used for SQLite)

#WHERE + ORDER BY (same syntax): NO3_N rows above :min_val, largest value first
no3_above_min_sql = """
SELECT site, date, parameter, value
        FROM water_quality
        WHERE parameter = 'NO3_N' AND value > :min_val
        ORDER BY value DESC;
"""
no3_min_params = {"min_val": 2.0}   #value bound to the :min_val placeholder
run_sql(no3_above_min_sql, no3_min_params)

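#note: through SQLAlchemy's text() the variable is written as a named :min_val placeholder and the values are passed as a dict; with the psycopg driver used directly, %s would be the placeholder and a tuple such as (2.0,) would be passed instead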

Unnamed: 0,site,date,parameter,value
0,Columbia Slough,2025-09-21,NO3_N,3.8
1,Columbia Slough,2025-08-10,NO3_N,3.1
2,Willow Creek,2025-09-28,NO3_N,2.9
3,Columbia Slough,2025-08-30,NO3_N,2.7
4,Willow Creek,2025-08-12,NO3_N,2.3


In [16]:
#Contrast queries

#BETWEEN on the date column (same syntax as SQLite); both bounds are inclusive
no3_date_range_sql = """
SELECT site, date, parameter, value
FROM water_quality
WHERE parameter = 'NO3_N'
    AND date BETWEEN :start AND :end
ORDER BY date;
"""
date_window = {"start": "2025-08-01", "end": "2025-09-15"}  #values bound to :start / :end
run_sql(no3_date_range_sql, date_window)

Unnamed: 0,site,date,parameter,value
0,Columbia Slough,2025-08-10,NO3_N,3.1
1,Willow Creek,2025-08-12,NO3_N,2.3
2,Bear River,2025-08-15,NO3_N,0.9
3,Columbia Slough,2025-08-30,NO3_N,2.7
4,Willow Creek,2025-09-02,NO3_N,1.8
5,Bear River,2025-09-04,NO3_N,1.2


In [17]:
#ORDER BY examples

# Highest NO3_N first (DESC = descending order)
no3_ranked_sql = """
SELECT site, date, value
FROM water_quality
WHERE parameter = 'NO3_N'
ORDER BY value DESC;
"""
run_sql(no3_ranked_sql)

Unnamed: 0,site,date,value
0,Columbia Slough,2025-09-21,3.8
1,Columbia Slough,2025-08-10,3.1
2,Willow Creek,2025-09-28,2.9
3,Columbia Slough,2025-08-30,2.7
4,Willow Creek,2025-08-12,2.3
5,Willow Creek,2025-09-02,1.8
6,Bear River,2025-09-04,1.2
7,Bear River,2025-08-15,0.9
8,Bear River,2025-09-20,0.7


In [18]:
#ORDER BY examples

# Two sort keys: site A→Z first, then date newest→oldest within each site
by_site_then_date_sql = """
SELECT site, date, parameter, value
FROM water_quality
ORDER BY site ASC, date DESC;
"""
run_sql(by_site_then_date_sql)

Unnamed: 0,site,date,parameter,value
0,Bear River,2025-09-20,NO3_N,0.7
1,Bear River,2025-09-20,TP,0.04
2,Bear River,2025-09-04,NO3_N,1.2
3,Bear River,2025-08-15,NO3_N,0.9
4,Bear River,2025-08-15,TP,0.03
5,Columbia Slough,2025-09-21,TP,0.09
6,Columbia Slough,2025-09-21,NO3_N,3.8
7,Columbia Slough,2025-08-30,NO3_N,2.7
8,Columbia Slough,2025-08-10,NO3_N,3.1
9,Columbia Slough,2025-08-10,TP,0.11


In [19]:
#Save results to a csv/folder

from pathlib import Path

#Create the output folder next to the notebook if it is not there yet
csv_folder = Path("csv")
csv_folder.mkdir(exist_ok=True)

#All columns for the TP rows, largest value first
tp_sql = """
SELECT *
FROM water_quality
WHERE parameter = 'TP'
ORDER BY value DESC;
"""
result = run_sql(tp_sql)

#Write the query result without the DataFrame index column
out_path = csv_folder.joinpath("tp_results_postgres.csv")
result.to_csv(out_path, index=False)
f"Saved: {out_path.resolve()}"

'Saved: C:\\Users\\chamb\\env-data-analyst-oct\\notebooks\\csv\\tp_results_postgres.csv'