In [1]:
#Imports and configuration
# Keep this at the top of the notebook
import os
from pathlib import Path
import pandas as pd
import geopandas as gpd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

# Load .env if you keep secrets there
load_dotenv()

# Connection values — prefer env vars; fallback to sensible defaults
PGUSER = os.getenv("PGUSER", "env_user")
PGPASS = os.getenv("PGPASSWORD", "env_pass")
PGHOST = os.getenv("PGHOST", "localhost")     # if running in another container, use the service name
PGPORT = int(os.getenv("PGPORT", "5433"))
PGDB   = os.getenv("PGDATABASE", "envdb")

SQLALCHEMY_URL = f"postgresql+psycopg2://{PGUSER}:{PGPASS}@{PGHOST}:{PGPORT}/{PGDB}"
engine = create_engine(SQLALCHEMY_URL, future=True)

DATA_PATH = Path("data") / "water_quality_samples.csv"  # adjust to your CSV
SCHEMA = "wq"   # a tidy schema for this exercise


In [4]:
#Generate synthetic dataset
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from pathlib import Path

np.random.seed(42)
n = 1000

stations = [
    ("Willamette_001", 44.95, -123.03),
    ("Columbia_002", 45.64, -122.75),
    ("Rogue_003", 42.43, -122.85),
    ("Deschutes_004", 44.06, -121.31),
    ("Umpqua_005", 43.40, -123.32),
]

parameters = ["Nitrate", "Phosphate", "Dissolved_Oxygen", "pH", "Temperature"]
units = {"Nitrate": "mg/L", "Phosphate": "mg/L", "Dissolved_Oxygen": "mg/L", "pH": "pH", "Temperature": "°C"}

start_date = datetime(2024, 1, 1)
end_date = datetime(2025, 1, 1)
date_range = [start_date + timedelta(days=int(x)) for x in np.random.randint(0, 365, n)]

rows = []
for i in range(n):
    station, lat, lon = stations[np.random.randint(0, len(stations))]
    param = np.random.choice(parameters)
    val = round(abs(np.random.normal(loc=5 if param != "pH" else 7, scale=2)), 2)
    rows.append([
        i + 1,
        station,
        lat + np.random.normal(0, 0.01),
        lon + np.random.normal(0, 0.01),
        date_range[i].strftime("%Y-%m-%d"),
        param,
        val,
        units[param],
    ])

df = pd.DataFrame(rows, columns=["id","station","lat","lon","sample_date","parameter","value","unit"])
df.head()


Unnamed: 0,id,station,lat,lon,sample_date,parameter,value,unit
0,1,Rogue_003,42.425925,-122.85602,2024-04-12,Dissolved_Oxygen,4.7,mg/L
1,2,Umpqua_005,43.405298,-123.305584,2024-12-14,Nitrate,6.52,mg/L
2,3,Rogue_003,42.436438,-122.84316,2024-09-27,Nitrate,5.9,mg/L
3,4,Rogue_003,42.424916,-122.862418,2024-04-16,Nitrate,4.71,mg/L
4,5,Willamette_001,44.951137,-123.035945,2024-03-12,pH,6.76,pH


In [6]:
#Export dataset to CSV
DATA_PATH = Path("data")
DATA_PATH.mkdir(exist_ok=True)

csv_path = DATA_PATH / "water_quality_samples.csv"
df.to_csv(csv_path, index=False)

csv_path, df.shape


(WindowsPath('data/water_quality_samples.csv'), (1000, 8))