In [4]:
# Day 6 — Intro to SQL (SQLite) in Jupyter
#Goals: Practice `SELECT`, `WHERE`, `ORDER BY` on an environmental dataset.

In [5]:
#Standard imports
import sqlite3
from pathlib import Path
import pandas as pd

#Pandas display option: show floats with 3 digits of precision
pd.options.display.precision = 3

#Open the notebook-local SQLite database file (it is created if it does not exist yet)
db_path = Path("env_day6.sqlite")
conn = sqlite3.connect(db_path)

print(f"Connected to SQLite at {db_path.resolve()}")

Connected to SQLite at C:\Users\chamb\env-data-analyst-oct\notebooks\env_day6.sqlite


In [6]:
#Sample water-quality records used by every query below

#Column order followed by each tuple in `data`
cols = ["sample_id", "site", "date", "parameter", "value", "units"]

data = [
    #NO3_N = nitrate as N
    (1, "Willow Creek", "2025-08-12", "NO3_N", 2.3, "mg/L"),
    (2, "Willow Creek", "2025-09-02", "NO3_N", 1.8, "mg/L"),
    (3, "Willow Creek", "2025-09-28", "NO3_N", 2.9, "mg/L"),
    (4, "Bear River", "2025-08-15", "NO3_N", 0.9, "mg/L"),
    (5, "Bear River", "2025-09-04", "NO3_N", 1.2, "mg/L"),
    (6, "Bear River", "2025-09-20", "NO3_N", 0.7, "mg/L"),
    (7, "Columbia Slough", "2025-08-10", "NO3_N", 3.1, "mg/L"),
    (8, "Columbia Slough", "2025-08-30", "NO3_N", 2.7, "mg/L"),
    (9, "Columbia Slough", "2025-09-21", "NO3_N", 3.8, "mg/L"),
    #TP = total phosphorus; a second parameter so WHERE filters have something to separate
    (10, "Willow Creek", "2025-08-12", "TP", 0.08, "mg/L"),
    (11, "Willow Creek", "2025-09-02", "TP", 0.05, "mg/L"),
    (12, "Bear River", "2025-08-15", "TP", 0.03, "mg/L"),
    (13, "Bear River", "2025-09-20", "TP", 0.04, "mg/L"),
    (14, "Columbia Slough", "2025-08-10", "TP", 0.11, "mg/L"),
    (15, "Columbia Slough", "2025-09-21", "TP", 0.09, "mg/L"),
]

#One DataFrame row per tuple; preview the first rows
df = pd.DataFrame(data=data, columns=cols)
df.head()

Unnamed: 0,sample_id,site,date,parameter,value,units
0,1,Willow Creek,2025-08-12,NO3_N,2.3,mg/L
1,2,Willow Creek,2025-09-02,NO3_N,1.8,mg/L
2,3,Willow Creek,2025-09-28,NO3_N,2.9,mg/L
3,4,Bear River,2025-08-15,NO3_N,0.9,mg/L
4,5,Bear River,2025-09-04,NO3_N,1.2,mg/L


In [7]:
#Load the DataFrame into a SQLite table named "water_quality" that the SQL cells query;
#an existing table of that name is replaced and the DataFrame index is not stored
df.to_sql(name="water_quality", con=conn, index=False, if_exists="replace")

#Sanity check: count the rows now in the table
pd.read_sql_query(sql="SELECT COUNT(*) AS m_rows FROM water_quality;", con=conn)

Unnamed: 0,m_rows
0,15


In [8]:
#Helper: run_sql() to keep results tidy
def run_sql(sql: str, params: dict | tuple | None = None):  #takes an SQL query, runs that query in SQLite DB you connect to, and returns results as Pandas DataFrame (which displays nicely in Jupyter NB)
            #sql:str=query passed as string(see below), params:... optional extra data(like filters passed into the query): can be a dict (key-value pair), a tuple(stores values in order, not by key), None means if SQL query doesnt need parameters then skip it
    """                                                     
    Execute a SQL query and return a pretty DataFrame.
    Use name (e.g. :param) or positional (?) parameters via `param`
    """
    return pd.read_sql_query(sql, conn, params=params)

#Smoke test: confirm the water_quality table can be queried through run_sql() (at most 5 rows shown)
run_sql("SELECT * FROM water_quality LIMIT 5")

Unnamed: 0,sample_id,site,date,parameter,value,units
0,1,Willow Creek,2025-08-12,NO3_N,2.3,mg/L
1,2,Willow Creek,2025-09-02,NO3_N,1.8,mg/L
2,3,Willow Creek,2025-09-28,NO3_N,2.9,mg/L
3,4,Bear River,2025-08-15,NO3_N,0.9,mg/L
4,5,Bear River,2025-09-04,NO3_N,1.2,mg/L


In [9]:
#Practice: SELECT
#1 Everything: SELECT * returns every column; LIMIT 5 caps the preview at five rows
run_sql("""
SELECT *
FROM water_quality
LIMIT 5;
""")


Unnamed: 0,sample_id,site,date,parameter,value,units
0,1,Willow Creek,2025-08-12,NO3_N,2.3,mg/L
1,2,Willow Creek,2025-09-02,NO3_N,1.8,mg/L
2,3,Willow Creek,2025-09-28,NO3_N,2.9,mg/L
3,4,Bear River,2025-08-15,NO3_N,0.9,mg/L
4,5,Bear River,2025-09-04,NO3_N,1.2,mg/L


In [10]:
#Practice: SELECT
#2 Specify columns + alias: AS renames a column in the result only (parameter -> param, value -> result_mg_L)
run_sql("""
SELECT
    sample_id,
    site,
    date,
    parameter AS param,
    value   AS result_mg_L
FROM water_quality
LIMIT 5;        
""")

Unnamed: 0,sample_id,site,date,param,result_mg_L
0,1,Willow Creek,2025-08-12,NO3_N,2.3
1,2,Willow Creek,2025-09-02,NO3_N,1.8
2,3,Willow Creek,2025-09-28,NO3_N,2.9
3,4,Bear River,2025-08-15,NO3_N,0.9
4,5,Bear River,2025-09-04,NO3_N,1.2


In [11]:
#Practice: SELECT
#3 Simple expression (convert mg/L -> ug/L by x1000) when parameter is TP
#Note: the WHERE clause already restricts rows to TP, so the CASE's ELSE NULL branch is never taken in this query
run_sql("""
SELECT
    site,
    date,
    parameter,
    value,
    CASE
        WHEN parameter = 'TP' THEN value * 1000.0
        ELSE NULL
    END AS value_ug_L
FROM water_quality
WHERE parameter = 'TP'
LIMIT 5;        
""")

Unnamed: 0,site,date,parameter,value,value_ug_L
0,Willow Creek,2025-08-12,TP,0.08,80.0
1,Willow Creek,2025-09-02,TP,0.05,50.0
2,Bear River,2025-08-15,TP,0.03,30.0
3,Bear River,2025-09-20,TP,0.04,40.0
4,Columbia Slough,2025-08-10,TP,0.11,110.0


In [12]:
#WHERE with comparisons, AND/OR, IN, BETWEEN, LIKE

#1 Equality filters combined with AND: site (bound through the named parameter :site) and parameter
#Results are ordered by date, oldest first
run_sql("""
SELECT site, date, parameter, value
FROM water_quality
WHERE site = :site AND parameter = 'NO3_N'
ORDER BY date;
""", params={"site": "Bear River"})

Unnamed: 0,site,date,parameter,value
0,Bear River,2025-08-15,NO3_N,0.9
1,Bear River,2025-09-04,NO3_N,1.2
2,Bear River,2025-09-20,NO3_N,0.7


In [13]:
#WHERE with comparisons, AND/OR, IN, BETWEEN, LIKE
#2 Numeric comparison (>) combined with AND; the threshold is bound through :min_val
#ORDER BY value DESC lists the highest result first
run_sql("""
SELECT site, date, parameter, value
FROM water_quality
WHERE parameter = 'NO3_N' AND value > :min_val
ORDER BY value DESC;
""", params={"min_val": 2.0})

Unnamed: 0,site,date,parameter,value
0,Columbia Slough,2025-09-21,NO3_N,3.8
1,Columbia Slough,2025-08-10,NO3_N,3.1
2,Willow Creek,2025-09-28,NO3_N,2.9
3,Columbia Slough,2025-08-30,NO3_N,2.7
4,Willow Creek,2025-08-12,NO3_N,2.3


In [14]:
#WHERE with comparisons, AND/OR, IN, BETWEEN, LIKE
#3 BETWEEN on ISO dates: the dates are stored as 'YYYY-MM-DD' text, so lexicographic order matches calendar order
#BETWEEN includes both endpoints (:start and :end)
run_sql("""
SELECT site, date, parameter, value
        FROM water_quality
        WHERE parameter = 'NO3_N'
            AND date BETWEEN :start AND :end
        ORDER BY date;
""", params={"start": "2025-08-01", "end": "2025-09-15"})

Unnamed: 0,site,date,parameter,value
0,Columbia Slough,2025-08-10,NO3_N,3.1
1,Willow Creek,2025-08-12,NO3_N,2.3
2,Bear River,2025-08-15,NO3_N,0.9
3,Columbia Slough,2025-08-30,NO3_N,2.7
4,Willow Creek,2025-09-02,NO3_N,1.8
5,Bear River,2025-09-04,NO3_N,1.2


In [15]:
#WHERE with comparisons, AND/OR, IN, BETWEEN, LIKE
#4 IN list (multiple sites): each list entry is its own named parameter (:s1, :s2)
run_sql("""
SELECT site, date, parameter, value
FROM water_quality
WHERE parameter = 'TP'
    AND site IN (:s1, :s2)
ORDER BY site, date;
""", params={"s1":"Willow Creek", "s2": "Columbia Slough"})

Unnamed: 0,site,date,parameter,value
0,Columbia Slough,2025-08-10,TP,0.11
1,Columbia Slough,2025-09-21,TP,0.09
2,Willow Creek,2025-08-12,TP,0.08
3,Willow Creek,2025-09-02,TP,0.05


In [16]:
#WHERE with comparisons, AND/OR, IN, BETWEEN, LIKE
#5 LIKE pattern (string match) - % = wildcard, so "%Slough" matches site names ending in "Slough"
run_sql("""
SELECT site, date, parameter, value
        FROM water_quality
        WHERE site LIKE :pat
        AND parameter = 'NO3_N'
        ORDER BY site, date;
""", params={"pat": "%Slough"})

Unnamed: 0,site,date,parameter,value
0,Columbia Slough,2025-08-10,NO3_N,3.1
1,Columbia Slough,2025-08-30,NO3_N,2.7
2,Columbia Slough,2025-09-21,NO3_N,3.8


In [17]:
#ORDER BY ascending/descending + multi-column sort
#1 Highest NO3_N first (DESC); no LIMIT, so every NO3_N row is returned
run_sql("""
SELECT site, date, value
FROM water_quality
WHERE parameter = 'NO3_N'
ORDER BY value DESC;
""")

Unnamed: 0,site,date,value
0,Columbia Slough,2025-09-21,3.8
1,Columbia Slough,2025-08-10,3.1
2,Willow Creek,2025-09-28,2.9
3,Columbia Slough,2025-08-30,2.7
4,Willow Creek,2025-08-12,2.3
5,Willow Creek,2025-09-02,1.8
6,Bear River,2025-09-04,1.2
7,Bear River,2025-08-15,0.9
8,Bear River,2025-09-20,0.7


In [19]:
#ORDER BY ascending/descending + multi-column sort
#2 Sort by site (A->Z), then by date (newest first) within each site
#There is no WHERE clause, so rows for both parameters (NO3_N and TP) are included
run_sql("""
SELECT site, date, parameter, value
FROM water_quality
ORDER BY site ASC, date DESC;
""")

Unnamed: 0,site,date,parameter,value
0,Bear River,2025-09-20,NO3_N,0.7
1,Bear River,2025-09-20,TP,0.04
2,Bear River,2025-09-04,NO3_N,1.2
3,Bear River,2025-08-15,NO3_N,0.9
4,Bear River,2025-08-15,TP,0.03
5,Columbia Slough,2025-09-21,NO3_N,3.8
6,Columbia Slough,2025-09-21,TP,0.09
7,Columbia Slough,2025-08-30,NO3_N,2.7
8,Columbia Slough,2025-08-10,NO3_N,3.1
9,Columbia Slough,2025-08-10,TP,0.11


In [None]:
#Putting it together
#SELECT + WHERE + ORDER BY + LIMIT

#Top 3 nitrate results from Bear River in late summer/fall 2025
#The site and the inclusive date window are bound as named parameters; LIMIT 3 keeps the three highest values
run_sql("""
SELECT site, date, parameter, value
FROM water_quality
WHERE parameter = 'NO3_N'
    AND site = :site
    AND date BETWEEN :start AND :end
ORDER BY value DESC
LIMIT 3;
""", params={"site": "Bear River", "start": "2025-08-01", "end": "2025-10-31"})

Unnamed: 0,site,date,parameter,value
0,Bear River,2025-09-04,NO3_N,1.2
1,Bear River,2025-08-15,NO3_N,0.9
2,Bear River,2025-09-20,NO3_N,0.7


In [None]:
#Useful Extras

#DISTINCT returns each unique site name once; ORDER BY site sorts them alphabetically
run_sql("""
SELECT DISTINCT site
FROM water_quality
ORDER BY site
""")

Unnamed: 0,site
0,Bear River
1,Columbia Slough
2,Willow Creek


In [22]:
#Useful Extras

#Export a query result to a CSV file

#Make sure the output folder exists in the working folder (no error if it is already there)
csv_folder = Path("csv")
csv_folder.mkdir(exist_ok=True)

#All TP rows, highest value first
result = run_sql("""
SELECT *
FROM water_quality
WHERE parameter = 'TP'
ORDER BY value DESC;
""")

#Destination file inside the csv folder
csv_path = csv_folder / "tp_results.csv"

#index=False keeps the DataFrame's row index out of the file
result.to_csv(csv_path, index=False)

#The f prefix must sit outside the quotes; placed inside, it is printed literally and the path is never interpolated
print(f"CSV file saved to: {csv_path.resolve()}")

#Display first few rows
result.head()


fCSV file saved to: {csv_path.resolve()}


Unnamed: 0,sample_id,site,date,parameter,value,units
0,14,Columbia Slough,2025-08-10,TP,0.11,mg/L
1,15,Columbia Slough,2025-09-21,TP,0.09,mg/L
2,10,Willow Creek,2025-08-12,TP,0.08,mg/L
3,11,Willow Creek,2025-09-02,TP,0.05,mg/L
4,13,Bear River,2025-09-20,TP,0.04,mg/L
