In [None]:
import duckdb

# Load the `sql` IPython extension; the %sql / %%sql magics used in this notebook come from it
%load_ext sql

# Initialize 🦆 DuckDB connection (no database path is passed to connect())
conn = duckdb.connect()

# Hand the connection to the SQL magic under the alias "duckdb", then import the
# database stored at the relative path ../../data/nps.
# NOTE(review): the queries below read from the `nps_public_data` schema, which is
# presumably created by this import; confirm against the exported database.
%sql conn --alias duckdb
%sql IMPORT DATABASE '../../data/nps';

The basics

In [None]:
%%sql
-- Count National Parks per value of the `states` column and keep the 20 largest groups.
-- `states` is grouped as a whole string, so a value that lists several states
-- forms its own group instead of counting toward each state.
SELECT
    states,
    COUNT(*) AS num_parks
FROM nps_public_data.parks
WHERE designation = 'National Park'
GROUP BY states
-- Break ties on `states` so the rows kept by LIMIT are the same on every run.
ORDER BY num_parks DESC, states
LIMIT 20;

In [None]:
%%sql
-- Split each park's `states` value on commas into one row per state,
-- then count National Parks per individual state (top 10).
WITH park_states AS (
    SELECT
        fullname,
        UNNEST(SPLIT(states, ',')::string[]) AS state
    FROM nps_public_data.parks
    WHERE designation = 'National Park'
)
SELECT
    state,
    COUNT(*) AS num_parks
FROM park_states
GROUP BY state
-- Largest counts first; ties are broken alphabetically by state.
ORDER BY num_parks DESC, state
LIMIT 10

How do we find the campgrounds with the fewest and the most sites using aggregations?

In [None]:
%%sql
-- Campgrounds inside National Parks with the fewest and the most total sites.
-- total_sites = first-come-first-serve sites + reservable sites.
WITH park_campgrounds AS (
    -- DISTINCT collapses repeated (campground, park, total_sites) rows
    -- so that each campground is listed once.
    SELECT DISTINCT
        c.name AS campground_name,
        p.fullname AS park_name,
        c.numberofsitesfirstcomefirstserve + c.numberofsitesreservable AS total_sites
    FROM nps_public_data.campgrounds c
    INNER JOIN nps_public_data.parks p
        ON c.parkcode = p.parkcode
        AND p.designation = 'National Park'
), min_max_sites AS (
    -- Campgrounds with a total of zero sites are ignored when finding the extremes.
    SELECT
        MIN(total_sites) AS min_sites,
        MAX(total_sites) AS max_sites
    FROM park_campgrounds
    WHERE total_sites > 0
)
SELECT
    pc.campground_name,
    pc.total_sites AS num_sites,
    -- Every row kept by the join matches either the minimum or the maximum.
    CASE pc.total_sites WHEN mms.min_sites THEN 'least' ELSE 'most' END AS sites_rank
FROM park_campgrounds pc
INNER JOIN min_max_sites mms
    ON (pc.total_sites = mms.min_sites OR pc.total_sites = mms.max_sites)
ORDER BY num_sites, campground_name

What about the parks?

In [None]:
%%sql
-- National Parks with the fewest and the most campsites, summed over their campgrounds.
WITH park_campgrounds AS (
    -- DISTINCT collapses repeated (campground, park, total_sites) rows
    -- so that a campground's sites are not summed twice.
    SELECT DISTINCT
        c.name AS campground_name,
        p.fullname AS park_name,
        c.numberofsitesfirstcomefirstserve + c.numberofsitesreservable AS total_sites
    FROM nps_public_data.campgrounds c
    INNER JOIN nps_public_data.parks p
        ON c.parkcode = p.parkcode
        AND p.designation = 'National Park'
), park_sites AS (
    -- One row per park. No ORDER BY here: only the ordering of the final
    -- SELECT determines the order of the displayed result.
    SELECT
        park_name,
        SUM(total_sites) AS num_sites
    FROM park_campgrounds
    GROUP BY park_name
), min_max_sites AS (
    -- Zero totals are not filtered out, so a park whose campgrounds sum to
    -- zero sites is reported as 'least'.
    SELECT
        MIN(num_sites) AS min_sites,
        MAX(num_sites) AS max_sites
    FROM park_sites
)
SELECT
    ps.*,
    CASE ps.num_sites WHEN mms.min_sites THEN 'least' ELSE 'most' END AS sites_rank
FROM park_sites ps
INNER JOIN min_max_sites mms
    ON (ps.num_sites = mms.min_sites OR ps.num_sites = mms.max_sites)
-- park_name breaks ties so parks sharing an extreme appear in a stable order.
ORDER BY ps.num_sites DESC, ps.park_name

Other basic aggregations

In [None]:
%%sql
-- Per-park campground summary for National Parks, top 10 by campground count.
-- Read more about aggregates here: https://duckdb.org/docs/sql/aggregates
SELECT
    p.fullname AS park_name,
    -- COUNT the number of distinct campground names
    COUNT(DISTINCT c.name) AS num_campgrounds,
    -- Get the average number of sites. What is this returning?
    -- (The averages run over every joined row, while the count above uses DISTINCT.)
    ROUND(AVG(c.numberofsitesreservable), 2) AS avg_sites_reservable,
    ROUND(AVG(c.numberofsitesfirstcomefirstserve), 2) AS avg_sites_fcfs,
    ROUND(
        AVG(c.numberofsitesreservable + c.numberofsitesfirstcomefirstserve),
        2
    ) AS avg_total_sites
FROM nps_public_data.campgrounds c
INNER JOIN nps_public_data.parks p
    ON c.parkcode = p.parkcode
WHERE p.designation = 'National Park'
GROUP BY p.fullname
ORDER BY num_campgrounds DESC
LIMIT 10

In [None]:
%%sql
-- CASE WHEN inside COUNT counts only the rows that meet a condition:
-- a CASE with no ELSE yields NULL for the other rows, and COUNT skips NULLs.
SELECT
    p.name AS park_name,
    COUNT(
        CASE WHEN c.numberofsitesreservable > 0 THEN 1 END
    ) AS num_reservable_campgrounds,
    COUNT(
        CASE WHEN c.numberofsitesfirstcomefirstserve > 0 THEN 1 END
    ) AS num_fcfs_campgrounds
FROM nps_public_data.campgrounds c
INNER JOIN nps_public_data.parks p
    ON c.parkcode = p.parkcode
WHERE p.designation = 'National Park'
GROUP BY p.name
ORDER BY num_reservable_campgrounds DESC
LIMIT 5

There's lots more to aggregations, but they're pretty simple. The best way to get started will be to aggregate data in practice. Remember the basics:
- Aggregations collapse rows.
- Columns that _aren't_ being aggregated must appear in the `GROUP BY` clause.
- The `GROUP BY` clause comes after `FROM`/`WHERE` and before `HAVING`, `ORDER BY`, and `LIMIT`.
- Duplicate rows may skew aggregates if not properly accounted for.