# Calculate statistics per district using DuckDB

**Author**: Willeke A'Campo

**Description:** This notebook shows how to calculate the Ecosystem Service statistics per district using DuckDB. The results are stored in a new table in the database and exported to GeoJSON.

**Documentation:** 

### Data conversion | GeoJSON to GeoParquet   

In [1]:
import geopandas as gpd
from shapely.geometry import Point
from shapely.wkb import loads
import pyarrow
import os
import leafmap
import os
import duckdb
import pandas as pd

# Municipality to process and the directories holding the raw GeoJSON layers
# and their GeoParquet copies.
municipality = "kristiansand"
raw_dir = r"/workspaces/urban-climate/data/01_raw"
interim_dir = r"/workspaces/urban-climate/data/02_intermediate"

# File stems of the input layers (shared by the .geojson and .parquet files)
file_names = [
    f"{municipality}_study_area", 
    f"{municipality}_districts",
    f"{municipality}_bldg",
    f"{municipality}_res_bldg",
    f"{municipality}_green_space",
    f"{municipality}_open_space",
    f"{municipality}_public_open_space",
    f"{municipality}_private_open_space",
    f"{municipality}_tree_crowns"
    ]

# DuckDB table names, listed in the same order as `file_names`
table_names = [
    "study_area", "districts", "bldg", "res_bldg", "green_space",
    "open_space", "public_open_space", "private_open_space", "tree_crowns"
    ]

# Map every file stem to the path of its GeoParquet file
parquet_dict = {
    name: os.path.join(interim_dir, f"{name}.parquet") 
    for name in file_names}

# Convert only the layers whose GeoParquet file is missing. Each GeoJSON is
# read at most once and existing parquet files are left untouched (previously
# one missing file triggered re-reading and rewriting every layer).
os.makedirs(interim_dir, exist_ok=True)
for name, parquet_path in parquet_dict.items():
    if os.path.exists(parquet_path):
        continue

    gdf = gpd.read_file(os.path.join(raw_dir, f"{name}.geojson"))
    gdf.to_parquet(
        path = parquet_path,
        index = None, 
        compression = "snappy"
    )

In [2]:
# Open an in-memory DuckDB database and enable the spatial extension
con = duckdb.connect(database=":memory:", read_only=False)
con.install_extension("spatial")
con.load_extension("spatial")

# Load each parquet file into its own table. `parquet_dict` and `table_names`
# are paired by position. Besides the original columns, every table gets an
# extra (unaliased) column holding the geometry parsed from WKB.
for parquet_path, table in zip(parquet_dict.values(), table_names):
    create_table_sql = f"""
        CREATE TABLE {table} 
        AS SELECT *, ST_GeomFromWKB(geometry) 
        FROM parquet_scan('{parquet_path}')
        """
    con.execute(create_table_sql)


# List the tables that now exist in the 'main' schema
list_tables_sql = """
    SELECT table_name 
    FROM information_schema.tables 
    WHERE table_schema = 'main'
    """
result = con.execute(list_tables_sql)

print(result.fetchall())

[('study_area',), ('districts',), ('bldg',), ('res_bldg',), ('green_space',), ('open_space',), ('public_open_space',), ('private_open_space',), ('tree_crowns',)]


In [3]:
# Check if the 'tree_crowns' table exists
table_exists = "tree_crowns" in [
    row[0]
    for row in con.execute(
        """
        SELECT table_name 
        FROM information_schema.tables 
        WHERE table_schema = 'main'
        """
    ).fetchall()
]

if table_exists:
    # Create the centre-point table in DuckDB. The centroid of every crown is
    # computed once in the CTE and reused for the X, Y and geometry columns.
    # CREATE OR REPLACE keeps the cell re-runnable on the same connection.
    con.execute(
        """
        CREATE OR REPLACE TABLE tree_crowns_xy AS
        WITH centroids AS (
            SELECT ST_Centroid(ST_GeomFromWKB(geometry)) AS centroid
            FROM tree_crowns
        )
        SELECT 
            ST_X(centroid) AS X, 
            ST_Y(centroid) AS Y,
            ST_Point(ST_X(centroid), ST_Y(centroid)) AS geometry
        FROM centroids"""
    )

    # Read the coordinates back from the new table so the GeoDataFrame and
    # the DuckDB table are built from the same values.
    df = con.execute("SELECT X, Y FROM tree_crowns_xy").df()

    # Centre points as a GeoDataFrame; the coordinates are tagged as
    # EPSG:25832, the CRS this notebook uses for the source layers.
    xy_crowns = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df.X, df.Y),
        crs="EPSG:25832"
        )

In [4]:
# Read the study area for mapping, reprojected from EPSG:25832 to WGS84
gdf_study_area = leafmap.read_parquet(
    parquet_dict[f"{municipality}_study_area"], 
    return_type='gdf', 
    src_crs="EPSG:25832", 
    dst_crs="EPSG:4326"
    )

# Draw a reproducible sample of the crown centre points and convert it to
# WGS84. The fraction is defined once so the printed message and the legend
# always state the fraction that is actually sampled.
sample_frac = 0.05
sample_label = f"{sample_frac:.0%} sample"
xy_crowns_sample = xy_crowns.sample(frac=sample_frac, random_state=42)
trees_xy = xy_crowns_sample.to_crs("EPSG:4326")
points_geojson = trees_xy.__geo_interface__
print(f"Map the tree crown center points ({sample_label}).")
print(trees_xy.head(2))

# Calculate the center of the study_area GeoDataFrame
center = gdf_study_area.geometry.unary_union.centroid

# --------------------------------------------------
# INIT MAP
# --------------------------------------------------
# NOTE: `map` shadows the Python builtin of the same name; the name is kept
# because other cells may refer to it.
map = leafmap.Map()
# center
map.set_center(center.x, center.y, zoom=14)
# add Basemap
map.add_basemap("CartoDB.Positron")

# add study area as vector layer
map.add_gdf(
        gdf_study_area, 
        layer_name="study_area", 
        get_fill_color=[0, 0, 255, 128]
        )

# add the sampled tree crown centre points
map.add_gdf(
    trees_xy,
    layer_name ="trees",
    color= "black"
)

# Legend colours match the layers: blue study area, black tree points
map.add_legend(
                legend_title="Legend", 
                legend_dict={
                    "Study area": "blue", 
                    f"Tree crowns ({sample_label})": "black"
                    }
                )

map

Map the tree crown center points (10% sample).
                  X             Y                  geometry
6264  441689.446294  6.446823e+06  POINT (8.00910 58.15884)
3588  440649.588828  6.445523e+06  POINT (7.99176 58.14703)


Map(center=[58.15207603451838, 8.004060328655429], controls=(ZoomControl(options=['position', 'zoom_in_text', …

### DuckDB Connection

You can use the following commands to load your data into DuckDB database
- `con.sql` executes a SQL query against the DuckDB database (e.g. to write data to it)
- `con.execute` executes a SQL query and returns the connection, from which the result can be fetched (e.g. with `.fetchall()`)
   - con.execute("CREATE TABLE ...") to create a new table
   - con.execute("INSERT INTO ...") to insert data into a table
   - con.execute("COPY ... FROM ...") to load data from a file into a table
- `gdf.to_sql` to write a GeoDataFrame to a table in the DuckDB database

con.execute("""
COPY table_name 
FROM 'path_to_your_file.parquet' 
WITH (FORMAT 'PARQUET')
""")

**Create a new Table with Tree Crown Center Points**

**Split open space, private open space and public open space by district**

In [5]:

# List the column names of the districts table; the name is the second
# field of every row returned by PRAGMA table_info.
columns = con.execute("PRAGMA table_info(districts)").fetchall()
column_names = [info[1] for info in columns]
for column_name in column_names:
    print(column_name)


OBJECTID
fylkesnummer
fylkesnavn
kommunenummer
kommunenavn
delomradenummer
delomradenavn
grunnkretsnummer
grunnkretsnavn
kilde_admin
kilde_befolkning
id_befolkning
year_pop_stat
pop_total
pop_elderly
a_district
a_unit
a_clipped
SHAPE_Length
SHAPE_Area
geometry
st_geomfromwkb(geometry)


In [6]:
# Create (or rebuild) the table split_open_space: open space split by
# district boundaries. Every intersecting (district, open space) pair yields
# one row with the district id and the intersection geometry.
# CREATE OR REPLACE keeps the cell re-runnable on the same connection.
con.execute(
    """
    CREATE OR REPLACE TABLE split_open_space AS 
    SELECT
        districts.grunnkretsnummer,
        ST_Intersection(ST_GeomFromWKB(districts.geometry), ST_GeomFromWKB(open_space.geometry)) AS geom
    FROM 
        districts
    JOIN 
        open_space
    ON
        ST_Intersects(ST_GeomFromWKB(districts.geometry), ST_GeomFromWKB(open_space.geometry));
    """
    )

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x7f2864390030>

In [7]:
# Create (or rebuild) the table split_bldg: buildings split by district
# boundaries. Every intersecting (district, building) pair yields one row
# with the district id and the intersection geometry.
# CREATE OR REPLACE keeps the cell re-runnable on the same connection.
con.execute(
    """
    CREATE OR REPLACE TABLE split_bldg AS 
    SELECT
        districts.grunnkretsnummer,
        ST_Intersection(ST_GeomFromWKB(districts.geometry), ST_GeomFromWKB(bldg.geometry)) AS geom
    FROM 
        districts
    JOIN 
        bldg
    ON
        ST_Intersects(ST_GeomFromWKB(districts.geometry), ST_GeomFromWKB(bldg.geometry));
    """
    )

<duckdb.duckdb.DuckDBPyConnection at 0x7f2864390030>

In [8]:
# Create (or rebuild) the table split_res_bldg: residential buildings split
# by district boundaries. Every intersecting (district, residential building)
# pair yields one row with the district id and the intersection geometry.
# CREATE OR REPLACE keeps the cell re-runnable on the same connection.
con.execute(
    """
    CREATE OR REPLACE TABLE split_res_bldg AS 
    SELECT
        districts.grunnkretsnummer,
        ST_Intersection(ST_GeomFromWKB(districts.geometry), ST_GeomFromWKB(res_bldg.geometry)) AS geom
    FROM 
        districts
    JOIN 
        res_bldg
    ON
        ST_Intersects(ST_GeomFromWKB(districts.geometry), ST_GeomFromWKB(res_bldg.geometry));
    """
    )

<duckdb.duckdb.DuckDBPyConnection at 0x7f2864390030>

In [9]:
# Show the column names of split_open_space (second field of each
# PRAGMA table_info row)
columns = con.execute("PRAGMA table_info(split_open_space)").fetchall()
for column_info in columns:
    column_name = column_info[1]
    print(column_name)

# Collect the names of all tables in the database
tables = con.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()

# Peek at one row of every table
for (table_name, *_) in tables:
    first_row = con.execute(f"SELECT * FROM {table_name} LIMIT 1").fetchone()
    print(f"First row of {table_name}: {first_row}")

grunnkretsnummer
geom
First row of split_res_bldg: ('42040919', b"\x02\x04I\x00\x00\x00\x00\x00\xcd\x9e\xd7H\x18\xc2\xc4J\xc4\xa1\xd7H@\xc2\xc4J\x02\x00\x00\x00\x01\x00\x00\x00\x12\x00\x00\x00\x00\x00\x00\x00P@\x13\xe1\xf8\xf3\x1aA\x1c\r\xe0\xf1G\x98XAp\xc5\xfe2\xfc\xf3\x1aA\xfc\x87\xf4\xbfG\x98XA\xd0+e\x99\xff\xf3\x1aA\xc0}\x1d\x8cG\x98XA \r\xe0\xad\xfe\xf3\x1aA\xd4+e}G\x98XAp\x9c\xa2\xa3\x0f\xf4\x1aA\xc0}\x1d|F\x98XA@\x92\xcb\xff\x1c\xf4\x1aA\xd4+e]G\x98XA \xe4\x83\x1e(\xf4\x1aADio\xb4F\x98XA\x90!\x8e\xf5/\xf4\x1aA\xd4+e=F\x98XA@iop8\xf4\x1aA\xcaT\xc1\xbcE\x98XA\xc0}\x1d\xb8\x10\xf4\x1aA\xde\x02\t\x1eC\x98XAp\xc5\xfe2\xff\xf3\x1aAx\x9c\xa2'D\x98XA@\x92\xcb\xff\xfc\xf3\x1aA0\xbb'\x03D\x98XA\x00\x88\xf4[\xf5\xf3\x1aAn\xc5\xfevD\x98XA0\xbb'\x8f\xf7\xf3\x1aA\xc0}\x1d\x9cD\x98XA\xc0}\x1d\xb8\xd9\xf3\x1aA\x06_\x98`F\x98XA\xe0\x02\t\n\xee\xf3\x1aAx\x9c\xa2\xb7G\x98XA\xf0\xb0P\xeb\xf1\xf3\x1aA\xcaT\xc1|G\x98XAP@\x13\xe1\xf8\xf3\x1aA\x1c\r\xe0\xf1G\x98XA")
First row of split_bldg: ('42040807'

### Generate Columns with Count Statistics

| Name | Alias | Description | Type |  Unit | 
| --- | --- | --- | --- | --- |
| n_trees | Antall trær | Number of trees in the district | INT | |
| n_bldg | Antall bygninger | Number of buildings in the district | INT | |
| n_res_bldg | Antall boliger | Number of residential buildings in the district | INT | |
| n_res_bldg_near_gs | Antall boliger nær grøntområde (300 m) | Number of residential buildings near green space (300 m) | INT | |
| n_trees_near_bldg | Antall trær nær boliger (15 m) | Number of trees near residential buildings (15 m) | INT | |
| n_viewshed | Antall viewshed piksler | Number of viewshed pixels that intersect with the building edge | INT | |



In [10]:
# Add the count columns to the districts table.
# IF NOT EXISTS makes the cell safe to re-run: a plain ADD COLUMN raises
# once the column is already present.
count_columns = [
    "n_trees",
    "n_bldg",
    "n_res_bldg",
    "n_res_bldg_near_gs",
    "n_trees_near_bldg",
]

for column_name in count_columns:
    con.execute(
        f"ALTER TABLE districts ADD COLUMN IF NOT EXISTS {column_name} INTEGER"
    )

<duckdb.duckdb.DuckDBPyConnection at 0x7f2864390030>

In [11]:
# Show the column names of split_bldg (second field of each
# PRAGMA table_info row)
columns = con.execute("PRAGMA table_info(split_bldg)").fetchall()
column_names = [info[1] for info in columns]
for column_name in column_names:
    print(column_name)

grunnkretsnummer
geom


In [28]:
# Each query below returns (grunnkretsnummer, count) rows, one per district
# that has at least one match; districts without matches are absent.

# COUNT number of trees (crown centre points) per district
n_trees = con.execute(
    """
    SELECT districts.grunnkretsnummer, COUNT(*) 
    FROM tree_crowns_xy 
    JOIN districts ON ST_Within(tree_crowns_xy.geometry, ST_GeomFromWKB(districts.geometry))
    GROUP BY districts.grunnkretsnummer
    """
    ).fetchall()

# COUNT number of buildings per district
n_bldg = con.execute(
    """
    SELECT districts.grunnkretsnummer, COUNT(*) 
    FROM split_bldg 
    JOIN districts ON ST_Within(split_bldg.geom, ST_GeomFromWKB(districts.geometry))
    GROUP BY districts.grunnkretsnummer
    """
    ).fetchall()

# COUNT number of residential buildings per district 
n_res_bldg = con.execute(
    """
    SELECT districts.grunnkretsnummer, COUNT(*) 
    FROM split_res_bldg 
    JOIN districts ON ST_Within(split_res_bldg.geom, ST_GeomFromWKB(districts.geometry))
    GROUP BY districts.grunnkretsnummer
    """
    ).fetchall()

# COUNT number of res buildings WITHIN 300m distance of green space
n_res_bldg_near_gs = con.execute(
    """
    SELECT districts.grunnkretsnummer, COUNT(*) 
    FROM res_bldg 
    JOIN districts ON ST_Within(ST_GeomFromWKB(res_bldg.geometry), ST_GeomFromWKB(districts.geometry))
    WHERE res_bldg.geometry IS NOT NULL AND districts.geometry IS NOT NULL AND EXISTS (
        SELECT 1
        FROM green_space
        WHERE green_space.geometry IS NOT NULL AND ST_DWithin(ST_GeomFromWKB(res_bldg.geometry), ST_GeomFromWKB(green_space.geometry), 300)
    )
    GROUP BY districts.grunnkretsnummer
    """
    ).fetchall()

# COUNT number of trees crowns WITHIN 15m distance of res buildings
n_trees_near_bldg = con.execute(
    """
    SELECT districts.grunnkretsnummer, COUNT(*) 
    FROM tree_crowns
    JOIN districts ON ST_Within(ST_GeomFromWKB(tree_crowns.geometry), ST_GeomFromWKB(districts.geometry))
    WHERE EXISTS (
        SELECT 1
        FROM res_bldg
        WHERE ST_DWithin(ST_GeomFromWKB(tree_crowns.geometry), ST_GeomFromWKB(res_bldg.geometry), 15)
        AND ST_Within(ST_GeomFromWKB(res_bldg.geometry), ST_GeomFromWKB(districts.geometry))
    )
    GROUP BY districts.grunnkretsnummer
    """
    ).fetchall()

# Update districts table.
# Values are passed as bound parameters instead of being formatted into the
# SQL text, so the district id is compared with the type it was fetched with.
# Only the column name is interpolated, and it comes from the fixed mapping
# below. The loop variable is named `district_id` so the builtin `id` is not
# overwritten for the rest of the notebook.
district_counts = {
    "n_trees": n_trees,
    "n_bldg": n_bldg,
    "n_res_bldg": n_res_bldg,
    "n_res_bldg_near_gs": n_res_bldg_near_gs,
    "n_trees_near_bldg": n_trees_near_bldg,
}

for column_name, rows in district_counts.items():
    update_sql = (
        f"UPDATE districts SET {column_name} = ? WHERE grunnkretsnummer = ?"
    )
    for district_id, count in rows:
        con.execute(update_sql, [count, district_id])
    
# print columns districts
columns = con.execute("PRAGMA table_info(districts)").fetchall()
for column in columns:
    print(column[1])

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

OBJECTID
fylkesnummer
fylkesnavn
kommunenummer
kommunenavn
delomradenummer
delomradenavn
grunnkretsnummer
grunnkretsnavn
kilde_admin
kilde_befolkning
id_befolkning
year_pop_stat
pop_total
pop_elderly
a_district
a_unit
a_clipped
SHAPE_Length
SHAPE_Area
geometry
st_geomfromwkb(geometry)
n_trees
n_bldg
n_res_bldg
n_res_bldg_near_gs
n_trees_near_bldg


In [38]:
# Replace missing counts with 0. A count column is NULL for every district
# that had no matching rows in the corresponding count query. All five count
# columns are handled in one statement, including n_trees, which was
# previously left as NULL.
con.execute(
    """
    UPDATE districts
    SET
        n_trees = COALESCE(n_trees, 0),
        n_bldg = COALESCE(n_bldg, 0),
        n_res_bldg = COALESCE(n_res_bldg, 0),
        n_res_bldg_near_gs = COALESCE(n_res_bldg_near_gs, 0),
        n_trees_near_bldg = COALESCE(n_trees_near_bldg, 0)
    """
)

<duckdb.duckdb.DuckDBPyConnection at 0x7f2864390030>

In [39]:
# Export districts to a DataFrame with DuckDB's own converter (pandas'
# read_sql only supports SQLAlchemy connectables and sqlite3 connections).
df = con.execute("SELECT * FROM districts").df()

# The geometry column holds binary WKB (not hex strings). Normalise every
# value to `bytes` before parsing, in case the converter hands back another
# bytes-like type; anything that is not bytes-like becomes a missing geometry.
wkb_values = df["geometry"].map(
    lambda wkb: bytes(wkb)
    if isinstance(wkb, (bytes, bytearray, memoryview))
    else None
)

# Convert geometry column from WKB to shapely geometry.
# NOTE(review): EPSG:25832 is the CRS this notebook uses for the source
# layers; confirm it also applies to the districts layer.
df["geometry"] = gpd.GeoSeries.from_wkb(wkb_values, crs="EPSG:25832")

# Convert DataFrame to GeoDataFrame and show the districts with the fewest
# buildings first
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:25832")
gdf_sorted = gdf.sort_values(by='n_bldg', ascending=True)
display(gdf_sorted.head())

Unnamed: 0,OBJECTID,fylkesnummer,fylkesnavn,kommunenummer,kommunenavn,delomradenummer,delomradenavn,grunnkretsnummer,grunnkretsnavn,kilde_admin,...,a_clipped,SHAPE_Length,SHAPE_Area,geometry,st_geomfromwkb(geometry),n_trees,n_bldg,n_res_bldg,n_res_bldg_near_gs,n_trees_near_bldg
8,9,42,Agder,4204,Kristiansand,420409,Lund - Sødal,42040920,Marviksletta,SSB 2023 - standard for delområde og grunnkret...,...,18909.55,751.675804,18909.562023,"POLYGON ((442609.572 6446163.086, 442569.861 6...",b'\x02\x04\xe0\x00\x00\x00\x00\x00\xb3\x14\xd8...,48,6,0,0,0
14,15,42,Agder,4204,Kristiansand,420409,Lund - Sødal,42040921,Lund Industriområde,SSB 2023 - standard for delområde og grunnkret...,...,24840.04,967.899316,24840.047288,"MULTIPOLYGON (((442713.730 6446499.812, 442782...",b'\x05\x04\xe9\x00\x00\x00\x00\x00\xcc\x14\xd8...,40,9,0,0,0
19,20,42,Agder,4204,Kristiansand,420411,Kongsgård Øvre - Gimlekollen,42041102,Kongsgard 1 - Vige,SSB 2023 - standard for delområde og grunnkret...,...,22397.41,909.288579,22397.408197,"MULTIPOLYGON (((442851.419 6447227.389, 442780...",b'\x05\x04\x8a\x00\x00\x00\x00\x00\x90.\xd8H\x...,166,9,0,0,0
30,31,42,Agder,4204,Kristiansand,420411,Kongsgård Øvre - Gimlekollen,42041104,Vestre Bjørndalen,SSB 2023 - standard for delområde og grunnkret...,...,52049.94,1294.367401,52049.9384,"MULTIPOLYGON (((441799.568 6447306.628, 441731...",b'\x05\x04\xfa\x00\x00\x00\x00\x00m\xb0\xd7H\x...,36,20,2,2,4
27,28,42,Agder,4204,Kristiansand,420409,Lund - Sødal,42040919,Gimle,SSB 2023 - standard for delområde og grunnkret...,...,184103.51,2619.449809,184103.51217,"POLYGON ((441249.940 6447082.561, 441243.394 6...","b'\x02\x04\xc4\x00\x00\x00\x00\x00Og\xd7H,\xbd...",1175,21,2,2,18


### Generate Columns with Area Statistics

| Name | Alias | Description | Type |  Unit |
| --- | --- | --- | --- | --- |
| a_district | Grunnkretsareal | Area of the district | FLOAT | m2 |
| a_open_space | Åpent område | Area of open space | FLOAT | m2 |
| a_private_space | Privat område | Area of private space | FLOAT | m2 |
| a_public_space | Offentlig område | Area of public space | FLOAT | m2 |
| a_green_space | Grøntområde | Area of green space | FLOAT | m2 |
| a_crown | Kroneareal | Crown coverage area within the district | FLOAT | m2 |
| a_crown_public | Kroneareal i offentlig område | Crown coverage area within public space | FLOAT | m2 |
| a_crown_private | Kroneareal i privat område | Crown coverage area within private space | FLOAT | m2 |
