In [1]:
import duckdb

In [2]:
# Open the on-disk database without write access; `con` is reused by the later cells.
con = duckdb.connect(database="test.db", read_only=True)
# Smoke test: a trivial query shows the connection answers before any real work.
con.sql("SELECT 42").fetchall()  ## or con.execute("SELECT 42").fetchall()

[(42,)]

In [3]:
# Install and then load the spatial extension in one chained expression.
# execute() hands back the connection itself, so the cell still displays it.
(
    con
    .execute("INSTALL spatial")  ## or con.install_extension("spatial")
    .execute("LOAD spatial")  ## or con.load_extension("spatial")
)

<duckdb.duckdb.DuckDBPyConnection at 0x233277029f0>

In [None]:
# Read the GeoJSON file with the ST_Read table function and display the result.
cities_geojson = con.sql("SELECT * FROM ST_Read('cities.geojson')")
cities_geojson

In [None]:
# Count the rows across every file that matches the 'cities*.csv' pattern.
cities_csv_row_count = con.sql("SELECT COUNT(*) FROM 'cities*.csv'")
cities_csv_row_count

In [None]:
# Alternatives: con.sql(...).write_parquet(...) or con.sql("COPY (SELECT 42) TO 'out.parquet'")

# Export the rows of `cities` whose country is 'USA' to a Parquet file.
export_usa_cities = "COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.parquet' (FORMAT PARQUET)"
con.sql(export_usa_cities)

In [None]:
# Release the database connection now that the queries above are done.
con.close()

## *DuckDB* solution to modify

In [None]:
-- Materialise each Parquet file as a table.
-- "AS FROM ..." is DuckDB's FROM-first shorthand for "AS SELECT * FROM ...".
CREATE TABLE points AS
  FROM read_parquet('points.parquet');
CREATE TABLE boundary AS
  FROM read_parquet('boundary.parquet');

In [None]:
-- Add a Web Mercator (EPSG:3857) copy of the geometry column to both tables.
-- DuckDB's ST_Transform takes the source AND target CRS as strings; the
-- PostGIS-style two-argument form ST_Transform(geom, 3857) is not accepted.
-- NOTE(review): the source CRS is assumed to be EPSG:4326 stored in lon/lat
-- (x, y) order, hence always_xy := true -- confirm against the input files.
ALTER TABLE points ADD COLUMN geom_webmercator GEOMETRY;
UPDATE points
SET geom_webmercator = ST_Transform(geom, 'EPSG:4326', 'EPSG:3857', always_xy := true);

ALTER TABLE boundary ADD COLUMN geom_webmercator GEOMETRY;
UPDATE boundary
SET geom_webmercator = ST_Transform(geom, 'EPSG:4326', 'EPSG:3857', always_xy := true);


In [None]:
-- Distance between the Web Mercator geometries of every point/boundary pair
-- (one output row per combination of a points row and a boundary row).
-- NOTE(review): the values are in the projection's units; Web Mercator
-- stretches lengths away from the equator, so confirm that "meters" is
-- accurate enough for the area of interest.
SELECT
  pt.id,
  ST_Distance(pt.geom_webmercator, bnd.geom_webmercator) AS distance_meters
FROM points AS pt, boundary AS bnd;


## *GDAL* solution (via *Fiona* and *Shapely*) to modify

In [None]:
import fiona
from shapely.geometry import shape

# Load the boundary geometry; only the first feature in the shapefile is used.
with fiona.open("boundary.shp") as src:
    first_feature = next(iter(src))
    boundary_geom = shape(first_feature['geometry'])

In [None]:
import pandas as pd
from shapely.geometry import Point

def compute_distance_to_boundary(df_chunk: pd.DataFrame, boundary_geom) -> pd.Series:
    """Return the distance from each row's (lon, lat) point to the boundary.

    Parameters
    ----------
    df_chunk : pandas.DataFrame
        Rows to measure; must provide ``lon`` and ``lat`` columns.
    boundary_geom
        Geometry handed to ``Point.distance`` for every row.

    Returns
    -------
    pandas.Series
        One distance per row, aligned to ``df_chunk.index``.

    Notes
    -----
    No reprojection happens here: the result is expressed in whatever units
    the point coordinates and ``boundary_geom`` share.
    """
    # Walk the two coordinate columns directly rather than using
    # DataFrame.apply(axis=1), which builds a full row Series per record.
    distances = [
        Point(lon, lat).distance(boundary_geom)
        for lon, lat in zip(df_chunk['lon'], df_chunk['lat'])
    ]
    return pd.Series(distances, index=df_chunk.index, dtype='float64')

# Assuming point data is in CSV.
# Stream the points in chunks so the whole file never has to sit in memory.
# The first chunk truncates distances.csv and later chunks append to it, so
# re-running this cell replaces the file instead of stacking duplicate rows
# on top of a previous run's output.
# NOTE(review): rows are still written without a header line (header=False).
for chunk_number, chunk in enumerate(pd.read_csv("points.csv", chunksize=1000000)):
    chunk['distance'] = compute_distance_to_boundary(chunk, boundary_geom)
    chunk.to_csv(
        "distances.csv",
        mode='w' if chunk_number == 0 else 'a',
        header=False,
        index=False,
    )
