Convert a GeoJSON file from OpenAddresses into a Parquet file.

This is a run-once step; afterwards the resulting Parquet file can be loaded as sample data.

In [None]:
import gzip
import shutil
from pathlib import Path

import ibis
from ibis import _
from ibis.backends.duckdb import Backend as DuckDBBackend

In [None]:
# Paths of the decompressed GeoJSON and of the gzip archive it comes from.
p = Path("alaska-addresses.geojson")
zip_path = Path("./alaska-addresses.geojson.gz")
if not p.exists():
    # Decompress only once. Stream the bytes in chunks so the whole
    # decompressed GeoJSON never has to be held in memory at once.
    with gzip.open(zip_path, "rb") as f_in, open(p, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

# DuckDB connection with default settings; the annotation is only there to
# give editors and type checkers the concrete backend type.
conn: DuckDBBackend = ibis.duckdb.connect()
addresses = conn.read_geo(p)
# cache() so that the following cells work from the already-loaded table.
addresses = addresses.cache()
print(addresses.count())
addresses

In [None]:
# Build the cleaned-up column set; empty strings are turned into NULL.
#
# Source columns deliberately left out:
#   - district, id: always null? (would be `_.district.nullif("")` etc.)
#   - hash: don't need this (would be `("0x" + _.hash).cast("uint64")`)

# "<number> <street>", where a missing part is treated as an empty string
# and surrounding whitespace is stripped afterwards.
number_and_street = (_.number.fill_null("") + " " + _.street.fill_null("")).strip()

# Insertion order of this dict is the column order of the result.
selected_columns = {
    "state": _.region.nullif(""),
    "city": _.city.nullif(""),
    "postcode": _.postcode.nullif(""),
    "street": number_and_street.nullif(""),
    "unit": _.unit.nullif(""),
    "lon": _.geom.x(),
    "lat": _.geom.y(),
}
a = addresses.select(**selected_columns)
a

In [None]:
# Exploratory check: for every distinct address, count its rows and measure
# how far apart the coordinates of those rows are.
address_groups = a.group_by("state", "city", "postcode", "street", "unit")
coordinate_spread = address_groups.agg(
    n=_.count(),
    lat_var=_.lat.var(),
    lon_var=_.lon.var(),
    lat_min=_.lat.min(),
    lat_max=_.lat.max(),
    lon_min=_.lon.min(),
    lon_max=_.lon.max(),
)
# Show the addresses with the largest latitude variance first.
# (Alternative ordering: `_.n.desc()` to see the most-repeated addresses.)
coordinate_spread.order_by(_.lat_var.desc())

In [None]:
# Yuck: based on the high variance above, we can see that the data is not
# clean -- the same address shows up with different coordinates.
# Keep a single coordinate pair per address: the one with the smallest
# latitude. Ties on latitude are broken by longitude, so the chosen row (and
# therefore the written file) is the same on every run.
coordinate_order = [_.lat, _.lon]
a = a.group_by(
    "state",
    "city",
    "postcode",
    "street",
    "unit",
).agg(
    lon=_.lon.first(order_by=coordinate_order),
    lat=_.lat.first(order_by=coordinate_order),
)

In [None]:
# Order from changing-least-quickly to changing-most-quickly so that we are
# most likely to get long runs of the same value, which compress well.
sort_keys = [_.state, _.city, _.postcode, _.street, _.unit]
a = (
    a.mutate(
        # Number the rows inside an explicitly ordered window. A bare
        # row_number() after order_by() uses an unordered window, so the ids
        # would not be guaranteed to follow the sort order.
        record_id=ibis.row_number().over(order_by=sort_keys),
    )
    .relocate("record_id")
    # record_id increases along sort_keys, so sorting by it yields the
    # intended row order for the output.
    .order_by("record_id")
)

In [None]:
# Write the cleaned table next to this notebook as a zstd-compressed Parquet
# file; COMPRESSION_LEVEL is handed through as an extra keyword option.
output_path = "./alaska-addresses-clean.parquet"
a.to_parquet(output_path, compression="zstd", COMPRESSION_LEVEL=15)