## Polygon based filtration
Here I draw polygons around selected port terminals and keep only the rows whose coordinates fall inside them, reducing the data further.

In [None]:
import pandas as pd
from shapely.geometry import Point, Polygon
from pathlib import Path

# ── CONFIGURATION ──
# Number of rows handed to pandas per chunk when the input CSVs are read.
CHUNK_SIZE = 900_000
# Folder that holds the input CSV files.
IN_DIR = Path(r"D:\Thesis Work MLS\Denmark AIS data\processed_files")
# Per-port results go into a sub-folder of the input folder. Only the leaf
# directory is created, so a missing IN_DIR still raises here.
OUT_DIR = IN_DIR.joinpath("ports_filtered_flat")
OUT_DIR.mkdir(parents=False, exist_ok=True)

# ── DEFINE YOUR PORT POLYGONS (lon, lat) ──
# Each ring lists (longitude, latitude) vertices and repeats its first vertex
# at the end so the outline is explicitly closed.
# NOTE(review): "Aarhaus" looks like a misspelling of "Aarhus"; the key is kept
# unchanged because it also names the output CSV for that port.
_PORT_RINGS = {
    "Aarhaus": [
        (10.2177641, 56.1759993),
        (10.2017995, 56.1264703),
        (10.2697775, 56.143784),
        (10.2610227, 56.1722725),
        (10.2177641, 56.1759993),
    ],
    "Copenhagen": [
        (12.5763894, 55.73307),
        (12.5889207, 55.6851003),
        (12.6522637, 55.6895516),
        (12.6393891, 55.736646),
        (12.5763894, 55.73307),
    ],
    "Esbjerg": [
        (8.4107708, 55.4876855),
        (8.3848499, 55.4803903),
        (8.3996128, 55.4458413),
        (8.446133, 55.4358116),
        (8.4797786, 55.4554792),
        (8.4107708, 55.4876855),
    ],
    "Fredericia": [
        (9.7232345, 55.5522225),
        (9.7699264, 55.5505717),
        (9.7761062, 55.5716379),
        (9.7566227, 55.5717835),
        (9.723492, 55.5525623),
        (9.7232345, 55.5522225),
    ],
    "Aalborg": [
        (10.0493519, 57.070164),
        (10.0181096, 57.0567239),
        (10.069608, 57.0298291),
        (10.1053135, 57.0477611),
        (10.0493519, 57.070164),
    ],
    "Kalundborg": [
        (11.0498775, 55.6901745),
        (11.0126249, 55.6667502),
        (11.0850684, 55.65203),
        (11.1192311, 55.6719783),
        (11.0498775, 55.6901745),
    ],
    "Aabenraa": [
        (9.4196162, 55.0475791),
        (9.421762, 55.0197381),
        (9.4497428, 55.0209682),
        (9.4421897, 55.0472349),
        (9.4196162, 55.0475791),
    ],
}

# Build one shapely Polygon per port, preserving the order of the table above.
port_polygons = {name: Polygon(ring) for name, ring in _PORT_RINGS.items()}

# ── PRECOMPUTE BOUNDING BOXES FOR FAST PREFILTER ──
# Polygon.bounds is (minx, miny, maxx, maxy); with (lon, lat) vertices that is
# (min lon, min lat, max lon, max lat).
port_bounds = {name: poly.bounds for name, poly in port_polygons.items()}

# ── SET UP OUTPUT WRITERS & SUMMARY COUNTERS ──
# writers[port] -> {"path": destination CSV, "first": True until the first
#                   batch (and therefore the header row) has been written}
# summary[port] -> running count of rows kept for that port
writers = {}
summary = {name: 0 for name in port_polygons}

for port in port_polygons:
    out_csv = OUT_DIR / f"{port}.csv"
    # Remove any file left over from an earlier run. A port that matches no
    # rows in this run is never opened for writing, so without this its old
    # CSV would survive and contradict the "0 rows" shown in the summary.
    if out_csv.is_file():
        out_csv.unlink()
    writers[port] = {"path": out_csv, "first": True}

# ── PROCESS EACH CSV IN CHUNKS ──
# Glob once so the announced file count and the files actually processed
# come from the same listing.
src_files = sorted(IN_DIR.glob("*.csv"))
print(f"Starting spatial filter on {len(src_files)} files…")
for src in src_files:
    print(f" ▶ Reading {src.name}")
    # No parse_dates argument is passed, so date/time columns are left as
    # strings. low_memory=False is unrelated to dates: it makes pandas infer
    # each column's dtype in one pass rather than piecewise.
    for chunk in pd.read_csv(src, chunksize=CHUNK_SIZE, low_memory=False):
        # drop rows missing coords or Ship type
        chunk = chunk.dropna(subset=["Latitude", "Longitude", "Ship type"])
        if chunk.empty:
            # Nothing left to test. This also avoids the .str accessor raising
            # when "Ship type" was entirely NaN (and so not a string column).
            continue

        # Remove all rows where Ship type is 'Undefined'
        # (case-insensitive, ignores surrounding whitespace)
        ship_type = chunk["Ship type"].str.strip().str.lower()
        chunk = chunk[ship_type != "undefined"]
        if chunk.empty:
            continue

        lon = chunk["Longitude"]
        lat = chunk["Latitude"]

        # For each port: cheap bounding-box prefilter, then exact polygon test
        for port, poly in port_polygons.items():
            minx, miny, maxx, maxy = port_bounds[port]
            # Series.between is inclusive on both ends, matching >= / <=.
            in_box = lon.between(minx, maxx) & lat.between(miny, maxy)
            candidate = chunk[in_box]
            if candidate.empty:
                continue

            # Precise geometry containment. Iterating the two coordinate
            # columns directly avoids DataFrame.apply(axis=1), which builds a
            # Series object for every row.
            inside = pd.Series(
                [
                    poly.contains(Point(x, y))
                    for x, y in zip(candidate["Longitude"], candidate["Latitude"])
                ],
                index=candidate.index,
                dtype=bool,
            )
            filtered = candidate[inside]
            if filtered.empty:
                continue

            # Write out ALL columns for rows inside this port: the first batch
            # creates the file with a header, later batches append without one.
            w = writers[port]
            if w["first"]:
                filtered.to_csv(w["path"], index=False, mode="w")
                w["first"] = False
            else:
                filtered.to_csv(w["path"], index=False, header=False, mode="a")

            summary[port] += len(filtered)

    print(f"   Done {src.name}")

# ── FINAL SUMMARY ──
# One line per port: rows kept (thousands-separated) and the destination CSV path.
print("\nRows retained per port:")
for name, kept in summary.items():
    destination = writers[name]["path"]
    print(f" • {name}: {kept:,} rows → {destination}")
