In [1]:
import pandas as pd
from pathlib import Path
from datetime import timedelta

# ── CONFIGURATION ──
TIME_WINDOW_MINUTES = 3
TIME_DELTA = timedelta(minutes=TIME_WINDOW_MINUTES)
CHUNK_SIZE = 500_000
MIN_LENGTH = 70  # drop any vessel shorter than this

INPUT_DIR = Path('.')  # folder with your port CSVs
OUTPUT_DIR = INPUT_DIR / f'thinned_{TIME_WINDOW_MINUTES}min_records'
OUTPUT_DIR.mkdir(exist_ok=True)


def clean_chunk(chunk, min_length=MIN_LENGTH):
    """Parse, validate and length-filter one chunk of position records.

    Args:
        chunk: DataFrame with at least the columns ``date_time_utc``,
            ``latitude``, ``longitude`` and ``length``. It is not modified.
        min_length: rows whose numeric ``length`` is below this value (or
            is not numeric at all) are dropped.

    Returns:
        ``(cleaned, n_malformed)`` where ``cleaned`` is a new DataFrame
        sorted by ``date_time_utc`` and ``n_malformed`` is the number of
        rows dropped because the timestamp could not be parsed or a
        coordinate was missing. Rows dropped by the length filter are not
        included in that count.
    """
    # `assign` returns a new frame, so the later filters never write into a
    # slice of the caller's data (avoids SettingWithCopyWarning).
    chunk = chunk.assign(
        date_time_utc=pd.to_datetime(
            chunk['date_time_utc'],
            utc=True,         # normalise everything to UTC
            errors='coerce',  # unparseable strings become NaT
        ),
        length=pd.to_numeric(chunk['length'], errors='coerce'),
    )

    # Count every row dropped as malformed, not only the bad timestamps,
    # so the figure matches the "bad ts/coords" label in the report.
    malformed = chunk[['date_time_utc', 'latitude', 'longitude']].isna().any(axis=1)
    chunk = chunk.loc[~malformed]

    # NaN lengths compare False here and are therefore dropped as well.
    chunk = chunk.loc[chunk['length'] >= min_length]

    # Stable sort: rows with identical timestamps keep their file order, so
    # which one survives thinning is deterministic.
    chunk = chunk.sort_values('date_time_utc', kind='mergesort')
    return chunk, int(malformed.sum())


def thin_chunk(chunk, last_kept, time_delta=TIME_DELTA):
    """Keep at most one row per MMSI per ``time_delta`` window.

    Args:
        chunk: DataFrame sorted by ``date_time_utc`` with an ``mmsi`` column.
        last_kept: dict mapping mmsi to the timestamp of the last kept row.
            It is updated in place so the window carries over between
            successive chunks of the same file.
        time_delta: minimum spacing between two kept rows of one MMSI.

    Returns:
        The kept rows as a DataFrame with the same columns (and column
        names) as ``chunk``.
    """
    keep_positions = []
    pairs = zip(chunk['mmsi'], chunk['date_time_utc'])
    for pos, (mmsi, ts) in enumerate(pairs):
        last = last_kept.get(mmsi)
        # Keep if first sighting or at least `time_delta` since last keep
        if last is None or ts >= last + time_delta:
            keep_positions.append(pos)
            last_kept[mmsi] = ts

    # Select from the original frame instead of rebuilding rows from
    # namedtuples: itertuples() renames columns that are not valid Python
    # identifiers, which would silently change the output header.
    return chunk.iloc[keep_positions]


def thin_csv(src_path, out_path, time_delta=TIME_DELTA,
             min_length=MIN_LENGTH, chunk_size=CHUNK_SIZE):
    """Stream one CSV through cleaning and thinning into ``out_path``.

    Args:
        src_path: input CSV path.
        out_path: output CSV path; replaced if it already exists.
        time_delta: thinning window passed to ``thin_chunk``.
        min_length: length threshold passed to ``clean_chunk``.
        chunk_size: number of rows read per chunk.

    Returns:
        dict with the counters ``total_rows``, ``malformed_rows`` and
        ``kept_rows``.

    NOTE(review): chunks are sorted individually, so correct thinning across
    chunk boundaries assumes the file is already roughly chronological;
    confirm this holds for the input data.
    """
    out_path = Path(out_path)
    # Remove leftovers from an earlier run: if nothing is kept this time,
    # a stale file must not be mistaken for the current result.
    if out_path.exists():
        out_path.unlink()

    stats = {'total_rows': 0, 'malformed_rows': 0, 'kept_rows': 0}
    last_kept = {}  # mmsi → Timestamp of last kept row
    first_write = True

    reader = pd.read_csv(src_path, chunksize=chunk_size, low_memory=False)
    try:
        for chunk in reader:
            stats['total_rows'] += len(chunk)

            cleaned, n_malformed = clean_chunk(chunk, min_length=min_length)
            stats['malformed_rows'] += n_malformed
            if cleaned.empty:
                continue

            kept = thin_chunk(cleaned, last_kept, time_delta=time_delta)
            if kept.empty:
                continue
            stats['kept_rows'] += len(kept)

            # Header only on the first write; later chunks are appended.
            kept.to_csv(
                out_path,
                index=False,
                header=first_write,
                mode='w' if first_write else 'a',
            )
            first_write = False
    finally:
        # Release the file handle even if a chunk fails part-way through.
        reader.close()

    return stats


for src_path in sorted(INPUT_DIR.glob("*.csv")):
    print(f"\n▶ Processing {src_path.name} (window = {TIME_WINDOW_MINUTES} min, min length = {MIN_LENGTH})")
    stats = thin_csv(src_path, OUTPUT_DIR / src_path.name)

    print(f"   Rows read: {stats['total_rows']:,}")
    print(f"   Rows malformed-dropped (bad ts/coords): {stats['malformed_rows']:,}")
    print(f"   Rows after thinning & length-filter: {stats['kept_rows']:,}")

print(f"\n✅ Finished. Thinned files in: {OUTPUT_DIR.resolve()}")



▶ Processing Bergen.csv (window = 3 min, min length = 70)
   Rows read: 3,086,168
   Rows malformed-dropped (bad ts/coords): 0
   Rows after thinning & length-filter: 266,073

▶ Processing Drammen.csv (window = 3 min, min length = 70)
   Rows read: 285,673
   Rows malformed-dropped (bad ts/coords): 0
   Rows after thinning & length-filter: 79,910

▶ Processing Haugesund.csv (window = 3 min, min length = 70)
   Rows read: 1,303,646
   Rows malformed-dropped (bad ts/coords): 0
   Rows after thinning & length-filter: 126,700

▶ Processing Kristiansand.csv (window = 3 min, min length = 70)
   Rows read: 331,660
   Rows malformed-dropped (bad ts/coords): 0
   Rows after thinning & length-filter: 80,983

▶ Processing Larvik.csv (window = 3 min, min length = 70)
   Rows read: 114,568
   Rows malformed-dropped (bad ts/coords): 0
   Rows after thinning & length-filter: 25,454

▶ Processing Moss.csv (window = 3 min, min length = 70)
   Rows read: 380,127
   Rows malformed-dropped (bad ts/coords