# Pipeline zur Anreicherung der AIS Daten

## Statische Spalten zum Datensatz hinzufügen

### Imports & Pfade

In [1]:
import polars as pl
import pandas as pd
from pathlib import Path

# ============================
# Paths
# ============================
BASE_DIR = Path("/Users/jakobschneider/Machine Learning/Data_LCC")

AIS_SHIPS_DIR = BASE_DIR / "AIS_ships_2024"
MRV_PATH = BASE_DIR / "MRV_2024.xlsx"

CLEANED_YEAR_PATH = BASE_DIR / "AIS_2024_cleaned_year.parquet"
ENRICHED_YEAR_PATH = BASE_DIR / "AIS_2024_cleaned_year_enriched_shipstatic.parquet"

monthly_ship_files = sorted(AIS_SHIPS_DIR.glob("2024_NOAA_AIS_ships_*.parquet"))

print("Ship files:", len(monthly_ship_files))
print("Cleaned input:", CLEANED_YEAR_PATH)
print("Enriched output:", ENRICHED_YEAR_PATH)

Ship files: 12
Cleaned input: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_year.parquet
Enriched output: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_year_enriched_shipstatic.parquet


### MRV laden & IMO-Set bauen (identisch zur ersten Pipeline)

In [2]:
df_mrv_pd = pd.read_excel(MRV_PATH)

MRV_IMO_COLUMN = "IMO Number"

imos_mrv = (
    df_mrv_pd[MRV_IMO_COLUMN]
    .dropna()
    .astype("int64")
    .unique()
)

imo_mrv_set = set(imos_mrv.tolist())

print(f"MRV IMOs: {len(imo_mrv_set)}")
print("Sample:", list(imo_mrv_set)[:10])

MRV IMOs: 14024
Sample: [9797632, 9863170, 9502726, 9273351, 9109512, 9699335, 9404429, 9863182, 9404431, 9502738]


### Helper function: IMO-Checkdigit

In [3]:
def is_valid_imo_digits(imo_digits: str) -> bool:
    if imo_digits is None:
        return False
    imo_digits = str(imo_digits).strip()
    if not imo_digits.isdigit() or len(imo_digits) != 7:
        return False

    digits = [int(d) for d in imo_digits]
    weighted_sum = sum(d * w for d, w in zip(digits[:6], range(7, 1, -1)))
    return (weighted_sum % 10) == digits[6]

### Ship-static Features aus AIS_ships bauen
* wieder TransceiverClass A
* IMO bereinigen + MRV-Filter
* MMSI → genau eine IMO
* Dedup auf IMO

In [4]:
lf_ships_raw = pl.scan_parquet([str(f) for f in monthly_ship_files])

wanted_cols = [
    "MMSI",
    "IMO",
    "TransceiverClass",
    "VesselType",
    "Length",
    "Width",
    "Draft",
    "Cargo",
]

available = set(lf_ships_raw.schema.keys())
ship_cols = [c for c in wanted_cols if c in available]

lf_ships = (
    lf_ships_raw
    .select([pl.col(c) for c in ship_cols])
    .filter(pl.col("TransceiverClass") == "A")
    .with_columns(
        pl.col("MMSI").cast(pl.Int64),
        pl.col("IMO").cast(pl.Utf8).alias("IMO_str"),
    )
    .with_columns(
        pl.col("IMO_str")
        .str.strip_chars()
        .str.replace_all(r"[^0-9]", "")
        .alias("IMO_digits")
    )
    .with_columns(
        pl.col("IMO_digits")
        .map_elements(is_valid_imo_digits, return_dtype=pl.Boolean)
        .alias("IMO_valid")
    )
    .filter(pl.col("IMO_valid") == True)
    .with_columns(
        pl.col("IMO_digits").cast(pl.Int32).alias("IMO")
    )
    .drop(["IMO_str", "IMO_digits", "IMO_valid"])
    .filter(pl.col("IMO").is_in(list(imo_mrv_set)))
    .with_columns([
        pl.col("Length").cast(pl.Float32, strict=False),
        pl.col("Width").cast(pl.Float32, strict=False),
        pl.col("Draft").cast(pl.Float32, strict=False),
        pl.col("VesselType").cast(pl.Utf8, strict=False),
        pl.col("Cargo").cast(pl.Utf8, strict=False),
    ])
)

  available = set(lf_ships_raw.schema.keys())


### MMSI → IMO eindeutig + Ship-static pro IMO

In [5]:
# MMSI must map to exactly one IMO
lf_valid_mmsi = (
    lf_ships
    .group_by("MMSI")
    .agg(pl.col("IMO").n_unique().alias("n_unique_imo"))
    .filter(pl.col("n_unique_imo") == 1)
    .select("MMSI")
)

lf_ships_unamb = lf_ships.join(lf_valid_mmsi, on="MMSI", how="inner")

# Deduplicate to ONE row per IMO
lf_ship_static = (
    lf_ships_unamb
    .group_by("IMO")
    .agg([
        pl.col("VesselType").first(),
        pl.col("Length").first(),
        pl.col("Width").first(),
        pl.col("Draft").first(),
        pl.col("Cargo").first(),
    ])
)

print("Ship-static schema:")
print(lf_ship_static.schema)

print("Ship-static preview:")
print(lf_ship_static.limit(5).collect())

Ship-static schema:
Schema([('IMO', Int32), ('VesselType', String), ('Length', Float32), ('Width', Float32), ('Draft', Float32), ('Cargo', String)])
Ship-static preview:


  print(lf_ship_static.schema)


shape: (5, 6)
┌─────────┬────────────┬────────┬───────┬───────┬───────┐
│ IMO     ┆ VesselType ┆ Length ┆ Width ┆ Draft ┆ Cargo │
│ ---     ┆ ---        ┆ ---    ┆ ---   ┆ ---   ┆ ---   │
│ i32     ┆ str        ┆ f32    ┆ f32   ┆ f32   ┆ str   │
╞═════════╪════════════╪════════╪═══════╪═══════╪═══════╡
│ 9425863 ┆ 70.0       ┆ 189.0  ┆ 32.0  ┆ 10.9  ┆ 70.0  │
│ 9474723 ┆ 70.0       ┆ 229.0  ┆ 32.0  ┆ null  ┆ null  │
│ 9370381 ┆ 70.0       ┆ 180.0  ┆ 28.0  ┆ 6.1   ┆ 70.0  │
│ 9457464 ┆ 70.0       ┆ 228.0  ┆ 32.0  ┆ null  ┆ null  │
│ 9312092 ┆ 70.0       ┆ 177.0  ┆ 33.0  ┆ 9.0   ┆ 70.0  │
└─────────┴────────────┴────────┴───────┴───────┴───────┘


### Join auf gereinigte Jahresdatei

In [6]:
lf_year = pl.scan_parquet(str(CLEANED_YEAR_PATH))

lf_enriched = (
    lf_year
    .with_columns(pl.col("IMO").cast(pl.Int32))
    .join(lf_ship_static, on="IMO", how="left")
)

print("Enriched schema:")
print(lf_enriched.schema)

Enriched schema:
Schema([('IMO', Int32), ('BaseDateTime', Datetime(time_unit='us', time_zone=None)), ('LAT', Float32), ('LON', Float32), ('SOG', Float32), ('COG', Float32), ('Heading', Float32), ('Status', Int8), ('VesselType', String), ('Length', Float32), ('Width', Float32), ('Draft', Float32), ('Cargo', String)])


  print(lf_enriched.schema)


### Schreiben der enriched Jahresdatei

In [7]:
lf_enriched.sink_parquet(str(ENRICHED_YEAR_PATH))
print("Written:", ENRICHED_YEAR_PATH)

Written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_year_enriched_shipstatic.parquet


### Sanity Checks

In [2]:
lf = pl.scan_parquet(str(ENRICHED_YEAR_PATH))

df_cov = (
    lf.select([
        pl.len().alias("rows"),
        pl.col("VesselType").is_not_null().mean().alias("VesselType_coverage"),
        pl.col("Length").is_not_null().mean().alias("Length_coverage"),
        pl.col("Width").is_not_null().mean().alias("Width_coverage"),
        pl.col("Draft").is_not_null().mean().alias("Draft_coverage"),
    ])
    .collect()
)

print(df_cov)
print(lf.limit(10).collect())

shape: (1, 5)
┌───────────┬─────────────────────┬─────────────────┬────────────────┬────────────────┐
│ rows      ┆ VesselType_coverage ┆ Length_coverage ┆ Width_coverage ┆ Draft_coverage │
│ ---       ┆ ---                 ┆ ---             ┆ ---            ┆ ---            │
│ u32       ┆ f64                 ┆ f64             ┆ f64            ┆ f64            │
╞═══════════╪═════════════════════╪═════════════════╪════════════════╪════════════════╡
│ 972115337 ┆ 1.0                 ┆ 0.996182        ┆ 0.989276       ┆ 0.99472        │
└───────────┴─────────────────────┴─────────────────┴────────────────┴────────────────┘
shape: (10, 13)
┌─────────┬─────────────────────┬───────────┬────────────┬───┬────────┬───────┬───────┬───────┐
│ IMO     ┆ BaseDateTime        ┆ LAT       ┆ LON        ┆ … ┆ Length ┆ Width ┆ Draft ┆ Cargo │
│ ---     ┆ ---                 ┆ ---       ┆ ---        ┆   ┆ ---    ┆ ---   ┆ ---   ┆ ---   │
│ i32     ┆ datetime[μs]        ┆ f32       ┆ f32        ┆   ┆ f32

In [4]:
# ============================================================
# Dataset Overview: Columns, Rows, Ships
# ============================================================

import polars as pl
from pathlib import Path

BASE_DIR = Path("/Users/jakobschneider/Machine Learning/Data_LCC")
ENRICHED_YEAR_PATH = BASE_DIR / "AIS_2024_cleaned_year_enriched_shipstatic.parquet"

lf = pl.scan_parquet(str(ENRICHED_YEAR_PATH))

# ------------------------------------------------------------
# 1) Columns & dtypes
# ------------------------------------------------------------
print("\n=== Columns & Data Types ===")
schema_df = (
    pl.DataFrame(
        {
            "column": list(lf.schema.keys()),
            "dtype": [str(v) for v in lf.schema.values()],
        }
    )
    .sort("column")
)
print(schema_df)

# ------------------------------------------------------------
# 2) Number of rows (AIS messages)
# ------------------------------------------------------------
n_rows = lf.select(pl.len().alias("n_rows")).collect()["n_rows"][0]

# ------------------------------------------------------------
# 3) Number of unique ships (IMO)
# ------------------------------------------------------------
n_ships = (
    lf.select(pl.col("IMO").n_unique().alias("n_unique_imo"))
    .collect()["n_unique_imo"][0]
)

print("\n=== Dataset Size ===")
print(f"Number of rows (AIS messages): {n_rows:,}")
print(f"Number of unique ships (IMO): {n_ships:,}")


=== Columns & Data Types ===
shape: (13, 2)
┌──────────────┬─────────────────────────────────┐
│ column       ┆ dtype                           │
│ ---          ┆ ---                             │
│ str          ┆ str                             │
╞══════════════╪═════════════════════════════════╡
│ BaseDateTime ┆ Datetime(time_unit='us', time_… │
│ COG          ┆ Float32                         │
│ Cargo        ┆ String                          │
│ Draft        ┆ Float32                         │
│ Heading      ┆ Float32                         │
│ …            ┆ …                               │
│ Length       ┆ Float32                         │
│ SOG          ┆ Float32                         │
│ Status       ┆ Int8                            │
│ VesselType   ┆ String                          │
│ Width        ┆ Float32                         │
└──────────────┴─────────────────────────────────┘


  "column": list(lf.schema.keys()),
  "dtype": [str(v) for v in lf.schema.values()],



=== Dataset Size ===
Number of rows (AIS messages): 972,115,337
Number of unique ships (IMO): 6,998
