# Preprocessing Pipeline

### Codezelle 1 – Imports, Pfade, Dateilisten, MRV laden

In [1]:
import polars as pl
import pandas as pd
from pathlib import Path

# Base directory for your data (adjust to your setup)
BASE_DIR = Path("/Users/jakobschneider/Machine Learning/Data_LCC")

# Directories for AIS logs and AIS ships
AIS_LOGS_DIR  = BASE_DIR / "AIS_logs_2024"
AIS_SHIPS_DIR = BASE_DIR / "AIS_ships_2024"

# MRV Excel file for 2024 (adjust if needed)
MRV_PATH = BASE_DIR / "MRV_2024.xlsx"

# Output directory for cleaned monthly AIS data
MONTHLY_OUTPUT_DIR = BASE_DIR / "AIS_2024_cleaned_monthly"
MONTHLY_OUTPUT_DIR.mkdir(exist_ok=True)

# List all monthly AIS log and ship files (NOAA naming)
monthly_log_files = sorted(AIS_LOGS_DIR.glob("2024_NOAA_AIS_logs_*.parquet"))
monthly_ship_files = sorted(AIS_SHIPS_DIR.glob("2024_NOAA_AIS_ships_*.parquet"))

print("Monthly AIS log files:")
for f in monthly_log_files:
    print(" -", f)

print("\nMonthly AIS ship files:")
for f in monthly_ship_files:
    print(" -", f)

print("\nMonthly output directory:", MONTHLY_OUTPUT_DIR)
print("MRV file:", MRV_PATH)

# ---- Load MRV and build IMO set ----

# Load MRV 2024 Excel with pandas
df_mrv_pd = pd.read_excel(MRV_PATH)

print("\nMRV columns:", df_mrv_pd.columns)

# Adjust the column name here if needed
MRV_IMO_COLUMN = "IMO Number"

# Extract unique IMOs from MRV as int64
imos_mrv = (
    df_mrv_pd[MRV_IMO_COLUMN]
    .dropna()
    .astype("int64")
    .unique()
)

imo_mrv_set = set(imos_mrv.tolist())

print(f"Number of unique IMOs in MRV: {len(imo_mrv_set)}")
print("Sample IMOs from MRV:", list(imo_mrv_set)[:10])

Monthly AIS log files:
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_01.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_02.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_03.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_04.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_05.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_06.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_07.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_08.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_09.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_10.parquet
 - /Users/jakobschneider/Ma

### Codezelle 2 – Helper für IMO-Checkdigit

In [2]:
# Helper function to check IMO validity (7 digits + check digit)
def is_valid_imo_digits(imo_digits: str) -> bool:
    """
    Validate IMO number using check digit.
    Input is expected as a string of exactly 7 digits.
    """
    if imo_digits is None:
        return False
    imo_digits = str(imo_digits).strip()
    if not imo_digits.isdigit():
        return False
    if len(imo_digits) != 7:
        return False

    digits = [int(d) for d in imo_digits]
    # First 6 digits * weights 7..2, last digit is check digit
    weighted_sum = sum(d * w for d, w in zip(digits[:6], range(7, 1, -1)))
    check_digit = weighted_sum % 10
    return check_digit == digits[6]

### Codezelle 3 – AIS_ships laden, IMO bereinigen, auf MRV-IMOs filtern.
* Mit TransceiverClass A Filter

In [3]:
# Load all monthly AIS ship files as one lazy frame
lf_ships_raw = pl.scan_parquet([str(f) for f in monthly_ship_files])

# Clean IMO, filter MRV IMOs, keep only TransceiverClass A
lf_ships_clean = (
    lf_ships_raw
    .select(
        [
            pl.col("MMSI"),
            pl.col("IMO"),
            pl.col("TransceiverClass"),    # <-- wichtig für den Filter
        ]
    )
    # --- NEU: Filter auf TransceiverClass A ---
    .filter(pl.col("TransceiverClass") == "A")
    
    .with_columns(
        # Ensure MMSI is Int64
        pl.col("MMSI").cast(pl.Int64).alias("MMSI"),
        # Ensure IMO is treated as string before any string operations
        pl.col("IMO")
        .cast(pl.Utf8)
        .alias("IMO_str")
    )
    .with_columns(
        # Remove non-digits from IMO, e.g. "IMO1234567" -> "1234567"
        pl.col("IMO_str")
        .str.strip_chars()
        .str.replace_all(r"[^0-9]", "")
        .alias("IMO_digits")
    )
    .with_columns(
        # Validate IMO check digit
        pl.col("IMO_digits").map_elements(
            is_valid_imo_digits,
            return_dtype=pl.Boolean,
        ).alias("IMO_valid")
    )
    .filter(pl.col("IMO_valid") == True)
    .with_columns(
        # Cast cleaned digits to Int32
        pl.col("IMO_digits").cast(pl.Int32).alias("IMO")
    )
    .drop(["IMO_str", "IMO_digits", "IMO_valid"])
    # Keep only IMOs that exist in MRV
    .filter(pl.col("IMO").is_in(list(imo_mrv_set)))
)

### Codezelle 4 – MMSI → IMO eindeutig machen (ein MMSI, genau eine IMO)

In [4]:
# Some MMSI might (erroneously) map to multiple IMO numbers.
# We drop MMSI that map to more than one IMO.
# Multiple MMSI mapping to the same IMO is allowed.

lf_mmsi_imo_counts = (
    lf_ships_clean
    .group_by("MMSI")
    .agg(
        pl.col("IMO").n_unique().alias("n_unique_imo")
    )
)

# Keep only MMSI that map to exactly one IMO
lf_valid_mmsi = (
    lf_mmsi_imo_counts
    .filter(pl.col("n_unique_imo") == 1)
    .select("MMSI")
)

# Filter ships table to those "good" MMSI
lf_ships_unambiguous = (
    lf_ships_clean
    .join(
        lf_valid_mmsi,
        on="MMSI",
        how="inner"
    )
)

# Materialize the mapping (small table) for later joins with logs
df_ships_unambiguous = lf_ships_unambiguous.collect()

print("Unambiguous ship mapping shape:", df_ships_unambiguous.shape)
df_ships_unambiguous.head()

Unambiguous ship mapping shape: (30400, 3)


MMSI,IMO,TransceiverClass
i64,i32,str
205681000,9659139,"""A"""
205691000,9687502,"""A"""
205736000,9719290,"""A"""
209017000,9416719,"""A"""
209109000,9426881,"""A"""


### Codezelle 5 – Monatsweise AIS_logs bereinigen & mit Ships mappen

In [5]:
for log_file in monthly_log_files:
    print(f"Processing {log_file.name} ...")

    # Lazy scan of the monthly log file
    lf_log = (
        pl.scan_parquet(str(log_file))
        .select(
            [
                pl.col("MMSI"),
                pl.col("BaseDateTime"),
                pl.col("LAT"),
                pl.col("LON"),
                pl.col("SOG"),
                pl.col("COG"),
                pl.col("Heading"),
                pl.col("Status"),
            ]
        )
        .with_columns(
            # Ensure MMSI uses same type as ships mapping
            pl.col("MMSI").cast(pl.Int64).alias("MMSI")
        )
    )

    # Join with ship mapping (df_ships_unambiguous -> lazy)
    lf_joined = (
    lf_log
    .join(
        df_ships_unambiguous.lazy(),
        on="MMSI",
        how="inner"
    )
    .filter(pl.col("IMO").is_not_null())
    .with_columns([
        pl.col("LAT").cast(pl.Float32),
        pl.col("LON").cast(pl.Float32),
        pl.col("SOG").cast(pl.Float32),
        pl.col("COG").cast(pl.Float32),
        pl.col("Heading").cast(pl.Float32),
        pl.col("BaseDateTime").cast(pl.Datetime),
        pl.col("Status").cast(pl.Int8),
    ])
    # outlier filters etc. bleiben
    .filter((pl.col("LAT") >= -90.0) & (pl.col("LAT") <= 90.0))
    .filter((pl.col("LON") >= -180.0) & (pl.col("LON") <= 180.0))
    .filter(
        pl.col("SOG").is_null()
        | ((pl.col("SOG") >= 0.0) & (pl.col("SOG") <= 50.0))
    )
    .filter(
        pl.col("COG").is_null()
        | ((pl.col("COG") >= 0.0) & (pl.col("COG") <= 360.0))
    )
    .filter(
        pl.col("Heading").is_null()
        | ((pl.col("Heading") >= 0.0) & (pl.col("Heading") <= 360.0))
    )
    # final columns
    .select(
        [
            "IMO",
            "BaseDateTime",
            "LAT",
            "LON",
            "SOG",
            "COG",
            "Heading",
            "Status",
        ]
    )
)

    # Write cleaned monthly parquet
    out_file = MONTHLY_OUTPUT_DIR / f"{log_file.stem}_cleaned.parquet"
    lf_joined.sink_parquet(str(out_file))

    print(f" -> written: {out_file}")

Processing 2024_NOAA_AIS_logs_01.parquet ...
 -> written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_01_cleaned.parquet
Processing 2024_NOAA_AIS_logs_02.parquet ...
 -> written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_02_cleaned.parquet
Processing 2024_NOAA_AIS_logs_03.parquet ...
 -> written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_03_cleaned.parquet
Processing 2024_NOAA_AIS_logs_04.parquet ...
 -> written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_04_cleaned.parquet
Processing 2024_NOAA_AIS_logs_05.parquet ...
 -> written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_05_cleaned.parquet
Processing 2024_NOAA_AIS_logs_06.parquet ...
 -> written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_06_cl

### Codezelle 6 – Monatsdateien → Jahresdatei

In [6]:
import polars as pl
from pathlib import Path

BASE_DIR = Path("/Users/jakobschneider/Machine Learning/Data_LCC")
MONTHLY_OUTPUT_DIR = BASE_DIR / "AIS_2024_cleaned_monthly"

# Pfad für die Jahresdatei
YEARLY_OUTPUT_PATH = BASE_DIR / "AIS_2024_cleaned_year.parquet"

# Alle monatlichen cleaned-Dateien einsammeln
cleaned_files = sorted(MONTHLY_OUTPUT_DIR.glob("2024_NOAA_AIS_logs_*_cleaned.parquet"))

print("Found cleaned monthly files:")
for f in cleaned_files:
    print(" -", f)

# LazyFrame über alle Monatsdateien
lf_year = pl.scan_parquet([str(f) for f in cleaned_files])

# Optional: check schema einmal
print("\nSchema of combined lazy frame:")
print(lf_year.schema)

# Speicherschonend als eine Jahresdatei schreiben
lf_year.sink_parquet(str(YEARLY_OUTPUT_PATH))

print("\nYearly cleaned AIS written to:", YEARLY_OUTPUT_PATH)

Found cleaned monthly files:
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_01_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_02_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_03_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_04_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_05_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_06_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_07_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_08_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024

  print(lf_year.schema)



Yearly cleaned AIS written to: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_year.parquet


Jahresdatei ist 6GB groß

### Codezelle 7 - Jahresdatei prüfen

In [6]:
import polars as pl
from pathlib import Path

BASE_DIR = Path("/Users/jakobschneider/Machine Learning/Data_LCC")
YEARLY_OUTPUT_PATH = BASE_DIR / "AIS_2024_cleaned_year.parquet"

lf_year = pl.scan_parquet(str(YEARLY_OUTPUT_PATH))
df_preview = lf_year.limit(60000).collect()
print(df_preview)

shape: (60_000, 8)
┌─────────┬─────────────────────┬───────────┬─────────────┬─────┬────────────┬─────────┬────────┐
│ IMO     ┆ BaseDateTime        ┆ LAT       ┆ LON         ┆ SOG ┆ COG        ┆ Heading ┆ Status │
│ ---     ┆ ---                 ┆ ---       ┆ ---         ┆ --- ┆ ---        ┆ ---     ┆ ---    │
│ i32     ┆ datetime[μs]        ┆ f32       ┆ f32         ┆ f32 ┆ f32        ┆ f32     ┆ i8     │
╞═════════╪═════════════════════╪═══════════╪═════════════╪═════╪════════════╪═════════╪════════╡
│ 9620994 ┆ 2024-01-01 00:00:00 ┆ 38.042961 ┆ -122.132118 ┆ 0.0 ┆ 238.800003 ┆ 234.0   ┆ 5      │
│ 9620994 ┆ 2024-01-01 00:00:00 ┆ 38.042961 ┆ -122.132118 ┆ 0.0 ┆ 238.800003 ┆ 234.0   ┆ 5      │
│ 9620994 ┆ 2024-01-01 00:00:00 ┆ 38.042961 ┆ -122.132118 ┆ 0.0 ┆ 238.800003 ┆ 234.0   ┆ 5      │
│ 9620994 ┆ 2024-01-01 00:00:00 ┆ 38.042961 ┆ -122.132118 ┆ 0.0 ┆ 238.800003 ┆ 234.0   ┆ 5      │
│ 9620994 ┆ 2024-01-01 00:00:00 ┆ 38.042961 ┆ -122.132118 ┆ 0.0 ┆ 238.800003 ┆ 234.0   ┆ 5      │
│