In [13]:
import polars as pl
from pathlib import Path

# Base directory for your data
BASE_DIR = Path("/Users/jakobschneider/Machine Learning/Data_LCC")

# Directories for AIS logs and AIS ships
AIS_LOGS_DIR  = BASE_DIR / "AIS_logs_2024"
AIS_SHIPS_DIR = BASE_DIR / "AIS_ships_2024"

# Output directories / files
MONTHLY_OUTPUT_DIR = BASE_DIR / "AIS_2024_cleaned_monthly"
YEARLY_OUTPUT_PATH = BASE_DIR / "AIS_2024_cleaned_year.parquet"

MONTHLY_OUTPUT_DIR.mkdir(exist_ok=True)

# List all monthly AIS log and ship files (NOAA naming)
monthly_log_files = sorted(AIS_LOGS_DIR.glob("2024_NOAA_AIS_logs_*.parquet"))
monthly_ship_files = sorted(AIS_SHIPS_DIR.glob("2024_NOAA_AIS_ships_*.parquet"))

print("Monthly AIS log files:")
for f in monthly_log_files:
    print(" -", f)

print("\nMonthly AIS ship files:")
for f in monthly_ship_files:
    print(" -", f)

print("\nMonthly output directory:", MONTHLY_OUTPUT_DIR)
print("Yearly output file:", YEARLY_OUTPUT_PATH)

Monthly AIS log files:
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_01.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_02.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_03.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_04.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_05.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_06.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_07.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_08.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_09.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_logs_2024/2024_NOAA_AIS_logs_10.parquet
 - /Users/jakobschneider/Ma

In [14]:
# Helper function to check IMO validity (7 digits + check digit)
def is_valid_imo_digits(imo_digits: str) -> bool:
    """
    Validate IMO number using check digit.
    Input is expected as a string of exactly 7 digits.
    """
    if imo_digits is None:
        return False
    imo_digits = str(imo_digits).strip()
    if not imo_digits.isdigit():
        return False
    if len(imo_digits) != 7:
        return False

    digits = [int(d) for d in imo_digits]
    # First 6 digits * weights 7..2, last digit is check digit
    weighted_sum = sum(d * w for d, w in zip(digits[:6], range(7, 1, -1)))
    check_digit = weighted_sum % 10
    return check_digit == digits[6]

In [15]:
# Load all monthly AIS ship files as one lazy frame
lf_ships = pl.scan_parquet([str(f) for f in monthly_ship_files])

# Select relevant columns and prepare MMSI/IMO
lf_ships = (
    lf_ships
    .select(
        [
            pl.col("MMSI"),
            pl.col("IMO"),
            pl.col("VesselType"),
            # add more columns here if needed, e.g. Length, Width, Draft, ...
        ]
    )
    .with_columns(
        # Ensure MMSI is consistent type (Int64) for later join
        pl.col("MMSI").cast(pl.Int64).alias("MMSI"),
        # Clean IMO: convert to string, strip whitespace, remove non-digits (e.g. remove 'IMO' prefix)
        pl.col("IMO")
        .cast(pl.Utf8)
        .str.strip_chars()
        .str.replace_all(r"[^0-9]", "")  # keep only digits
        .alias("IMO_digits")
    )
)

# Quick sanity check on IMO formats
lf_ships.select(
    [
        pl.col("IMO").head(5).alias("raw_IMO_sample"),
        pl.col("IMO_digits").head(5).alias("digits_IMO_sample"),
    ]
).collect()

raw_IMO_sample,digits_IMO_sample
str,str
"""<Unknown>""",""""""
"""IMO0000000""","""0000000"""
"""<Unknown>""",""""""
"""IMO0000007""","""0000007"""
"""IMO0000001""","""0000001"""


In [16]:
# Apply IMO digit-based validation
lf_ships_clean = (
    lf_ships
    .with_columns(
        pl.col("IMO_digits").map_elements(
            is_valid_imo_digits,
            return_dtype=pl.Boolean,
        ).alias("IMO_valid")
    )
    # Keep only valid IMOs (you can temporarily comment this out to see effect)
    .filter(pl.col("IMO_valid") == True)
    .with_columns(
        pl.col("IMO_digits").cast(pl.Int64).alias("IMO_int")
    )
    .drop(["IMO", "IMO_digits", "IMO_valid"])
    .rename({"IMO_int": "IMO"})
)

# Optional: check how many ships remain
lf_ships_clean.select(pl.count().alias("n_rows")).collect()
lf_ships_clean.limit(5).collect()

(Deprecated in version 0.20.5)
  lf_ships_clean.select(pl.count().alias("n_rows")).collect()


MMSI,VesselType,IMO
i64,f64,i64
1,30.0,0
4061,33.0,0
1234567,1.0,0
3381234,36.0,0
103669999,37.0,0


In [None]:
# Ensure consistent dtypes and remove rows without IMO
lf_ships_clean = (
    lf_ships_clean
    .with_columns([
        pl.col("MMSI").cast(pl.Int64).alias("MMSI"),
        pl.col("IMO").cast(pl.Int64).alias("IMO"),
    ])
    .filter(pl.col("IMO").is_not_null())
)

# Some MMSI might (erroneously) map to multiple IMO numbers.
# We drop only those MMSI to keep a consistent mapping:
# - multiple MMSI -> same IMO = OK
# - one MMSI -> multiple IMO = NOT OK

lf_mmsi_imo_counts = (
    lf_ships_clean
    .group_by("MMSI")
    .agg(
        pl.col("IMO").n_unique().alias("n_unique_imo")
    )
)

# Keep only MMSI that map to exactly one IMO
lf_valid_mmsi = (
    lf_mmsi_imo_counts
    .filter(pl.col("n_unique_imo") == 1)
    .select("MMSI")
)

# Filter ships table to those "good" MMSI
lf_ships_unambiguous = (
    lf_ships_clean
    .join(
        lf_valid_mmsi,
        on="MMSI",
        how="inner"
    )
)

# Optional: how many ships (rows) remain and first few examples
lf_ships_unambiguous.select(pl.count().alias("n_rows")).collect()
lf_ships_unambiguous.limit(5).collect()

(Deprecated in version 0.20.5)
  lf_ships_unambiguous.select(pl.count().alias("n_rows")).collect()


MMSI,VesselType,IMO
i64,f64,i64
1,30.0,0
4061,33.0,0
1234567,1.0,0
3381234,36.0,0
103669999,37.0,0


In [18]:
# Collect unambiguous ship mapping into memory
df_ships_unambiguous = lf_ships_unambiguous.collect()

print("Unambiguous ship mapping shape:", df_ships_unambiguous.shape)
df_ships_unambiguous.head()

Unambiguous ship mapping shape: (358944, 3)


MMSI,VesselType,IMO
i64,f64,i64
1,30.0,0
4061,33.0,0
1234567,1.0,0
3381234,36.0,0
103669999,37.0,0


In [20]:
for log_file in monthly_log_files:
    print(f"Processing {log_file.name} ...")

    # Lazy scan of the monthly log file
    lf_log = (
        pl.scan_parquet(str(log_file))
        .select(
            [
                pl.col("MMSI"),
                pl.col("BaseDateTime"),
                pl.col("LAT"),
                pl.col("LON"),
                pl.col("SOG"),
                pl.col("COG"),
                pl.col("Heading"),
                pl.col("Status"),
            ]
        )
        .with_columns(
            # Ensure MMSI uses same type as ships mapping
            pl.col("MMSI").cast(pl.Int64).alias("MMSI")
        )
    )

    # Join with ship mapping (df_ships_unambiguous -> lazy)
    lf_joined = (
        lf_log
        .join(
            df_ships_unambiguous.lazy(),
            on="MMSI",
            how="inner"
        )
        # Keep only entries with valid IMO (should be redundant, but safe)
        .filter(pl.col("IMO").is_not_null())
        # Set correct dtypes
        .with_columns([
            pl.col("LAT").cast(pl.Float64),
            pl.col("LON").cast(pl.Float64),
            pl.col("SOG").cast(pl.Float64),
            pl.col("COG").cast(pl.Float64),
            pl.col("Heading").cast(pl.Float64),
            pl.col("BaseDateTime").cast(pl.Datetime),

            # Navigation status as integer code (0–15)
            pl.col("Status").cast(pl.Int64),

            # VesselType as categorical is okay – typically integer codes
            pl.col("VesselType").cast(pl.Categorical),
        ])
        # Outlier filters
        .filter((pl.col("LAT") >= -90.0) & (pl.col("LAT") <= 90.0))
        .filter((pl.col("LON") >= -180.0) & (pl.col("LON") <= 180.0))
        .filter(
            pl.col("SOG").is_null()
            | ((pl.col("SOG") >= 0.0) & (pl.col("SOG") <= 50.0))
        )
        .filter(
            pl.col("COG").is_null()
            | ((pl.col("COG") >= 0.0) & (pl.col("COG") <= 360.0))
        )
        .filter(
            pl.col("Heading").is_null()
            | ((pl.col("Heading") >= 0.0) & (pl.col("Heading") <= 360.0))
        )
        # Keep only core columns
        .select(
            [
                "MMSI",
                "IMO",
                "BaseDateTime",
                "LAT",
                "LON",
                "SOG",
                "COG",
                "Heading",
                "Status",
                "VesselType",
            ]
        )
    )

    out_file = MONTHLY_OUTPUT_DIR / f"{log_file.stem}_cleaned.parquet"
    lf_joined.sink_parquet(str(out_file))

    print(f" -> written: {out_file}")

Processing 2024_NOAA_AIS_logs_01.parquet ...
 -> written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_01_cleaned.parquet
Processing 2024_NOAA_AIS_logs_02.parquet ...
 -> written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_02_cleaned.parquet
Processing 2024_NOAA_AIS_logs_03.parquet ...
 -> written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_03_cleaned.parquet
Processing 2024_NOAA_AIS_logs_04.parquet ...
 -> written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_04_cleaned.parquet
Processing 2024_NOAA_AIS_logs_05.parquet ...
 -> written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_05_cleaned.parquet
Processing 2024_NOAA_AIS_logs_06.parquet ...
 -> written: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_06_cl

### Status und VesselType OneHot-Encoding

In [1]:
import polars as pl
from pathlib import Path

BASE_DIR = Path("/Users/jakobschneider/Machine Learning/Data_LCC")
MONTHLY_OUTPUT_DIR = BASE_DIR / "AIS_2024_cleaned_monthly"

# All cleaned monthly files
cleaned_files = sorted(MONTHLY_OUTPUT_DIR.glob("2024_NOAA_AIS_logs_*_cleaned.parquet"))
print("Found cleaned monthly files:")
for f in cleaned_files:
    print(" -", f)

# Create a lazy frame over all cleaned monthly files
lf_all = pl.scan_parquet([str(f) for f in cleaned_files])

# Value counts for Status
status_counts = (
    lf_all
    .group_by("Status")
    .agg(pl.count().alias("n_rows"))
    .sort("n_rows", descending=True)
    .collect()
)

print("\n=== Status value counts ===")
print(status_counts)

# Value counts for VesselType
vessel_counts = (
    lf_all
    .group_by("VesselType")
    .agg(pl.count().alias("n_rows"))
    .sort("n_rows", descending=True)
    .collect()
)

print("\n=== VesselType value counts ===")
print(vessel_counts)

Found cleaned monthly files:
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_01_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_02_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_03_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_04_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_05_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_06_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_07_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_cleaned_monthly/2024_NOAA_AIS_logs_08_cleaned.parquet
 - /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024

(Deprecated in version 0.20.5)
  .agg(pl.count().alias("n_rows"))


: 