In [2]:
import polars as pl

# ---------- 0. Patterns anpassen ----------
LOGS_PATTERN = '/Users/jakobschneider/Machine Learning/Data_LCC/2024_NOAA_AIS_logs_*.parquet'
SHIPS_PATTERN = '/Users/jakobschneider/Machine Learning/Data_LCC/2024_NOAA_AIS_ships_*.parquet'

OUTPUT_YEAR = '/Users/jakobschneider/Machine Learning/Data_LCC/Data_merged.parquet'


# ---------- 1. Lazy scans ----------
logs_all = pl.scan_parquet(LOGS_PATTERN)
ships_all = pl.scan_parquet(SHIPS_PATTERN)


# ---------- 2. Cast BaseDateTime to Datetime ----------
logs_all = logs_all.with_columns(
    pl.col("BaseDateTime").cast(pl.Datetime)
)

ships_all = (
    ships_all
    .with_columns(
        pl.col("BaseDateTime").cast(pl.Datetime)
    )
    .with_columns(
        pl.col("BaseDateTime").alias("ShipMetadataTimestamp")
    )
    .drop("BaseDateTime")
)


# ---------- 3. Drop ALL rows in logs with ANY missing values ----------
# This removes rows where ANY column (MMSI, LAT, LON, SOG, COG, Heading, Status, BaseDateTime)
# is null or NaN.
logs_clean = logs_all.drop_nulls()


# ---------- 4. Ships: reduce columns ----------
ships_reduced = ships_all.select([
    "MMSI",
    "VesselName",
    "IMO",
    "CallSign",
    "VesselType",
    "Length",
    "Width",
    "Draft",
    "Cargo",
    "TransceiverClass",
    "ShipMetadataTimestamp",
]).unique("MMSI", keep="last")  # ensure 1x per MMSI


# ---------- 5. Join logs + ships via MMSI ----------
logs_enriched = (
    logs_clean.join(
        ships_reduced,
        on="MMSI",
        how="left"
    )
)


# ---------- 6. Optional relevant ship filter ----------
# You can comment this out if you want the full dataset.
relevant_vessel_types = [70, 71, 72, 73, 74, 75, 76]  # example only

logs_filtered = (
    logs_enriched
    .filter(
        (pl.col("TransceiverClass") == "A") &
        (pl.col("Draft") > 0) &
        (pl.col("VesselType").is_in(relevant_vessel_types))
    )
)


# ---------- 7. Write final annual parquet ----------
logs_filtered.sink_parquet(OUTPUT_YEAR)

print(f"\nWritten full enriched 2024 dataset to:\n  {OUTPUT_YEAR}\n")


Written full enriched 2024 dataset to:
  /Users/jakobschneider/Machine Learning/Data_LCC/Data_merged.parquet



Neue Jahresdatei nur mit Transceiverklasse A

In [1]:
import polars as pl

PATH_YEAR = '/Users/jakobschneider/Machine Learning/Data_LCC/Data_merged.parquet'

PATH_YEAR_A = "/Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_enriched_classA.parquet"

lf = pl.scan_parquet(PATH_YEAR)

# Nur TransceiverClass A behalten
lf_class_a = lf.filter(pl.col("TransceiverClass") == "A")

# Direkt als neue Parquet-Datei schreiben (Lazy → kein vollständiges collect im RAM)
lf_class_a.sink_parquet(PATH_YEAR_A)

print(f"Gefilterte Datei geschrieben nach: {PATH_YEAR_A}")

Gefilterte Datei geschrieben nach: /Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_enriched_classA.parquet


In [2]:
import polars as pl

PATH_YEAR_A = "/Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_enriched_classA.parquet"
lf_a = pl.scan_parquet(PATH_YEAR_A)

# Zeilenanzahl (ohne alles zu laden)
row_count = lf_a.select(pl.count()).collect()[0, 0]
print("Rows (class A):", row_count)

# Anzahl Spalten
print("Columns:", len(lf_a.schema))

# Distinct IMO / MMSI
print(
    lf_a.select([
        pl.col("IMO").n_unique().alias("unique_imo"),
        pl.col("MMSI").n_unique().alias("unique_mmsi"),
    ]).collect()
)

# Ein paar Zeilen anschauen (das ist safe)
print(lf_a.limit(20).collect())

(Deprecated in version 0.20.5)
  row_count = lf_a.select(pl.count()).collect()[0, 0]
  print("Columns:", len(lf_a.schema))


Rows (class A): 225298554
Columns: 18
shape: (1, 2)
┌────────────┬─────────────┐
│ unique_imo ┆ unique_mmsi │
│ ---        ┆ ---         │
│ u32        ┆ u32         │
╞════════════╪═════════════╡
│ 8491       ┆ 8693        │
└────────────┴─────────────┘
shape: (20, 18)
┌───────────┬──────────────┬──────────┬────────────┬───┬───────┬───────┬─────────────┬─────────────┐
│ MMSI      ┆ BaseDateTime ┆ LAT      ┆ LON        ┆ … ┆ Draft ┆ Cargo ┆ Transceiver ┆ ShipMetadat │
│ ---       ┆ ---          ┆ ---      ┆ ---        ┆   ┆ ---   ┆ ---   ┆ Class       ┆ aTimestamp  │
│ i64       ┆ datetime[μs] ┆ f64      ┆ f64        ┆   ┆ f64   ┆ f64   ┆ ---         ┆ ---         │
│           ┆              ┆          ┆            ┆   ┆       ┆       ┆ str         ┆ datetime[μs │
│           ┆              ┆          ┆            ┆   ┆       ┆       ┆             ┆ ]           │
╞═══════════╪══════════════╪══════════╪════════════╪═══╪═══════╪═══════╪═════════════╪═════════════╡
│ 314510000 ┆ 2024-01-

In [3]:
# Drop MMSI
lf = pl.scan_parquet("/Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_enriched_classA.parquet")

lf_no_mmsi = lf.drop("MMSI")

lf_no_mmsi.sink_parquet("/Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_enriched_classA_noMMSI.parquet")

Zielvariable hinzufügen

In [5]:
import polars as pl
import pandas as pd

# ----------------- Paths (adjust if needed) -----------------
AIS_PATH = "/Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_enriched_classA_noMMSI.parquet"
MRV_PATH = "/Users/jakobschneider/Machine Learning/Data_LCC/MRV_2024.xlsx"
OUTPUT_PATH = "/Users/jakobschneider/Machine Learning/Data_LCC/AIS_2024_classA_MRV.parquet"

# Name of the MRV emissions column (adjust if the name differs slightly)
MRV_EMISS_COL = "Total CO₂eq emissions [m tonnes]"  # column BG in Excel


# ----------------- 1) Load AIS (lazy) -----------------
# AIS is large, so we use scan_parquet to avoid loading everything into RAM at once
ais_lf = pl.scan_parquet(AIS_PATH)


# ----------------- 2) Load MRV Excel via pandas -----------------
# MRV is small (~35k rows), pandas in memory is fine
mrv_pd = pd.read_excel(MRV_PATH)

# Optional: print columns once to verify the exact column names
print("MRV columns:", list(mrv_pd.columns))

# Keep only IMO and emissions column
mrv_pd = mrv_pd[["IMO", MRV_EMISS_COL]].copy()

# ----------------- 3) Convert MRV to Polars -----------------
mrv = pl.from_pandas(mrv_pd)

# Normalize column types: ensure IMO is same type in both datasets
# We cast IMO to Utf8 in both to be safe.
mrv = mrv.with_columns(
    pl.col("IMO").cast(pl.Utf8)
).rename({MRV_EMISS_COL: "Total_CO2eq_mt"})  # easier column name

ais_lf = ais_lf.with_columns(
    pl.col("IMO").cast(pl.Utf8)
)


# ----------------- 4) Join AIS + MRV on IMO -----------------
# Left join: keep all AIS rows, add MRV emissions where available
ais_with_mrv = ais_lf.join(
    mrv,
    on="IMO",
    how="left"
)


# ----------------- 5) Write result as Parquet -----------------
# This triggers the lazy computation but streams the result to disk
ais_with_mrv.sink_parquet(OUTPUT_PATH)

print(f"Written AIS + MRV merged dataset to:\n  {OUTPUT_PATH}")

  warn("Workbook contains no default style, apply openpyxl's default")


MRV columns: ['Ship', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Company', 'Unnamed: 9', 'DoC', 'Unnamed: 11', 'Verifier', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Monitoring methods', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Annual monitoring results', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 'Unna

KeyError: "None of [Index(['IMO', 'Total CO₂eq emissions [m tonnes]'], dtype='object')] are in the [columns]"