In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_trunc
from pyspark.sql.types import *
import polars as pl
import pandas as pd

In [6]:
# Create (or reuse) the notebook's Spark session.
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

# Service-account authentication settings, applied to the session's runtime conf.
# NOTE(review): presumably consumed by the GCS connector when reading gs:// paths — confirm.
for _conf_key, _conf_value in (
    ("google.cloud.auth.service.account.enable", "true"),
    ("google.cloud.auth.service.account.json.keyfile", "/opt/spark/credentials/google-credential.json"),
):
    spark.conf.set(_conf_key, _conf_value)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/08 00:49:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
def get_spark_schema():
  """Return the Spark schema used to read the vessel-traffic parquet data.

  The schema has 17 nullable columns, in the same order and with the same
  names as the polars schema built by ``get_pl_schema``.

  Returns:
    pyspark.sql.types.StructType: one ``StructField`` per column.
  """
  # ShortType is imported here explicitly: the function previously relied on a
  # notebook-level ``from pyspark.sql.types import *`` for that one name, while
  # importing every other type locally.
  from pyspark.sql.types import (
    StructType, StructField, IntegerType, ShortType, TimestampType,
    FloatType, DoubleType, StringType,
  )
  return StructType([
    StructField("MMSI", IntegerType(), True),
    StructField("BaseDateTime", TimestampType(), True),
    StructField("LAT", DoubleType(), True),
    StructField("LON", DoubleType(), True),
    StructField("SOG", FloatType(), True),
    StructField("COG", FloatType(), True),
    StructField("Heading", FloatType(), True),
    StructField("VesselName", StringType(), True),
    StructField("IMO", StringType(), True),
    StructField("CallSign", StringType(), True),
    StructField("VesselType", ShortType(), True),
    StructField("Status", ShortType(), True),
    StructField("Length", FloatType(), True),
    StructField("Width", FloatType(), True),
    StructField("Draft", FloatType(), True),
    StructField("Cargo", StringType(), True),
    StructField("TransceiverClass", StringType(), True)
  ])

def get_pl_schema():
  """Return the polars schema applied when parsing the daily CSV.

  Column order and names match the Spark schema from ``get_spark_schema``;
  dtypes are the polars counterparts (Int32/Int16/Float32/Float64/String/Datetime).

  Returns:
    polars.Schema: mapping of column name to polars dtype instance.
  """
  import polars as pl

  columns = {
    "MMSI": pl.Int32(),
    "BaseDateTime": pl.Datetime(),
    "LAT": pl.Float64(),
    "LON": pl.Float64(),
    "SOG": pl.Float32(),
    "COG": pl.Float32(),
    "Heading": pl.Float32(),
    "VesselName": pl.String(),
    "IMO": pl.String(),
    "CallSign": pl.String(),
    "VesselType": pl.Int16(),
    "Status": pl.Int16(),
    "Length": pl.Float32(),
    "Width": pl.Float32(),
    "Draft": pl.Float32(),
    "Cargo": pl.String(),
    "TransceiverClass": pl.String(),
  }
  return pl.Schema(columns)


In [None]:
def ingest_day(url:str, path:str, storage_options:dict = None):
  """Download one zipped daily CSV and write it out as a parquet file.

  The archive is fetched over HTTP, buffered fully in memory, and its CSV
  member is parsed with the schema from ``get_pl_schema`` before being written
  to ``path``.

  Args:
    url: HTTP(S) location of the ``.zip`` archive.
    path: Destination passed to ``polars.DataFrame.write_parquet``.
    storage_options: Optional options forwarded to ``write_parquet``
      (for example cloud credentials).

  Raises:
    requests.HTTPError: the server answered with a 4xx/5xx status.
    requests.Timeout: connecting took over 10 s or the server sent no data
      for 120 s.
    ValueError: the archive contains no ``.csv`` member.
  """
  import requests, zipfile, io
  import polars as pl

  # The whole body is read via ``.content`` anyway, so ``stream=True`` only
  # risked leaving the connection open on the error path; a timeout keeps a
  # stalled server from hanging the notebook.
  res = requests.get(url=url, timeout=(10, 120))
  # Fail loudly instead of silently doing nothing on an error response.
  res.raise_for_status()

  with zipfile.ZipFile(io.BytesIO(res.content), 'r') as zip_ref:
    # Select the CSV explicitly rather than assuming it is the first entry.
    csv_names = [name for name in zip_ref.namelist() if name.lower().endswith(".csv")]
    if not csv_names:
      raise ValueError(f"No CSV file found in archive downloaded from {url}: {zip_ref.namelist()}")
    with zip_ref.open(csv_names[0]) as csv_file:
      pl.read_csv(csv_file, has_header=True, infer_schema=False,schema=get_pl_schema()) \
        .write_parquet(file=path, storage_options=storage_options)

def spark_read(spark, path):
    """Load parquet data from ``path`` using the schema from ``get_spark_schema``.

    Previously this was an empty stub that returned ``None``.

    Args:
        spark: Active ``SparkSession`` used to perform the read.
        path: Parquet location (a glob is accepted by Spark's reader).

    Returns:
        pyspark.sql.DataFrame: the data read with the explicit schema.
    """
    return spark.read.schema(get_spark_schema()).format("parquet").load(path)

In [None]:
# Source archive for a single day.
url = "https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2024/AIS_2024_01_01.zip"

# NOTE(review): this is a glob pattern. It suits the Spark read below, but
# ingest_day passes `path` straight to write_parquet, which presumably needs a
# concrete object name — confirm before re-enabling the call.
path = "gs://vessel-traffic-parquet-data/raw_day/*"

# Credentials handed to polars when writing to the bucket.
storage_options = dict(service_account_path="/opt/spark/credentials/google-credential.json")
#ingest_day(url, path, storage_options)

In [26]:
# Read every parquet file matched by `path`, enforcing the explicit schema
# rather than the one stored in the files.
spark_df = (
    spark.read
    .schema(get_spark_schema())
    .parquet(path)
)

In [27]:
# Number of rows loaded from the parquet data.
spark_df.count()

                                                                                

7296275

In [28]:
# Preview the first rows to check that the columns parsed with the expected types.
spark_df.show()

[Stage 3:>                                                          (0 + 1) / 1]

+---------+-------------------+--------+----------+----+-----+-------+--------------------+----------+--------+----------+------+------+-----+-----+-----+----------------+
|     MMSI|       BaseDateTime|     LAT|       LON| SOG|  COG|Heading|          VesselName|       IMO|CallSign|VesselType|Status|Length|Width|Draft|Cargo|TransceiverClass|
+---------+-------------------+--------+----------+----+-----+-------+--------------------+----------+--------+----------+------+------+-----+-----+-----+----------------+
|338075892|2024-01-01 00:00:03|43.65322| -70.25298| 0.0|358.8|  511.0|PILOT BOAT SPRING PT|      NULL| WDB8945|        90|     0|   0.0|  0.0|  0.0|   90|               A|
|367669550|2024-01-01 00:00:04|46.20031|-123.38573| 0.0|281.9|  141.0|   ALASKA CHALLENGER|IMO7938024| WDH9586|        30|    15|  30.0|  8.0|  0.0|   30|               A|
|367118980|2024-01-01 00:00:06|29.98534| -90.40674| 0.0| 30.1|  296.0|     CAPT J A MORGAN|IMO1186680| WDD2725|        31|    12| 115.0| 34.

                                                                                

In [93]:
# Add a DATE column holding the first day of each record's month
# (BaseDateTime truncated to month, then cast from timestamp to date).
# NOTE(review): presumably intended as a partition key, going by the column name — confirm.
updated_df = spark_df.withColumn(
    "Year-Month-Partition",
    date_trunc("month", col("BaseDateTime")).cast("date"),
)

In [94]:
# Preview the frame with the new Year-Month-Partition column appended.
updated_df.show()

+---------+-------------------+--------+----------+----+-----+-------+--------------------+----------+--------+----------+------+------+-----+-----+-----+----------------+--------------------+
|     MMSI|       BaseDateTime|     LAT|       LON| SOG|  COG|Heading|          VesselName|       IMO|CallSign|VesselType|Status|Length|Width|Draft|Cargo|TransceiverClass|Year-Month-Partition|
+---------+-------------------+--------+----------+----+-----+-------+--------------------+----------+--------+----------+------+------+-----+-----+-----+----------------+--------------------+
|338075892|2024-01-01 00:00:03|43.65322| -70.25298| 0.0|358.8|  511.0|PILOT BOAT SPRING PT|      NULL| WDB8945|        90|     0|   0.0|  0.0|  0.0|   90|               A|          2024-01-01|
|367669550|2024-01-01 00:00:04|46.20031|-123.38573| 0.0|281.9|  141.0|   ALASKA CHALLENGER|IMO7938024| WDH9586|        30|    15|  30.0|  8.0|  0.0|   30|               A|          2024-01-01|
|367118980|2024-01-01 00:00:06|29.9