In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import *
# import polars as pl
# import pandas as pd

In [2]:
conf = SparkConf().set('spark.ui.port', '4045')\
  .set("google.cloud.auth.service.account.enable", "true")\
  .set("google.cloud.auth.service.account.json.keyfile", "/opt/spark/credentials/google-credential.json")
spark = SparkSession.builder.appName("test").config(conf = conf).master("local[*]").getOrCreate()
# spark.conf.set("google.cloud.auth.service.account.enable", "true") 
# spark.conf.set("google.cloud.auth.service.account.json.keyfile", "/opt/spark/credentials/google-credential.json")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/14 21:22:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
def get_spark_schema():
  from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, FloatType, DoubleType, StringType
  return StructType([
    StructField("MMSI", StringType(), False),
    StructField("BaseDateTime", TimestampType(), False),
    StructField("LAT", DoubleType(), False),
    StructField("LON", DoubleType(), False),
    StructField("SOG", FloatType(), False),
    StructField("COG", FloatType(), False),
    StructField("Heading", FloatType(), True),
    StructField("VesselName", StringType(), True),
    StructField("IMO", StringType(), True),
    StructField("CallSign", StringType(), True),
    StructField("VesselType", ShortType(), True),
    StructField("Status", ShortType(), True),
    StructField("Length", FloatType(), True),
    StructField("Width", FloatType(), True),
    StructField("Draft", FloatType(), True),
    StructField("Cargo", StringType(), True),
    StructField("TransceiverClass", StringType(), False)
  ])

In [4]:
gcs_path = "gs://vessel-traffic-parquet-data/"
spark_df = spark.read.schema(get_spark_schema()).format("parquet").load(gcs_path + "raw_day")
vessel_profile_df = spark_df.select("MMSI", "VesselName", "IMO", "CallSign", "VesselType", "Length", "Width").distinct()
ais_df = spark_df.select("MMSI","BaseDateTime","LAT","LON","SOG","COG","Heading","Status","Draft","Cargo","TransceiverClass")

In [5]:
spark_df.show()
# cols_with_nulls = [x for x in spark_df.columns if spark_df.filter(f.col(x).isNull()).count() > 0]
# cols_with_emptystring = [x for x in spark_df.columns if spark_df.filter(f.col(x).isNull()).count() > 0]

[Stage 0:>                                                          (0 + 1) / 1]

+---------+-------------------+--------+----------+----+-----+-------+--------------------+----------+--------+----------+------+------+-----+-----+-----+----------------+
|     MMSI|       BaseDateTime|     LAT|       LON| SOG|  COG|Heading|          VesselName|       IMO|CallSign|VesselType|Status|Length|Width|Draft|Cargo|TransceiverClass|
+---------+-------------------+--------+----------+----+-----+-------+--------------------+----------+--------+----------+------+------+-----+-----+-----+----------------+
|338075892|2024-01-01 00:00:03|43.65322| -70.25298| 0.0|358.8|  511.0|PILOT BOAT SPRING PT|      NULL| WDB8945|        90|     0|   0.0|  0.0|  0.0|   90|               A|
|367669550|2024-01-01 00:00:04|46.20031|-123.38573| 0.0|281.9|  141.0|   ALASKA CHALLENGER|IMO7938024| WDH9586|        30|    15|  30.0|  8.0|  0.0|   30|               A|
|367118980|2024-01-01 00:00:06|29.98534| -90.40674| 0.0| 30.1|  296.0|     CAPT J A MORGAN|IMO1186680| WDD2725|        31|    12| 115.0| 34.

                                                                                

In [6]:
spark_df.count()

                                                                                

221958517

In [7]:
#documentation regarding "invalid/not accessable/default" values on:
#https://www.navcen.uscg.gov/ais-class-a-reports

#replace values for "invalid/not accessable/default" to Null for non-categorial field 
vessel_profile_df = vessel_profile_df.replace("IMO0000000", None, "IMO")
vessel_profile_df = vessel_profile_df.replace(0, None, ["Length", "Width"])
ais_df = ais_df.replace(511.0, None, "Heading")
ais_df = ais_df.replace(102.3, None, "SOG")
ais_df = ais_df.replace(360, None, "COG")
ais_df = ais_df.replace(0, None, "Draft")

#replace null to encoded "invalid/not accessable/default" values for categorial field
vessel_profile_df = vessel_profile_df.fillna(0, "VesselType")
ais_df = ais_df.fillna(15, "Status")
ais_df = ais_df.fillna(0, "Cargo")


In [10]:
ais_df.show()

[Stage 19:>                                                         (0 + 1) / 1]

+---------+-------------------+--------+----------+----+-----+-------+------+-----+-----+----------------+
|     MMSI|       BaseDateTime|     LAT|       LON| SOG|  COG|Heading|Status|Draft|Cargo|TransceiverClass|
+---------+-------------------+--------+----------+----+-----+-------+------+-----+-----+----------------+
|338075892|2024-01-01 00:00:03|43.65322| -70.25298| 0.0|358.8|   NULL|     0| NULL|   90|               A|
|367669550|2024-01-01 00:00:04|46.20031|-123.38573| 0.0|281.9|  141.0|    15| NULL|   30|               A|
|367118980|2024-01-01 00:00:06|29.98534| -90.40674| 0.0| 30.1|  296.0|    12|  3.0|   57|               A|
|367177840|2024-01-01 00:00:05|39.88654| -75.17649| 0.0|304.4|   NULL|    15| NULL|   52|               A|
|367305420|2024-01-01 00:00:06|18.33273| -64.95229| 0.0|332.6|   NULL|     0| NULL|   52|               A|
|338239081|2024-01-01 00:00:05|38.95731|  -76.4841| 0.1|111.3|   NULL|    15| NULL| NULL|               B|
|367507960|2024-01-01 00:00:02|33.753

                                                                                

In [11]:
vessel_profile_df.show()



+---------+-----------------+----------+--------+----------+------+-----+
|     MMSI|       VesselName|       IMO|CallSign|VesselType|Length|Width|
+---------+-----------------+----------+--------+----------+------+-----+
|367083580|         ROCKFISH|IMO8998954| WDC8223|        52|  24.0| 11.0|
|367338000|     HARVEY POWER|IMO9654232|    KVEY|        70|  92.0| 20.0|
|368128050|            ROYAL|IMO2855411| WDL4095|        60|  14.0|  5.0|
|368293190|ISABELLA JULIETTE|IMO1326320| WDN5797|        52|  24.0| 10.0|
|366982180|     C-TRACTOR 10|IMO8875463| WCO3210|        52|  25.0| 10.0|
|368339870|            KIAKI|      NULL| WDP2829|        37|  14.0|  4.0|
|538009654|    CLIPPER CLYDE|IMO9455911| V7A5002|        70| 177.0| 27.0|
|367385380|          VICTORY|      NULL| WDH8434|        37|  30.0|  7.0|
|368020980|       ASSATEAGUE|IMO9816892| WDJ8905|        31| 145.0| 17.0|
|367181290|             NYAD|      NULL| WDL8733|        36|  16.0|  5.0|
|477890100|  ZIM MOUNT BLANC|IMO993111

                                                                                

In [32]:
vessel_profile_df.filter(f.expr("VesselName like '@'")).show()



+----+----------+---+--------+----------+------+-----+
|MMSI|VesselName|IMO|CallSign|VesselType|Length|Width|
+----+----------+---+--------+----------+------+-----+
+----+----------+---+--------+----------+------+-----+





In [8]:
if vessel_profile_df.count() != vessel_profile_df.select("MMSI").distinct().count():
  vessel_profile_df.groupBy("MMSI").count().filter(f.expr("count > 1")).sort(f.desc("count")).show()

                                                                                

In [34]:
vessel_profile_df.write.parquet(gcs_path + "/test/vessel_profile")

                                                                                

In [35]:
ais_df.write.parquet(gcs_path + "/test/ais_data")

                                                                                