In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.window import Window 
import pyspark.sql.functions as F
from sedona.spark import *
import argparse

In [2]:
# Session settings: Spark UI port, Google Cloud service-account keyfile, and the
# Kryo serializer / registrator classes exposed by Sedona.
conf = SparkConf().setAll([
    ("spark.ui.port", "4045"),
    ("google.cloud.auth.service.account.enable", "true"),
    ("google.cloud.auth.service.account.json.keyfile", "/opt/spark/credentials/google-credential.json"),
    ("spark.serializer", KryoSerializer.getName),
    ("spark.kryo.registrator", SedonaKryoRegistrator.getName),
])

# Builder options are applied in the same order as before (app name, conf, master)
# so that any overlapping keys resolve identically.
builder = SparkSession.builder.appName("test").config(conf=conf).master("local[*]")
spark = builder.getOrCreate()

# Attach Sedona to the session so the ST_* functions used below are available.
SedonaContext.create(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/24 22:52:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
def get_spark_schema():
  """Build the Spark schema describing the AIS position columns.

  Returns:
    A ``StructType`` with four non-nullable fields, in this order:
    ``MMSI`` (string), ``BaseDateTime`` (timestamp), ``LAT`` (double)
    and ``LON`` (double).
  """
  # Import only the types this schema actually uses.
  from pyspark.sql.types import StructType, StructField, TimestampType, DoubleType, StringType
  return StructType([
    StructField("MMSI", StringType(), False),
    StructField("BaseDateTime", TimestampType(), False),
    StructField("LAT", DoubleType(), False),
    StructField("LON", DoubleType(), False),
  ])
  
def get_port_schema():
  """Build the Spark schema describing the port reference columns.

  Returns:
    A ``StructType`` with five non-nullable fields, in this order:
    ``UNLOCODE``, ``NAME``, ``STATE`` (all string), then ``LAT`` and
    ``LON`` (both double).
  """
  # Import only the types this schema actually uses.
  from pyspark.sql.types import StructType, StructField, DoubleType, StringType
  return StructType([
    StructField("UNLOCODE", StringType(), False),
    StructField("NAME", StringType(), False),
    StructField("STATE", StringType(), False),
    StructField("LAT", DoubleType(), False),
    StructField("LON", DoubleType(), False),
  ])

In [35]:
# Inputs: one AIS parquet part file (year=2024/month=1 partition) and the port
# reference CSV, whose first row is a header.
ais_path = "gs://vessel-traffic-parquet-data/ais_data/year=2024/month=1/part-00165-bf9b1b3a-8f7b-4472-905a-079a799f95ae.c000.snappy.parquet"
ports_path = "gs://vessel-traffic-parquet-data/code/ports.csv"

ais_df = spark.read.parquet(ais_path)
port_df = spark.read.option("header", True).csv(ports_path)

In [36]:
# AIS fixes as point geometries (x = LON, y = LAT), aliased "ais" so later cells
# can refer to its columns as "ais.<col>".
sedona_df = ais_df.select(
    "MMSI",
    "BaseDateTime",
    ST_Point(F.col("LON"), F.col("LAT")).alias("coord"),
).alias("ais")

# port_df is loaded from CSV without a schema, so LON/LAT arrive as strings.
# Cast them explicitly instead of depending on implicit coercion inside ST_Point.
port_sedona = port_df.select(
    "UNLOCODE",
    ST_Point(F.col("LON").cast("double"), F.col("LAT").cast("double")).alias("coord"),
).alias("port")

In [37]:
# Pair each AIS fix with every port whose buffered zone contains it. The small
# port table is broadcast to the executors for the join.
# NOTE(review): 3500 with the spheroid flag set is presumably a radius in metres;
# confirm against the Sedona ST_Buffer documentation.
port_zone = ST_Buffer(F.col("port.coord"), 3500, True)
inside_zone = ST_Contains(port_zone, F.col("ais.coord"))
join_df = sedona_df.join(F.broadcast(port_sedona), inside_zone)
join_df.cache()

25/08/23 05:56:58 WARN CacheManager: Asked to cache already cached data.


DataFrame[MMSI: string, BaseDateTime: timestamp, coord: udt, UNLOCODE: string, coord: udt]

In [38]:
# Rank the candidate ports of each AIS fix (MMSI + BaseDateTime) by distance,
# nearest first. UNLOCODE is a secondary sort key so that an exact distance tie
# is resolved the same way on every run instead of arbitrarily.
windowSpec = (
    Window
    .partitionBy("MMSI", "BaseDateTime")
    .orderBy(F.col("d").asc(), F.col("UNLOCODE").asc())
)

# NOTE: this rebinds join_df to the reduced (MMSI, BaseDateTime, UNLOCODE) frame.
# Re-running this cell without first re-running the spatial-join cell fails,
# because "ais.coord" / "port.coord" no longer exist on the rebound frame.
join_df = (
    join_df
    .select(
        "MMSI",
        "BaseDateTime",
        "UNLOCODE",
        ST_DistanceSpheroid(F.col("ais.coord"), F.col("port.coord")).alias("d"),
    )
    .withColumn("r", F.row_number().over(windowSpec))
    .filter(F.col("r") == 1)  # keep only the nearest port per fix
    .select("MMSI", "BaseDateTime", "UNLOCODE")
    .alias("join")
)

In [43]:
# Attach the nearest-port assignment back onto every AIS fix. The left outer
# join keeps fixes that matched no port, with a NULL UNLOCODE.
same_fix = (
    (F.col("ais.MMSI") == F.col("join.MMSI"))
    & (F.col("ais.BaseDateTime") == F.col("join.BaseDateTime"))
)
final_df = sedona_df.join(join_df, on=same_fix, how="leftouter")

# Number of AIS fixes per port, largest first.
port_counts = (
    final_df
    .select("ais.MMSI", "ais.BaseDateTime", "join.UNLOCODE")
    .groupBy("UNLOCODE")
    .agg(F.count("*").alias("c"))
    .sort(F.desc("c"))
)
port_counts.show()



+--------+-------+
|UNLOCODE|      c|
+--------+-------+
|    NULL|4844318|
|   USSEA|  71897|
|   USLGB|  26871|
|   USBTR|  24890|
|   USPEF|  24518|
|   USTPA|  21369|
|   USTIW|  20048|
|   USLU8|  19040|
|   USCRP|  18711|
|   USMSY|  15117|
|   USHOU|  13077|
|   USRCH|   9009|
|   USNYC|   7614|
|   USHR3|   4858|
|   USBPT|   4658|
|   USPDX|    638|
+--------+-------+



                                                                                

In [3]:
# Load one part file from the time_analysis_data dataset (year=2024/month=3 partition).
time_analysis_path = "gs://vessel-traffic-parquet-data/time_analysis_data/year=2024/month=3/part-00000-a985d02e-8192-4133-938f-2dbaf1e87257.c000.snappy.parquet"
test_df = spark.read.parquet(time_analysis_path)

                                                                                

In [None]:
# Preview the loaded frame; the row limit and truncation are show()'s defaults, spelled out.
test_df.show(n=20, truncate=True)