In [1]:
import math
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, expr, to_timestamp, unix_timestamp, to_date, concat, lit, format_string, when, from_utc_timestamp
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [2]:
# Build (or reuse) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName("GTFSRealtimeVPProcessing")
    .master("local[*]")
    # Session time zone applied by Spark SQL when handling timestamps.
    .config("spark.sql.session.timeZone", "America/New_York")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/21 09:55:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/21 09:55:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/08/21 09:55:35 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/08/21 09:55:35 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
# # Schema of vehicle position data
# vehicle_schema = StructType([
#     StructField("header", StructType([
#         StructField("gtfsRealtimeVersion", StringType()),
#         StructField("timestamp", StringType())
#     ])),
#     StructField("entity", ArrayType(StructType([
#         StructField("id", StringType()),
#         StructField("vehicle", StructType([
#             StructField("trip", StructType([
#                 StructField("tripId", StringType()),
#                 StructField("routeId", StringType()),
#                 StructField("startDate", StringType())
#             ])),
#             StructField("position", StructType([
#                 StructField("latitude", DoubleType()),
#                 StructField("longitude", DoubleType())
#             ])),
#             StructField("timestamp", StringType())
#         ]))
#     ])))
# ])







# Schema used to parse the vehicle-position JSON payload. Only the fields that
# the cells below read are declared.
vehicle_schema = (
    StructType()
    .add("header", StructType().add("gtfsRealtimeVersion", StringType()))
    .add(
        "entity",
        ArrayType(
            StructType()
            .add("id", StringType())
            .add(
                "vehicle",
                StructType()
                .add(
                    "trip",
                    StructType()
                    .add("tripId", StringType())
                    .add("routeId", StringType())
                    .add("startDate", StringType()),
                )
                .add(
                    "position",
                    StructType()
                    .add("latitude", DoubleType())
                    .add("longitude", DoubleType()),
                )
                # Declared as a string; it is cast to long and converted to a
                # timestamp when vp_df is built.
                .add("timestamp", StringType()),
            )
        ),
    )
)

In [4]:
# ClickHouse connection details.
# The URL, user and password can be overridden through environment variables so
# that real credentials never need to be committed with the notebook. The
# fallbacks are the values this notebook previously hard-coded, so it keeps
# running unchanged when no variable is set.
clickhouse_url = os.environ.get("CLICKHOUSE_JDBC_URL", "jdbc:clickhouse://clickhouse:8123")
clickhouse_properties = {
    "user": os.environ.get("CLICKHOUSE_USER", "default"),
    "password": os.environ.get("CLICKHOUSE_PASSWORD", "123"),
    "driver": "com.clickhouse.jdbc.ClickHouseDriver",
    # Spark JDBC write option: transaction isolation level for the connection.
    "isolationLevel": "NONE",
}

In [5]:
# Streaming source: raw Kafka records from the vehicle-positions topic.
vp_raw_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("subscribe", "gtfs-vehicle-positions")
    .option("startingOffsets", "earliest")
    .load()
)

In [6]:
# vp_kafka_df = vp_raw_df.selectExpr("CAST(value AS STRING) AS json_str","topic")
# vp_df = vp_kafka_df.select(from_json(col("json_str"), vehicle_schema).alias("data")) \
#     .select("data.*") \
#     .withColumn("header_timestamp", to_timestamp(col("header.timestamp").cast("long"))) \
#     .withColumn("vehicle_timestamp", to_timestamp(col("entity.vehicle.timestamp").cast("long"))) \
#     .withWatermark("header_timestamp", "2 minutes")



# Expose the Kafka record value as a JSON string column.
vp_kafka_df = vp_raw_df.selectExpr("CAST(value AS STRING) AS json_str")

# Parse the JSON with vehicle_schema, produce one row per element of the
# `entity` array, and derive a timestamp column from the vehicle timestamp.
vp_df = (
    vp_kafka_df
    .select(from_json(col("json_str"), vehicle_schema).alias("data"))
    .select(
        col("data.header.gtfsRealtimeVersion").alias("gtfs_version"),
        expr("explode(data.entity) as entity"),
    )
    .withColumn(
        "vehicle_timestamp",
        # Convert only when the string casts to a long; otherwise keep NULL.
        # NOTE(review): the long value is presumably epoch seconds - confirm.
        when(
            col("entity.vehicle.timestamp").cast("long").isNotNull(),
            to_timestamp(col("entity.vehicle.timestamp").cast("long")),
        ).otherwise(lit(None)),
    )
    .withWatermark("vehicle_timestamp", "2 minutes")
)



In [7]:
# vp_exploded_df = vp_df.select(
#     col("header.gtfsRealtimeVersion").alias("gtfs_version"),
#     col("header_timestamp"),
#     expr("explode(entity) as entity"),
#     col("kafka_timestamp")
# ).select(
#     col("gtfs_version"),
#     col("header_timestamp"),
#     col("entity.id").alias("entity_id"),
#     col("entity.vehicle.trip.tripId").alias("vp_trip_id"),
#     col("entity.vehicle.trip.routeId").alias("route_id"),
#     col("entity.vehicle.trip.startDate").alias("vp_start_date"),
#     col("entity.vehicle.position.latitude").alias("latitude"),
#     col("entity.vehicle.position.longitude").alias("longitude"),
#     col("entity.vehicle.timestamp").alias("vehicle_timestamp"),
#     col("kafka_timestamp")
# ).filter(col("vp_trip_id").isNotNull())



# Flatten each entity struct into plain columns and keep only rows that carry
# a trip id.
vp_exploded_df = vp_df.select(
    "gtfs_version",
    *[
        col(f"entity.{field_path}").alias(column_name)
        for field_path, column_name in [
            ("id", "entity_id"),
            ("vehicle.trip.tripId", "vp_trip_id"),
            ("vehicle.trip.routeId", "route_id"),
            ("vehicle.trip.startDate", "vp_start_date"),
            ("vehicle.position.latitude", "latitude"),
            ("vehicle.position.longitude", "longitude"),
        ]
    ],
    "vehicle_timestamp",
).where(col("vp_trip_id").isNotNull())





In [8]:
# Write vehicle positions to ClickHouse
def write_vp_to_clickhouse(batch_df, batch_id):
    """Append one micro-batch of vehicle positions to ClickHouse over JDBC.

    Used as the ``foreachBatch`` callback of the streaming query. Rows are
    appended to table ``vehicle_positions2`` in database ``gtfs_streaming``.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; reported in the error message
            so a failed write can be traced back to a specific batch.

    Returns:
        None. Write failures are caught and printed, not raised.
    """
    try:
        (
            batch_df.write
            .format("jdbc")
            .option("url", f"{clickhouse_url}/gtfs_streaming")
            .option("dbtable", "vehicle_positions2")
            # Forward every configured connection property (user, password,
            # driver, isolationLevel) so the settings dict is the single
            # source of truth; previously isolationLevel was never applied.
            .options(**clickhouse_properties)
            .mode("append")
            .save()
        )
    except Exception as e:
        # Best-effort by design: the error is reported and the stream goes on.
        # NOTE(review): because the exception is swallowed, the streaming query
        # presumably treats this batch as written and will not retry it -
        # confirm that losing a failed batch is acceptable.
        print(f"Vehicle Positions write error (batch {batch_id}): {e}")

In [10]:
# Start the streaming query: on a 30-second processing-time trigger, pass each
# micro-batch to write_vp_to_clickhouse, using the given checkpoint directory.
vp_query = (
    vp_exploded_df.writeStream
    .outputMode("append")
    .foreachBatch(write_vp_to_clickhouse)
    .trigger(processingTime="30 seconds")
    .option("checkpointLocation", "check_points/vehicle_position_checks")
    .start()
)

25/08/21 09:55:44 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/08/21 09:55:45 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
