In [1]:
import ConnectionConfigKaloyan as cc
from delta import DeltaTable
# Run the environment setup provided by the project's connection-config module
# before any Spark session is started. What exactly it configures is defined in
# ConnectionConfigKaloyan and is not visible from this notebook.
cc.setupEnvironment()

In [2]:
# Name passed to the cluster helper for this notebook's session.
APP_NAME = "JSON_PREP"

# Start the local cluster through the project helper; the returned object is used
# as the Spark session (`spark.read`, `spark.getActiveSession`) in the cells below.
spark = cc.startLocalCluster(APP_NAME)

# Last expression of the cell: displays the currently active session.
spark.getActiveSession()

In [3]:
# Select the "default" connection profile; the cc.* lookups below read from it.
cc.set_connectionProfile("default")

# Load the "rides" table over JDBC as a partitioned read on `rideid`.
# Per the Spark JDBC documentation, lowerBound/upperBound only determine the
# partition stride for partitionColumn; they do not filter rows.
ride_src_df = (
    spark.read.format("jdbc")
    .options(
        url=cc.create_jdbc(),
        driver=cc.get_Property("driver"),
        dbtable="rides",
        user=cc.get_Property("username"),
        password=cc.get_Property("password"),
        partitionColumn="rideid",
        numPartitions=4,
        lowerBound=0,
        upperBound=1000,
    )
    .load()
)

In [4]:
# Keep only the ride columns that the later cells work with, then preview five rows.
ride_columns = ["rideid", "starttime", "endtime", "startpoint", "endpoint", "vehicleid"]
selected_rides_df = ride_src_df.select(*ride_columns)
selected_rides_df.show(n=5)

+------+-------------------+-------------------+-----------------+-----------------+---------+
|rideid|          starttime|            endtime|       startpoint|         endpoint|vehicleid|
+------+-------------------+-------------------+-----------------+-----------------+---------+
|     1|2015-09-22 00:00:00|2012-09-22 00:00:00|(51.2083,4.44595)|(51.1938,4.40228)|      844|
|     2|2015-09-22 00:00:00|2012-09-22 00:00:00|(51.2174,4.41597)|(51.2188,4.40935)|     4545|
|     3|2015-09-22 00:00:00|2012-09-22 00:00:00|(51.2088,4.40834)|(51.2077,4.39846)|     3419|
|     4|2015-09-22 00:00:00|2012-09-22 00:00:00|(51.2023,4.41208)|(51.2119,4.39894)|     1208|
|     5|2015-09-22 00:00:00|2012-09-22 00:00:00|(51.1888,4.45039)|(51.2221,4.40467)|     5536|
+------+-------------------+-------------------+-----------------+-----------------+---------+
only showing top 5 rows



In [5]:
# Filter by time interval (e.g., first week of July 2023)
from pyspark.sql.functions import col

# Half-open interval [start, end) on `starttime`.
# The upper bound is exclusive, so it must be the day AFTER the last wanted day:
# "2023-07-08" keeps all seven days (1-7 July), whereas "2023-07-07" would
# silently drop every ride that starts on 7 July.
INTERVAL_START = "2023-07-01"
INTERVAL_END_EXCLUSIVE = "2023-07-08"

filtered_rides_df = selected_rides_df.filter(
    (col("starttime") >= INTERVAL_START)
    & (col("starttime") < INTERVAL_END_EXCLUSIVE)
)

In [6]:
# Load the "vehicles" table over JDBC as a partitioned read on `vehicleid`.
# Connection settings come from the cc helper (profile selected in an earlier cell).
# Per the Spark JDBC documentation, lowerBound/upperBound only determine the
# partition stride; rows outside the range are still read.
# NOTE(review): the rides preview above shows vehicle ids well above 1000
# (e.g. 4545, 5536), so most rows probably land in the last partition --
# confirm the real id range and adjust the bounds if the skew matters.
vehicle_df = (
    spark.read.format("jdbc")
    .options(
        url=cc.create_jdbc(),
        driver=cc.get_Property("driver"),
        dbtable="vehicles",
        user=cc.get_Property("username"),
        password=cc.get_Property("password"),
        partitionColumn="vehicleid",
        numPartitions=4,
        lowerBound=0,
        upperBound=1000,
    )
    .load()
)

In [7]:
# Attach vehicle attributes to each filtered ride, matching on `vehicleid`.
# A left join keeps rides even when no matching vehicle row exists.
rides_with_vehicle = filtered_rides_df.join(
    vehicle_df,
    on=["vehicleid"],
    how="left",
)

In [9]:
# Step 3: Nest vehicle fields using struct
from pyspark.sql.functions import col, struct

# Ride-level columns stay at the top level of each record.
ride_fields = [
    col(name)
    for name in ("rideid", "starttime", "endtime", "startpoint", "endpoint")
]

# Vehicle columns are grouped into a single nested column named "vehicle".
vehicle_fields = [
    col(name)
    for name in ("vehicleid", "serialnumber", "lastmaintenanceon", "position")
]
vehicle_struct = struct(*vehicle_fields).alias("vehicle")

rides_nested = rides_with_vehicle.select(*ride_fields, vehicle_struct)

In [10]:
# Write the nested rides as JSON files (intended for a MongoDB import).
# "overwrite" replaces any existing output at the target path.
rides_nested.write.json("rides_json_output", mode="overwrite")