In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("SmartCityTrafficMonitoring") \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()

1 Data Ingestion & Schema Analysis

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, DoubleType

spark = SparkSession.builder.appName("TrafficMonitoringSystem").getOrCreate()

traffic_df_inferred = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/traffic_logs.csv")

manual_schema = StructType([
    StructField("LogID", StringType(), True),
    StructField("VehicleID", StringType(), True),
    StructField("EntryPoint", StringType(), True),
    StructField("ExitPoint", StringType(), True),
    StructField("EntryTime", TimestampType(), True),
    StructField("ExitTime", TimestampType(), True),
    StructField("VehicleType", StringType(), True),
    StructField("SpeedKMH", IntegerType(), True),
    StructField("TollPaid", DoubleType(), True),
])

traffic_df = spark.read.option("header", True).schema(manual_schema).csv("file:/Workspace/Shared/traffic_logs.csv")


2 Derived Column Creation

In [0]:
from pyspark.sql.functions import col, unix_timestamp, when

#Calculate TripDurationMinutes = ExitTime - EntryTime
traffic_df = traffic_df.withColumn("TripDurationMinutes", 
    (unix_timestamp("ExitTime") - unix_timestamp("EntryTime")) / 60)
    
#Add IsOverspeed = SpeedKMH > 60
traffic_df = traffic_df.withColumn("IsOverspeed", col("SpeedKMH") > 60)

3 Vehicle Behavior Aggregations

In [0]:
from pyspark.sql.functions import avg, sum, count, desc

# Avg speed per VehicleType
traffic_df.groupBy("VehicleType").agg(avg("SpeedKMH").alias("AvgSpeed")).show()

# Total toll collected by EntryPoint
traffic_df.groupBy("EntryPoint").agg(sum("TollPaid").alias("TotalToll")).show()

# Most used ExitPoint
traffic_df.groupBy("ExitPoint").agg(count("*").alias("Count")).orderBy(desc("Count")).show(1)

+-----------+--------+
|VehicleType|AvgSpeed|
+-----------+--------+
|      Truck|    45.0|
|       Bike|    55.0|
|        Bus|    40.0|
|        Car|    70.0|
+-----------+--------+

+----------+---------+
|EntryPoint|TotalToll|
+----------+---------+
|     GateC|     50.0|
|     GateA|     80.0|
|     GateB|    170.0|
+----------+---------+

+---------+-----+
|ExitPoint|Count|
+---------+-----+
|    GateC|    2|
+---------+-----+
only showing top 1 row



4 Window Functions

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, lag

# Rank vehicles by speed within VehicleType
windowSpec = Window.partitionBy("VehicleType").orderBy(desc("SpeedKMH"))
traffic_df = traffic_df.withColumn("SpeedRank", rank().over(windowSpec))

# Find last exit time for each vehicle using lag()
windowSpec2 = Window.partitionBy("VehicleID").orderBy("EntryTime")
traffic_df = traffic_df.withColumn("LastExitTime", lag("ExitTime").over(windowSpec2))

traffic_df.select(
    "LogID", "VehicleID", "VehicleType", "SpeedKMH", "SpeedRank",
    "EntryTime", "ExitTime", "LastExitTime"
).show(truncate=False)

+-----+---------+-----------+--------+---------+-------------------+-------------------+------------+
|LogID|VehicleID|VehicleType|SpeedKMH|SpeedRank|EntryTime          |ExitTime           |LastExitTime|
+-----+---------+-----------+--------+---------+-------------------+-------------------+------------+
|L001 |V001     |Car        |60      |2        |2024-05-01 08:01:00|2024-05-01 08:20:00|NULL        |
|L002 |V002     |Truck      |45      |1        |2024-05-01 08:10:00|2024-05-01 08:45:00|NULL        |
|L003 |V003     |Bike       |55      |1        |2024-05-01 09:00:00|2024-05-01 09:18:00|NULL        |
|L004 |V004     |Car        |80      |1        |2024-05-01 09:15:00|2024-05-01 09:35:00|NULL        |
|L005 |V005     |Bus        |40      |1        |2024-05-01 10:05:00|2024-05-01 10:40:00|NULL        |
+-----+---------+-----------+--------+---------+-------------------+-------------------+------------+



5 Session Segmentation

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import unix_timestamp, col, lag

session_window = Window.partitionBy("VehicleID").orderBy("EntryTime")

traffic_df = traffic_df.withColumn("LastExitTime", lag("ExitTime").over(session_window))

traffic_df = traffic_df.withColumn("IdleMinutes", 
    (unix_timestamp("EntryTime") - unix_timestamp("LastExitTime")) / 60)

traffic_df.select(
    "VehicleID", "EntryTime", "ExitTime", "LastExitTime", "IdleMinutes"
).orderBy("VehicleID", "EntryTime").show(truncate=False)

+---------+-------------------+-------------------+------------+-----------+
|VehicleID|EntryTime          |ExitTime           |LastExitTime|IdleMinutes|
+---------+-------------------+-------------------+------------+-----------+
|V001     |2024-05-01 08:01:00|2024-05-01 08:20:00|NULL        |NULL       |
|V002     |2024-05-01 08:10:00|2024-05-01 08:45:00|NULL        |NULL       |
|V003     |2024-05-01 09:00:00|2024-05-01 09:18:00|NULL        |NULL       |
|V004     |2024-05-01 09:15:00|2024-05-01 09:35:00|NULL        |NULL       |
|V005     |2024-05-01 10:05:00|2024-05-01 10:40:00|NULL        |NULL       |
+---------+-------------------+-------------------+------------+-----------+



6 Anomaly Detection

In [0]:
#Identify vehicles with speed > 70 and TripDuration < 10 minutes
anomaly_fast_short = traffic_df.filter(
    (col("SpeedKMH") > 70) & (col("TripDurationMinutes") < 10)
)

anomaly_fast_short.select(
    "VehicleID", "SpeedKMH", "TripDurationMinutes", "EntryTime", "ExitTime"
).show(truncate=False)

#Vehicles that paid less toll for longer trips
anomaly_low_toll_long_trip = traffic_df.filter(
    (col("TripDurationMinutes") > 25) & (col("TollPaid") < 50)
)

anomaly_low_toll_long_trip.select(
    "VehicleID", "TripDurationMinutes", "TollPaid", "EntryPoint", "ExitPoint"
).show(truncate=False)

#Suspicious backtracking (ExitPoint earlier than EntryPoint)
anomaly_backtrack = traffic_df.filter(
    col("ExitPoint") < col("EntryPoint")
)

anomaly_backtrack.select(
    "VehicleID", "EntryPoint", "ExitPoint", "EntryTime", "ExitTime"
).show(truncate=False)

+---------+--------+-------------------+---------+--------+
|VehicleID|SpeedKMH|TripDurationMinutes|EntryTime|ExitTime|
+---------+--------+-------------------+---------+--------+
+---------+--------+-------------------+---------+--------+

+---------+-------------------+--------+----------+---------+
|VehicleID|TripDurationMinutes|TollPaid|EntryPoint|ExitPoint|
+---------+-------------------+--------+----------+---------+
+---------+-------------------+--------+----------+---------+

+---------+----------+---------+-------------------+-------------------+
|VehicleID|EntryPoint|ExitPoint|EntryTime          |ExitTime           |
+---------+----------+---------+-------------------+-------------------+
|V005     |GateB     |GateA    |2024-05-01 10:05:00|2024-05-01 10:40:00|
+---------+----------+---------+-------------------+-------------------+



7 Join with Metadata

In [0]:
vehicle_registry_schema = StructType([
    StructField("VehicleID", StringType(), True),
    StructField("OwnerName", StringType(), True),
    StructField("Model", StringType(), True),
    StructField("RegisteredCity", StringType(), True),
])

registry_df = spark.read.option("header", True).schema(vehicle_registry_schema).csv("file:/Workspace/Shared/vehicle_registry.csv")

# Join and group trips by RegisteredCity
enriched_df = traffic_df.join(registry_df, "VehicleID", "left")
enriched_df.groupBy("RegisteredCity").agg(count("*").alias("TotalTrips")).show()

+--------------+----------+
|RegisteredCity|TotalTrips|
+--------------+----------+
|     Bangalore|         1|
|          Pune|         1|
|         Delhi|         1|
|       Chennai|         1|
|        Mumbai|         1|
+--------------+----------+



8 Delta Lake Features

In [0]:
from delta.tables import DeltaTable

# Save traffic_logs as Delta Table
enriched_df.write.format("delta").mode("overwrite").save("/tmp/traffic_logs_delta")

delta_df = DeltaTable.forPath(spark, "/tmp/traffic_logs_delta")

# Apply MERGE INTO to update toll rates for all Bikes
delta_df.alias("target").merge(
    enriched_df.filter(col("VehicleType") == "Bike").alias("source"),
    "target.LogID = source.LogID"
).whenMatchedUpdate(set={"TollPaid": "35"}).execute()

# Delete trips longer than 60 minutes
delta_df.delete("TripDurationMinutes > 60")

# Use DESCRIBE HISTORY and VERSION AS OF
spark.sql("DESCRIBE HISTORY delta.`/tmp/traffic_logs_delta`").show()
spark.read.format("delta").option("versionAsOf", 0).load("/tmp/traffic_logs_delta").show()

+-------+-------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|          userId|            userName|operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      3|2025-06-19 05:02:13|4028198190791787|azuser3553_mml.lo...| OPTIMIZE|{predicate -> [],...|NULL|{1052067078041120}|0611-043506-43vn1hs6|          1|SnapshotIsolation|        false|{numRemovedFiles ...|        NULL|Databricks-Runtim...|
|      2|2025-06-19 05:02:11

9 Advanced Conditions

In [0]:
from pyspark.sql.functions import when

# when/otherwise : Tag trip type
enriched_df = enriched_df.withColumn("TripType", 
    when(col("TripDurationMinutes") < 15, "Short")
    .when(col("TripDurationMinutes") <= 30, "Medium")
    .otherwise("Long"))

#Flag vehicles with more than 3 trips in a day
from pyspark.sql.functions import to_date

daily_trips = enriched_df.withColumn("TripDate", to_date("EntryTime"))
frequent_vehicles = daily_trips.groupBy("VehicleID", "TripDate").count().filter("count > 3")
frequent_vehicles.show()

+---------+--------+-----+
|VehicleID|TripDate|count|
+---------+--------+-----+
+---------+--------+-----+



10 Export & Reporting

In [0]:
# Parquet partitioned by VehicleType
enriched_df.write.partitionBy("VehicleType").mode("overwrite").parquet("output/vehicle_partitioned")

#CSV for dashboards
enriched_df.write.mode("overwrite").csv("output/traffic_summary_csv", header=True)

# Create summary SQL View: total toll by VehicleType + ExitPoint
enriched_df.createOrReplaceTempView("traffic_view")
spark.sql("""
    SELECT VehicleType, ExitPoint, SUM(TollPaid) AS TotalToll
    FROM traffic_view
    GROUP BY VehicleType, ExitPoint
""").show()


+-----------+---------+---------+
|VehicleType|ExitPoint|TotalToll|
+-----------+---------+---------+
|      Truck|    GateC|    100.0|
|        Car|    GateC|     50.0|
|        Car|    GateD|     50.0|
|       Bike|    GateD|     30.0|
|        Bus|    GateA|     70.0|
+-----------+---------+---------+

