In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Set-1").getOrCreate()
spark

**Data Ingestion & Schema Analysis**

In [2]:
#Load CSV using PySpark with schema inference
from google.colab import drive
drive.mount('/content/drive')

df_traffic = spark.read.csv('/content/drive/MyDrive/traffic_logs.csv',header = True,inferSchema = True)
df_traffic.show()

Mounted at /content/drive
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|      55|      30|
| L004|     V004|     GateC|    GateD|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|      80|      50|
| L005|     V005|     GateB|    GateA|2024-05-01 10:05:00|2024-05-01 10:40:00|        Bus|      40|      70|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+---

In [3]:
# Manually define schema and compare
from pyspark.sql.types import *

schema = StructType([
    StructField("LogID",StringType(), True),
    StructField("VehicleID",StringType(), True),
    StructField("EntryPoint",StringType(), True),
    StructField("ExitPoint",StringType(), True),
    StructField("EntryTime",TimestampType(), True),
    StructField("ExitTime",TimestampType(), True),
    StructField("VehicleType",StringType(), True),
    StructField("SpeedKMH",DoubleType(),True),
    StructField("TollPaid",DoubleType(),True)
])

df_traffic = spark.read.csv('/content/drive/MyDrive/traffic_logs.csv',header = True, schema = schema)
df_traffic.printSchema()

root
 |-- LogID: string (nullable = true)
 |-- VehicleID: string (nullable = true)
 |-- EntryPoint: string (nullable = true)
 |-- ExitPoint: string (nullable = true)
 |-- EntryTime: timestamp (nullable = true)
 |-- ExitTime: timestamp (nullable = true)
 |-- VehicleType: string (nullable = true)
 |-- SpeedKMH: double (nullable = true)
 |-- TollPaid: double (nullable = true)



In [5]:
# Ensure EntryTime/ExitTime are timestamp
from pyspark.sql.functions import col,to_timestamp

df_traffic = df_traffic.withColumn("EntryPoint",to_timestamp(col("EntryPoint"),"yyyy-mm-dd HH:mm:ss")).\
withColumn("ExitPoint",to_timestamp(col("ExitPoint"),"yyyy-mm-dd HH:mm:ss"))
df_traffic.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
| L001|     V001|      NULL|     NULL|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|    60.0|    50.0|
| L002|     V002|      NULL|     NULL|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|    45.0|   100.0|
| L003|     V003|      NULL|     NULL|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|    55.0|    30.0|
| L004|     V004|      NULL|     NULL|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|    80.0|    50.0|
| L005|     V005|      NULL|     NULL|2024-05-01 10:05:00|2024-05-01 10:40:00|        Bus|    40.0|    70.0|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+



**Derived Column Creation**

In [21]:
# Calculate TripDurationMinutes = ExitTime - EntryTime
from pyspark.sql.functions import unix_timestamp

df_drived = df_traffic.withColumn("TripDurationMinutes",(unix_timestamp("ExitTime") - unix_timestamp("EntryTime"))/60)
df_drived.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+
| L001|     V001|      NULL|     NULL|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|    60.0|    50.0|               19.0|
| L002|     V002|      NULL|     NULL|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|    45.0|   100.0|               35.0|
| L003|     V003|      NULL|     NULL|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|    55.0|    30.0|               18.0|
| L004|     V004|      NULL|     NULL|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|    80.0|    50.0|               20.0|
| L005|     V005|      NULL|     NULL|2024-05-01 10:05:00|2024-05-01 10:40:00|        Bus|    40.

In [22]:
# Add IsOverspeed = SpeedKMH > 60
from pyspark.sql.functions import col, when

df_drived = df_drived.withColumn("IsOverspeed",when(col("SpeedKMH") > 60, True).otherwise(False))
df_drived.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+
| L001|     V001|      NULL|     NULL|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|    60.0|    50.0|               19.0|      false|
| L002|     V002|      NULL|     NULL|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|    45.0|   100.0|               35.0|      false|
| L003|     V003|      NULL|     NULL|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|    55.0|    30.0|               18.0|      false|
| L004|     V004|      NULL|     NULL|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|    80.0|    50.0|               20.0|       true|
| L005|     V

**Vehicle Behavior Aggregations**

In [10]:
# Average speed per VehicleType
from pyspark.sql.functions import avg

df_traffic.groupBy("VehicleType").agg(avg("SpeedKMH").alias("AverageSpeed")).show()

+-----------+------------+
|VehicleType|AverageSpeed|
+-----------+------------+
|       Bike|        55.0|
|        Car|        70.0|
|      Truck|        45.0|
|        Bus|        40.0|
+-----------+------------+



In [11]:
# Total toll collected per gate (EntryPoint)
from pyspark.sql.functions import sum

df_traffic.groupBy("EntryPoint").agg(sum("TollPaid").alias("TotalToll")).show()

+----------+---------+
|EntryPoint|TotalToll|
+----------+---------+
|      NULL|    300.0|
+----------+---------+



In [14]:
# Most used ExitPoint
from pyspark.sql.functions import count

df_traffic.groupBy("ExitPoint").agg(count('*').alias("ExitPoint")).show()

+---------+---------+
|ExitPoint|ExitPoint|
+---------+---------+
|     NULL|        5|
+---------+---------+



**Window Functions**

In [16]:
# Rank vehicles by speed within VehicleType
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank, col

speed_window = Window.partitionBy("VehicleType").orderBy(col("SpeedKMH").desc())
df_ranked = df_traffic.withColumn("SpeedRank",dense_rank().over(speed_window))
df_ranked.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+---------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|SpeedRank|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+---------+
| L003|     V003|      NULL|     NULL|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|    55.0|    30.0|        1|
| L005|     V005|      NULL|     NULL|2024-05-01 10:05:00|2024-05-01 10:40:00|        Bus|    40.0|    70.0|        1|
| L004|     V004|      NULL|     NULL|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|    80.0|    50.0|        1|
| L001|     V001|      NULL|     NULL|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|    60.0|    50.0|        2|
| L002|     V002|      NULL|     NULL|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|    45.0|   100.0|        1|
+-----+---------+----------+---------+----------

In [17]:
# Find last exit time for each vehicle using lag()
from pyspark.sql.functions import lag

exit_window = Window.partitionBy("VehicleID").orderBy("ExitTime")
df_lagged = df_traffic.withColumn("PreviousExitTime", lag("ExitTime").over(exit_window))
df_lagged.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+----------------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|PreviousExitTime|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+----------------+
| L001|     V001|      NULL|     NULL|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|    60.0|    50.0|            NULL|
| L002|     V002|      NULL|     NULL|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|    45.0|   100.0|            NULL|
| L003|     V003|      NULL|     NULL|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|    55.0|    30.0|            NULL|
| L004|     V004|      NULL|     NULL|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|    80.0|    50.0|            NULL|
| L005|     V005|      NULL|     NULL|2024-05-01 10:05:00|2024-05-01 10:40:00|        Bus|    40.0|    70.0|          

**Session Segmentation**

In [18]:
# Group by VehicleID to simulate route sessions
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, col

session_window = Window.partitionBy("VehicleID").orderBy("EntryTime")
df_session = df_traffic.withColumn("PreviousExitTime", lag("ExitTime").over(session_window))
df_session.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+----------------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|PreviousExitTime|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+----------------+
| L001|     V001|      NULL|     NULL|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|    60.0|    50.0|            NULL|
| L002|     V002|      NULL|     NULL|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|    45.0|   100.0|            NULL|
| L003|     V003|      NULL|     NULL|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|    55.0|    30.0|            NULL|
| L004|     V004|      NULL|     NULL|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|    80.0|    50.0|            NULL|
| L005|     V005|      NULL|     NULL|2024-05-01 10:05:00|2024-05-01 10:40:00|        Bus|    40.0|    70.0|          

In [19]:
# Find duration between subsequent trips (idle time)
from pyspark.sql.functions import unix_timestamp

df_session = df_session.withColumn("IdleTimeMinutes", (unix_timestamp("EntryTime") - unix_timestamp("PreviousExitTime")) / 60)
df_session.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+----------------+---------------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|PreviousExitTime|IdleTimeMinutes|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+----------------+---------------+
| L001|     V001|      NULL|     NULL|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|    60.0|    50.0|            NULL|           NULL|
| L002|     V002|      NULL|     NULL|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|    45.0|   100.0|            NULL|           NULL|
| L003|     V003|      NULL|     NULL|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|    55.0|    30.0|            NULL|           NULL|
| L004|     V004|      NULL|     NULL|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|    80.0|    50.0|            NULL|           NULL|
| L005

**Anomaly Detection**

In [26]:
# Identify vehicles with speed > 70 and TripDuration < 10 minutes
df_drived.filter((col("SpeedKMH") > 70) & (col("TripDurationMinutes") < 10)).show()

+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+
|LogID|VehicleID|EntryPoint|ExitPoint|EntryTime|ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|
+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+
+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+



In [28]:
# Vehicles that paid less toll for longer trips

df_drived.filter((col("TripDurationMinutes") > 30) & (col("TollPaid") < 40)).show()

+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+
|LogID|VehicleID|EntryPoint|ExitPoint|EntryTime|ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|
+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+
+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+



In [30]:
# Suspicious backtracking (ExitPoint earlier than EntryPoint)

df_drived.filter(col("ExitPoint") < col("EntryPoint")).show()

+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+
|LogID|VehicleID|EntryPoint|ExitPoint|EntryTime|ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|
+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+
+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+



**Join with Metadata**

In [31]:
# Prepare this small vehicle_registry.csv :
# VehicleID,OwnerName,Model,RegisteredCity
# V001,Anil,Hyundai i20,Delhi
# V002,Rakesh,Tata Truck,Chennai
# V003,Sana,Yamaha R15,Mumbai
# V004,Neha,Honda City,Bangalore
# V005,Zoya,Volvo Bus,Pune

df_reg = spark.read.csv('/content/drive/MyDrive/vehicle_registry.csv',header = True,inferSchema = True)
df_reg.show()

+---------+---------+-----------+--------------+
|VehicleID|OwnerName|      Model|RegisteredCity|
+---------+---------+-----------+--------------+
|     V001|     Anil|Hyundai i20|         Delhi|
|     V002|   Rakesh| Tata Truck|       Chennai|
|     V003|     Sana| Yamaha R15|        Mumbai|
|     V004|     Neha| Honda City|     Bangalore|
|     V005|     Zoya|  Volvo Bus|          Pune|
+---------+---------+-----------+--------------+



In [32]:
# Join and group trips by RegisteredCity
df_joined = df_drived.join(df_reg, on= "VehicleID",how = "inner")

from pyspark.sql.functions import count
df_joined.groupBy("RegisteredCity").agg(count("*").alias("TripCount")).show()

+--------------+---------+
|RegisteredCity|TripCount|
+--------------+---------+
|     Bangalore|        1|
|       Chennai|        1|
|        Mumbai|        1|
|          Pune|        1|
|         Delhi|        1|
+--------------+---------+



**Delta Lake Features**

In [None]:
# Save traffic_logs as Delta Table

df_drived.write.format("delta").mode("overwrite").save("/mnt/datalake/traffic_logs_delta")

In [None]:
# Apply MERGE INTO to update toll rates for all Bikes
from delta.tables import DeltaTable
from pyspark.sql.functions import lit

bike_updates = df_drived.filter(col("VehicleType") == "Bike").withColumn("TollPaid", lit(40))

delta_table = DeltaTable.forName(spark, "traffic_logs")

# Merge operation
delta_table.alias("target").merge(
    bike_updates.alias("source"),
    "target.LogID = source.LogID"
).whenMatchedUpdate(set={"TollPaid": "source.TollPaid"}).execute()

In [None]:
# Delete trips longer than 60 minutes

delta_table.delete("TripDurationMinutes > 60")

In [None]:
# Use DESCRIBE HISTORY and VERSION AS OF

df_old = spark.read.format("delta").option("versionAsOf", 0).table("traffic_logs")
df_old.show()

**Advanced Conditions**


In [34]:
# when/otherwise : Tag trip type as:
# "Short" <15min
# "Medium" 15-30min
# "Long" >30min
from pyspark.sql.functions import when

df_tagged = df_drived.withColumn(
    "TripType",when(col("TripDurationMinutes") < 15, "Short").when((col("TripDurationMinutes") >= 15) & (col("TripDurationMinutes") <= 30), "Medium")
    .otherwise("Long")
)

In [42]:
# Flag vehicles with more than 3 trips in a day
from pyspark.sql.functions import to_date
from pyspark.sql.functions import count
from pyspark.sql.window import Window

df_datewise = df_tagged.withColumn("TripDate", to_date(col("EntryTime")))

trip_window = Window.partitionBy("VehicleID", "TripDate")

df_flagged = df_datewise.withColumn("TripCountPerDay", count("*").over(trip_window)).withColumn(
    "FrequentFlag", when(col("TripCountPerDay") > 3, True).otherwise(False)
)

In [43]:
df_flagged.write.mode("overwrite").parquet("/content/drive/MyDrive/enriched_traffic_logs_parquet")

In [44]:
df_flagged.write.option("header", True).mode("overwrite").csv("/content/drive/MyDrive/enriched_traffic_logs_csv")

In [45]:
df_flagged.createOrReplaceTempView("enriched_traffic_view")

spark.sql("""
    CREATE OR REPLACE TEMP VIEW toll_summary AS
    SELECT VehicleType, ExitPoint, SUM(TollPaid) AS TotalToll
    FROM enriched_traffic_view
    GROUP BY VehicleType, ExitPoint
""")

spark.sql("SELECT * FROM toll_summary ORDER BY TotalToll DESC").show()

+-----------+---------+---------+
|VehicleType|ExitPoint|TotalToll|
+-----------+---------+---------+
|        Car|     NULL|    100.0|
|      Truck|     NULL|    100.0|
|        Bus|     NULL|     70.0|
|       Bike|     NULL|     30.0|
+-----------+---------+---------+

