In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [0]:
# Load the `silver` table that every later cell builds on; it is only read here.
SILVER_TABLE = "nyc_taxi.nyc_schema.silver"
df_silver = spark.table(SILVER_TABLE)

In [0]:
# Window over each VendorID's trips in pickup-time order.
# NOTE(review): the variable is called "driver", but the partition key is VendorID;
# if VendorID identifies a vendor rather than an individual driver, consecutive rows
# may belong to different vehicles -- confirm the intended grain.
# The drop-off time is a secondary sort key so that trips sharing the same pickup
# timestamp get a stable order; ordering on pickup time alone leaves lag() free to
# pick either neighbour on a tie, so results could differ between runs.
driver_window = (
    Window.partitionBy("VendorID")
    .orderBy("lpep_pickup_datetime", "lpep_dropoff_datetime")
)

In [0]:
# For each trip, look one row back in the window to fetch the drop-off time of the
# preceding trip; the first row of every partition has no predecessor and gets NULL.
previous_dropoff = lag("lpep_dropoff_datetime", 1).over(driver_window)
df_with_lag = df_silver.withColumn("prev_dropoff_time", previous_dropoff)

In [0]:
# Gap between the previous drop-off and this pickup, converted from seconds to minutes.
# Rows without a previous drop-off time produce NULL here.
pickup_epoch = unix_timestamp("lpep_pickup_datetime")
prev_dropoff_epoch = unix_timestamp("prev_dropoff_time")
df_with_wait = df_with_lag.withColumn(
    "wait_minutes", (pickup_epoch - prev_dropoff_epoch) / 60
)

In [0]:
# Keep only rows with a usable wait time: non-null and within 0..360 minutes.
# A gap longer than 6 hours is assumed to mark the start of a new shift and is
# treated as an outlier; negative gaps are dropped as well.
MAX_WAIT_MINUTES = 360
wait_col = col("wait_minutes")
df_valid = df_with_wait.filter(
    wait_col.isNotNull() & wait_col.between(0, MAX_WAIT_MINUTES)
)

In [0]:
# Daily summary per VendorID: mean wait and number of rows that passed the filter.
trips_by_day = df_valid.withColumn("trip_date", to_date("lpep_pickup_datetime"))
daily_stats = trips_by_day.groupBy("VendorID", "trip_date").agg(
    avg("wait_minutes").alias("avg_wait_minutes"),
    # Counts rows that survived the wait-time filter, not every trip in the source table.
    count("*").alias("trip_count"),
)
df_driver_wait = daily_stats.orderBy("trip_date", "VendorID")

display(df_driver_wait)

In [0]:
# Persist the daily summary, replacing any existing contents of the target table.
(
    df_driver_wait.write
    .mode("overwrite")
    .saveAsTable("nyc_taxi.nyc_schema.driver_wait_time_analysis")
)

In [0]:
%sql
-- Read back the saved wait-time analysis table to inspect its contents.
SELECT *
FROM nyc_taxi.nyc_schema.driver_wait_time_analysis