In [0]:
# Load the KPI-5 base table and preview its first five rows
from pyspark.sql.functions import col

base_table_name = "kpi5_airline_base"
df_base = spark.table(base_table_name)

df_base.show(n=5)


+----------+---------+---------+--------+-------+-----+-----------+--------------------+----------------+----------------+---------------+--------------+
|op_carrier|arr_delay|dep_delay|taxi_out|taxi_in|month|day_of_week|        airline_name|arr_delay_capped|dep_delay_capped|taxi_out_capped|taxi_in_capped|
+----------+---------+---------+--------+-------+-----+-----------+--------------------+----------------+----------------+---------------+--------------+
|        AA|      3.0|      4.0|    29.0|   15.0|    1|          3|   American Airlines|             3.0|             4.0|           29.0|          15.0|
|        AS|     -7.0|    -10.0|    12.0|   11.0|    1|          3|     Alaska Airlines|             0.0|             0.0|           12.0|          11.0|
|        DL|     14.0|      0.0|    56.0|    5.0|    1|          3|     Delta Air Lines|            14.0|             0.0|           56.0|           5.0|
|        EV|    -16.0|     -6.0|    11.0|    6.0|    1|          3| ExpressJ

In [0]:
# Aggregate per-airline delay statistics: flight count plus mean,
# standard deviation and variance of the capped arrival/departure delays
from pyspark.sql.functions import avg, stddev, variance, count

# One output row per (carrier code, airline name) pair
carrier_keys = ["op_carrier", "airline_name"]

# Statistics over the capped arrival delay
arrival_stats = [
    avg("arr_delay_capped").alias("avg_arrival_delay"),
    stddev("arr_delay_capped").alias("stddev_arrival_delay"),
    variance("arr_delay_capped").alias("var_arrival_delay"),
]

# Same three statistics over the capped departure delay
departure_stats = [
    avg("dep_delay_capped").alias("avg_departure_delay"),
    stddev("dep_delay_capped").alias("stddev_departure_delay"),
    variance("dep_delay_capped").alias("var_departure_delay"),
]

# Unpacking keeps the output column order: count, arrival stats, departure stats
df_kpi5b = df_base.groupBy(*carrier_keys).agg(
    count("*").alias("total_flights"),
    *arrival_stats,
    *departure_stats,
)

df_kpi5b.show(5)


+----------+--------------------+-------------+-----------------+--------------------+------------------+-------------------+----------------------+-------------------+
|op_carrier|        airline_name|total_flights|avg_arrival_delay|stddev_arrival_delay| var_arrival_delay|avg_departure_delay|stddev_departure_delay|var_departure_delay|
+----------+--------------------+-------------+-----------------+--------------------+------------------+-------------------+----------------------+-------------------+
|        AA|   American Airlines|       534592|7.035722569735424|   25.95016777321157| 673.4112074578284|  6.761732311744283|    26.315392065228902|  692.4998595467123|
|        AS|     Alaska Airlines|       132217|6.280016941845602|  21.193005732041133|449.14349195832835|    5.6710786056256|    21.429104382209992| 459.20651462365146|
|        DL|     Delta Air Lines|       551516|5.841536782251104|  23.514489741702985| 552.9312278126549|   5.19584019321289|    23.472043765304623|  550.9

In [0]:
# Keep airlines with sufficient data volume for reliable variance.
# The cutoff is a named constant so it is visible and tunable in one place
# instead of being buried in the filter expression.
MIN_FLIGHTS_FOR_RELIABLE_VARIANCE = 500

# Inclusive threshold: airlines with exactly the minimum flight count are kept
df_kpi5b = df_kpi5b.filter(
    col("total_flights") >= MIN_FLIGHTS_FOR_RELIABLE_VARIANCE
)

df_kpi5b.show(5)


+----------+--------------------+-------------+-----------------+--------------------+------------------+-------------------+----------------------+-------------------+
|op_carrier|        airline_name|total_flights|avg_arrival_delay|stddev_arrival_delay| var_arrival_delay|avg_departure_delay|stddev_departure_delay|var_departure_delay|
+----------+--------------------+-------------+-----------------+--------------------+------------------+-------------------+----------------------+-------------------+
|        AA|   American Airlines|       534592|7.035722569735424|   25.95016777321157| 673.4112074578284|  6.761732311744283|    26.315392065228902|  692.4998595467123|
|        AS|     Alaska Airlines|       132217|6.280016941845602|  21.193005732041133|449.14349195832835|    5.6710786056256|    21.429104382209992| 459.20651462365146|
|        DL|     Delta Air Lines|       551516|5.841536782251104|  23.514489741702985| 552.9312278126549|   5.19584019321289|    23.472043765304623|  550.9

In [0]:
# Create consistency score (lower stddev = higher score)
from pyspark.sql.functions import max as spark_max

# Largest arrival-delay standard deviation across the airlines in df_kpi5b.
# It is the normalisation denominator, so the airline with the largest
# spread scores 0 and smaller spreads score closer to 1.
max_stddev = df_kpi5b.agg(
    spark_max("stddev_arrival_delay")
).collect()[0][0]

# Guard the denominator before using it. The aggregate is None when
# df_kpi5b has no rows (or only null stddevs) and 0 when no airline shows any
# variation; dividing by either cannot give a meaningful score (presumably
# null scores, or an error under stricter Spark settings -- TODO confirm).
# Fail here rather than let a degenerate score column reach the saved table.
if not max_stddev:
    raise ValueError(
        "Cannot compute arrival_consistency_score: "
        f"max stddev_arrival_delay is {max_stddev!r} "
        "(no airlines available, or no variation in arrival delay)."
    )

# score = 1 - stddev / max_stddev
df_kpi5b = df_kpi5b.withColumn(
    "arrival_consistency_score",
    1 - (col("stddev_arrival_delay") / max_stddev)
)

df_kpi5b.show(5)


+----------+--------------------+-------------+-----------------+--------------------+------------------+-------------------+----------------------+-------------------+-------------------------+
|op_carrier|        airline_name|total_flights|avg_arrival_delay|stddev_arrival_delay| var_arrival_delay|avg_departure_delay|stddev_departure_delay|var_departure_delay|arrival_consistency_score|
+----------+--------------------+-------------+-----------------+--------------------+------------------+-------------------+----------------------+-------------------+-------------------------+
|        AA|   American Airlines|       534592|7.035722569735424|   25.95016777321157| 673.4112074578284|  6.761732311744283|    26.315392065228902|  692.4998595467123|       0.3209645707153064|
|        AS|     Alaska Airlines|       132217|6.280016941845602|  21.193005732041133|449.14349195832835|    5.6710786056256|    21.429104382209992| 459.20651462365146|      0.44544475123027194|
|        DL|     Delta Ai

In [0]:
# Persist the KPI-5B airline delay consistency results as a table,
# replacing any existing contents
output_table = "kpi5b_airline_delay_consistency"

table_writer = df_kpi5b.write.mode("overwrite")
table_writer.saveAsTable(output_table)

# Read the saved table back and preview it to confirm the write
spark.table(output_table).show(5)


+----------+--------------------+-------------+-----------------+--------------------+------------------+-------------------+----------------------+-------------------+-------------------------+
|op_carrier|        airline_name|total_flights|avg_arrival_delay|stddev_arrival_delay| var_arrival_delay|avg_departure_delay|stddev_departure_delay|var_departure_delay|arrival_consistency_score|
+----------+--------------------+-------------+-----------------+--------------------+------------------+-------------------+----------------------+-------------------+-------------------------+
|        B6|     JetBlue Airways|       135869|8.265652945116251|  27.858165417301592| 776.0773804177384|  7.937182138677697|     27.93952111684656|  780.6168402387148|       0.2710381883253513|
|        AS|     Alaska Airlines|       132217|6.280016941845602|  21.193005732041133|449.14349195832835|    5.6710786056256|    21.429104382209992| 459.20651462365146|      0.44544475123027194|
|        MQ|Envoy Air (Am