In [38]:
from pyspark.sql import functions as f

# Location of the airline delay-cause CSV.
s3_input = "s3://hw18-part1/data/Airline_Delay_Cause.csv"

# Explicit schema (DDL string) with the same column names and types that schema
# inference produced for this file. Declaring it avoids the extra pass over the
# S3 object that inferSchema needs and keeps the types stable across runs.
# NOTE(review): with a fixed schema in the default permissive mode, a value that
# does not parse as the declared type is read as null instead of changing the
# column type -- confirm that is acceptable for this dataset.
delay_cause_schema = ", ".join([
    "year INT",
    "month INT",
    "carrier STRING",
    "carrier_name STRING",
    "airport STRING",
    "airport_name STRING",
    "arr_flights DOUBLE",
    "arr_del15 DOUBLE",
    "carrier_ct DOUBLE",
    "weather_ct DOUBLE",
    "nas_ct DOUBLE",
    "security_ct DOUBLE",
    "late_aircraft_ct DOUBLE",
    "arr_cancelled DOUBLE",
    "arr_diverted DOUBLE",
    "arr_delay DOUBLE",
    "carrier_delay DOUBLE",
    "weather_delay DOUBLE",
    "nas_delay DOUBLE",
    "security_delay DOUBLE",
    "late_aircraft_delay DOUBLE",
])

# The first line of the file is a header row, not data.
df = (
    spark.read
    .option("header", "true")
    .schema(delay_cause_schema)
    .csv(s3_input)
)

# Sanity check: print the column types and the first ten rows untruncated.
df.printSchema()
df.show(10, truncate=False)

Calculation started (calculation_id=66cc88e2-064b-405f-6fd1-0734f52adcca) in (session=6acc88c6-1880-c311-4c40-559f3e3797fe). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.
root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- carrier_name: string (nullable = true)
 |-- airport: string (nullable = true)
 |-- airport_name: string (nullable = true)
 |-- arr_flights: double (nullable = true)
 |-- arr_del15: double (nullable = true)
 |-- carrier_ct: double (nullable = true)
 |-- weather_ct: double (nullable = true)
 |-- nas_ct: double (nullable = true)
 |-- security_ct: double (nullable = true)
 |-- late_aircraft_ct: double (nullable = true)
 |-- arr_cancelled: double (nullable = true)
 |-- arr_diverted: double (nullable = true)
 |-- arr_delay: double (nullable = true)
 |-- carrier_delay: double (nullable = true)
 |-- weather_delay: double (nullable = true)
 |-- nas_delay: double (nullable = true)
 |-- security_delay: double (nullable = true)
 |-- late_aircraft_delay: double (nullable = true)

+----+-----+-------+-----------------+-------+-----------------------------

In [9]:
# Summary statistics for every column: count, mean, stddev, min and max.
desc_df = df.summary("count", "mean", "stddev", "min", "max")
desc_df.show()

Calculation started (calculation_id=9acc8856-603d-c860-3b10-ef09245ddf1e) in (session=12cc884b-095f-373f-85cc-970d4d429cdd). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.
+-------+-----------------+------------------+-------+--------------------+-------+--------------------+------------------+-----------------+------------------+------------------+------------------+-------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-------------------+
|summary|             year|             month|carrier|        carrier_name|airport|        airport_name|       arr_flights|        arr_del15|        carrier_ct|        weather_ct|            nas_ct|        security_ct|  late_aircraft_ct|    arr_cancelled|      arr_diverted|         arr_delay|     carrier_delay|     weather_delay|        nas_delay|   security_delay|late_aircraft_delay|
+-------+-----------------+------------------+-------+--------------------+-------+--------------------+------------------+-----------------+------------------+------------------+------------------+---

In [17]:
# Rank carriers by the mean of arr_delay over their rows and keep the top five.
# NOTE(review): this is an average over the grouped rows, not over individual
# flights. If each row is a per-carrier/airport/month total (TODO confirm), the
# ranking favours carriers with more flights per row; a per-flight figure would
# be sum(arr_delay) / sum(arr_flights).
avg_delay_by_carrier = df.groupBy("carrier_name").agg(
    f.avg("arr_delay").alias("Average_Delays")
)
delays_df = avg_delay_by_carrier.sort(f.col("Average_Delays").desc()).limit(5)

delays_df.show()

Calculation started (calculation_id=d8cc88cf-3da4-7d40-3919-3c0e0ab71bd0) in (session=6acc88c6-1880-c311-4c40-559f3e3797fe). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.
+--------------------+------------------+
|        carrier_name|    Average_Delays|
+--------------------+------------------+
|Southwest Airline...|11188.673778017823|
|American Airlines...| 8782.900203148807|
|United Air Lines ...| 6248.942300853485|
|     JetBlue Airways| 6091.326271186441|
|    Spirit Air Lines| 4808.278416347382|
+--------------------+------------------+



In [23]:
# Sum arr_cancelled per airport (code + name) and keep the three largest totals.
cancellations_by_airport = df.groupBy("airport", "airport_name").agg(
    f.sum("arr_cancelled").alias("Cancellation_Count")
)
airports_df = cancellations_by_airport.sort(
    f.col("Cancellation_Count").desc()
).limit(3)

airports_df.show()

Calculation started (calculation_id=5ccc88d4-1917-0c32-9179-5fb3cc99560d) in (session=6acc88c6-1880-c311-4c40-559f3e3797fe). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.
+-------+--------------------+------------------+
|airport|        airport_name|Cancellation_Count|
+-------+--------------------+------------------+
|    ORD|Chicago, IL: Chic...|           80821.0|
|    DFW|Dallas/Fort Worth...|           70542.0|
|    LGA|New York, NY: LaG...|           46760.0|
+-------+--------------------+------------------+



In [40]:
# determine the most common reason for flight delays

# Output label -> source column. Kept in one place so the aggregation and the
# stack() expression below are always built from the same list.
delay_reason_columns = {
    "Carrier": "carrier_delay",
    "Weather": "weather_delay",
    "NAS": "nas_delay",
    "Security": "security_delay",
    "Late_Aircraft": "late_aircraft_delay",
}

# One wide row holding the sum of each delay column. Cached because the two
# show() actions below would otherwise each re-run the full-table aggregation.
reason_df = df.select(
    *[f.sum(column).alias(label) for label, column in delay_reason_columns.items()]
).cache()

# Unpivot the wide row into (reason, total_delay) rows, largest total first.
stack_args = ", ".join(
    "'{0}', {0}".format(label) for label in delay_reason_columns
)
reason_long = reason_df.selectExpr(
    "stack({0}, {1}) as (reason, total_delay)".format(
        len(delay_reason_columns), stack_args
    )
).orderBy(f.desc("total_delay"))

# Full ranking first, then only the top reason.
reason_long.show()
reason_long.limit(1).show()

Calculation started (calculation_id=26cc88e2-734a-426c-5d7e-21b9884236c5) in (session=6acc88c6-1880-c311-4c40-559f3e3797fe). Checking calculation status...


Progress:   0%|          |elapsed time = 00:00s

Calculation completed.
+-------------+------------+
|       reason| total_delay|
+-------------+------------+
|Late_Aircraft|2.83144335E8|
|      Carrier|2.46370897E8|
|          NAS|1.57823639E8|
|      Weather|  3.815317E7|
|     Security|   1265591.0|
+-------------+------------+

+-------------+------------+
|       reason| total_delay|
+-------------+------------+
|Late_Aircraft|2.83144335E8|
+-------------+------------+

