In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import approx_count_distinct, skewness, sum, avg, stddev

# Load the flight summary CSV into a DataFrame.
# getOrCreate() returns the already-running session when one exists (e.g. on
# Databricks, where `spark` is predefined) and creates one otherwise, so this
# cell does not depend on a global injected by the runtime.
spark = SparkSession.builder.getOrCreate()
flight_summary = spark.read.csv(
    "dbfs:/FileStore/flight_summary.csv",
    header=True,       # first row holds the column names
    inferSchema=True,  # let Spark detect column types instead of reading all as strings
)
flight_summary.show(5)


+-----------+--------------------+------------+------------+---------+--------------------+---------+----------+-----+
|origin_code|      origin_airport| origin_city|origin_state|dest_code|        dest_airport|dest_city|dest_state|count|
+-----------+--------------------+------------+------------+---------+--------------------+---------+----------+-----+
|        BQN|Rafael Hernández ...|   Aguadilla|          PR|      MCO|Orlando Internati...|  Orlando|        FL|  441|
|        PHL|Philadelphia Inte...|Philadelphia|          PA|      MCO|Orlando Internati...|  Orlando|        FL| 4869|
|        MCI|Kansas City Inter...| Kansas City|          MO|      IAH|George Bush Inter...|  Houston|        TX| 1698|
|        SPI|Abraham Lincoln C...| Springfield|          IL|      ORD|Chicago O'Hare In...|  Chicago|        IL|  998|
|        SNA|John Wayne Airpor...|   Santa Ana|          CA|      PHX|Phoenix Sky Harbo...|  Phoenix|        AZ| 3846|
+-----------+--------------------+------------+-

In [0]:
# Exact number of distinct origin airports: keep only the airport column,
# drop repeated values, then count the remaining rows.
unique_origin_airports = (
    flight_summary
    .select("origin_airport")
    .dropDuplicates()
    .count()
)
print(f"Number of unique origin airports: {unique_origin_airports}")


Number of unique origin airports: 322


In [0]:
# Approximate number of distinct origin airports. The second argument (rsd) is
# the maximum relative standard deviation allowed for the estimate: 10% here.
approx_unique_origin_airports = flight_summary.select(
    approx_count_distinct("origin_airport", rsd=0.1)
)
approx_unique_origin_airports.show()

# approx_count_distinct is used to improve efficiency: it favours performance
# over accuracy, so the result is an estimate rather than the exact count.


+-------------------------------------+
|approx_count_distinct(origin_airport)|
+-------------------------------------+
|                                  346|
+-------------------------------------+



In [0]:
# Skewness of the "count" column, computed as a single whole-DataFrame aggregate.
skewness_result = flight_summary.agg(skewness("count"))
skewness_result.show()

# What does the skewness function determine?
# Skewness measures the asymmetry of a data distribution. A value close to 0
# means the distribution is approximately symmetric, a positive value means the
# tail is longer on the right, and a negative value means it is longer on the left.

# What does the result indicate?
# The result is positive, i.e. the distribution is right-skewed: most values in
# the count column sit at the lower end, while a few large outliers stretch the
# tail out to the right.

+-----------------+
|  skewness(count)|
+-----------------+
|2.682183800064101|
+-----------------+



In [0]:
# Top 5 most popular destination cities: total the flight counts per
# (destination state, destination city) pair, rank the totals from highest to
# lowest, and keep the first five rows.
top_5_destinations = (
    flight_summary
    .groupBy("dest_state", "dest_city")
    .sum("count")
    .sort("sum(count)", ascending=False)
    .limit(5)
)
top_5_destinations.show()


+----------+-----------------+----------+
|dest_state|        dest_city|sum(count)|
+----------+-----------------+----------+
|        IL|          Chicago|    366790|
|        GA|          Atlanta|    346904|
|        TX|Dallas-Fort Worth|    239582|
|        TX|          Houston|    198724|
|        CO|           Denver|    196010|
+----------+-----------------+----------+



In [0]:
# Per-origin-airport statistics of the count column, computed in one .agg call:
# total, mean and standard deviation. stddev is the sample standard deviation,
# so an airport with a single row has no defined value and shows null.
airport_stats = (
    flight_summary
    .groupBy("origin_airport")
    .agg(
        sum("count").alias("total_flights"),
        avg("count").alias("average_flights"),
        stddev("count").alias("stddev_flights"),
    )
)
airport_stats.show()


+--------------------+-------------+------------------+------------------+
|      origin_airport|total_flights|   average_flights|    stddev_flights|
+--------------------+-------------+------------------+------------------+
|Melbourne Interna...|         1332|            1332.0|              null|
|San Diego Interna...|        70207|1526.2391304347825| 1575.804150052814|
|     Eppley Airfield|        16753| 797.7619047619048| 688.7479876385778|
|     Kahului Airport|        20627|1145.9444444444443|1899.6718438664834|
|Austin-Bergstrom ...|        42067|1026.0243902439024|1100.5043954434002|
|Port Columbus Int...|        24187| 863.8214285714286| 823.0782804934813|
|Waco Regional Air...|         1612|            1612.0|              null|
|Sacramento Intern...|        37212|1378.2222222222222|1327.6026204963282|
|Brownsville/South...|         2362|            1181.0| 172.5340546095176|
|       Meadows Field|         2637|            659.25| 160.1527916293271|
|Erie Internationa...|   