In [34]:
import pyspark
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import functions as func
from pyspark.sql.types import StringType,FloatType
import nltk
from pyspark.sql import SparkSession


In [76]:
# Start (or reuse) a local Spark session that uses every available core,
# then load the flight CSV data from the "FlightData" path.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('FlightAnalysis')
    .getOrCreate()
)
# header=True: first row holds the column names; inferSchema=True: Spark
# derives the column types from the data instead of reading all as strings.
combinedFlightsDF = spark.read.csv("FlightData", header=True, inferSchema=True)

                                                                                

In [77]:
# Columns that are not needed for the analysis below; everything listed here
# is removed from the combined flights frame.
drop_cols = (
    "FlightDate",
    "Origin",
    "Dest",
    "Diverted",
    "DepDelayMinutes",
    "ArrDelayMinutes",
    "IATA_Code_Operating_Airline",
    "DestAirportSeqID",
    "DestCityMarketID",
    "DOT_ID_Operating_Airline",
    "DOT_ID_Marketing_Airline",
    "IATA_Code_Marketing_Airline",
    "Operated_or_Branded_Code_Share_Partners",
    "CRSDepTime",
    "DepDelay",
    "AirTime",
    "CRSElapsedTime",
    "Distance",
    "Marketing_Airline_Network",
    "Flight_Number_Marketing_Airline",
    "Operating_Airline",
    "Tail_Number",
    "Flight_Number_Operating_Airline",
    "OriginAirportID",
    "OriginAirportSeqID",
    "OriginCityMarketID",
    "OriginState",
    "OriginStateFips",
    "OriginStateName",
    "OriginWac",
    "DestAirportID",
    "DestState",
    "DestStateFips",
    "DestStateName",
    "DestWac",
    "DepartureDelayGroups",
    "DepTimeBlk",
    "TaxiOut",
    "WheelsOff",
    "WheelsOn",
    "TaxiIn",
    "CRSArrTime",
    "ArrDelay",
    "ArrivalDelayGroups",
    "ArrTimeBlk",
    "DistanceGroup",
    "DivAirportLandings",
)
# drop() takes the names as separate positional arguments, hence the unpacking.
combinedFlightsDF = combinedFlightsDF.drop(*drop_cols)

In [78]:
# Preview the first 5 rows of the trimmed flights frame.
combinedFlightsDF.show(n=5)

+---------+---------+-------+-------+-----------------+----+-------+-----+----------+---------+---------------+------------+--------+--------+
|  Airline|Cancelled|DepTime|ArrTime|ActualElapsedTime|Year|Quarter|Month|DayofMonth|DayOfWeek| OriginCityName|DestCityName|DepDel15|ArrDel15|
+---------+---------+-------+-------+-----------------+----+-------+-----+----------+---------+---------------+------------+--------+--------+
|Envoy Air|    false| 1209.0| 1350.0|            101.0|2019|      2|    4|         1|        1|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air|    false| 1200.0| 1348.0|            108.0|2019|      2|    4|         2|        2|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air|    false| 1203.0| 1342.0|             99.0|2019|      2|    4|         3|        3|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air|    false| 1435.0| 1621.0|            106.0|2019|      2|    4|         4|        4|Little Rock, AR| Chicago, IL|     1.0|     1.0|

In [79]:
# Build the search-bar frame: the flights frame without the columns listed
# in searchBarDropCols.
searchBarDropCols = (
    "Cancelled",
    "Quarter",
    "DayOfWeek",
)
searchBarDF = combinedFlightsDF.drop(*searchBarDropCols)

In [80]:
# Preview the first 5 rows of the search-bar frame.
searchBarDF.show(n=5)

+---------+-------+-------+-----------------+----+-----+----------+---------------+------------+--------+--------+
|  Airline|DepTime|ArrTime|ActualElapsedTime|Year|Month|DayofMonth| OriginCityName|DestCityName|DepDel15|ArrDel15|
+---------+-------+-------+-----------------+----+-----+----------+---------------+------------+--------+--------+
|Envoy Air| 1209.0| 1350.0|            101.0|2019|    4|         1|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air| 1200.0| 1348.0|            108.0|2019|    4|         2|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air| 1203.0| 1342.0|             99.0|2019|    4|         3|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air| 1435.0| 1621.0|            106.0|2019|    4|         4|Little Rock, AR| Chicago, IL|     1.0|     1.0|
|Envoy Air| 1216.0| 1410.0|            114.0|2019|    4|         5|Little Rock, AR| Chicago, IL|     0.0|     0.0|
+---------+-------+-------+-----------------+----+-----+----------+-------------

In [None]:
#Creates csv with the search-bar data (DONT RUN)
# Export searchBarDF (the frame built for the search bar) rather than the
# full combinedFlightsDF, so the "combinedFlightsSearchBar" output actually
# contains the search-bar column subset.
# NOTE: Spark's default save mode raises an error if the output path already
# exists, so this cell cannot simply be re-run against an existing export.
searchBarDF.write.option("header", True).csv("combinedFlightsSearchBar")

In [82]:
#dont need to run
# Number of flights per year: one output row per distinct Year value.
yearCount = (
    combinedFlightsDF
    .groupBy("Year")
    .count()
)
yearCount.show()



+----+-------+
|Year|  count|
+----+-------+
|2019|8091684|
|2018|5689521|
|2022|4078327|
|2020|5022397|
|2021|6311871|
+----+-------+



                                                                                

In [83]:
# New frame for analyzing delayed flights by airline, origin city, and
# month/year: the flights frame minus the columns listed below.
# NOTE: "DayOfMonth" is spelled "DayofMonth" in the data; the drop still
# takes effect (the column is gone in the preview), which relies on Spark
# resolving column names case-insensitively.
drop_cols = (
    "DestCityName",
    "DayOfMonth",
    "DepTime",
    "ArrTime",
    "ActualElapsedTime",
)
delayDF = combinedFlightsDF.drop(*drop_cols)
delayDF.show(n=5)

+---------+---------+----+-------+-----+---------+---------------+--------+--------+
|  Airline|Cancelled|Year|Quarter|Month|DayOfWeek| OriginCityName|DepDel15|ArrDel15|
+---------+---------+----+-------+-----+---------+---------------+--------+--------+
|Envoy Air|    false|2019|      2|    4|        1|Little Rock, AR|     0.0|     0.0|
|Envoy Air|    false|2019|      2|    4|        2|Little Rock, AR|     0.0|     0.0|
|Envoy Air|    false|2019|      2|    4|        3|Little Rock, AR|     0.0|     0.0|
|Envoy Air|    false|2019|      2|    4|        4|Little Rock, AR|     1.0|     1.0|
|Envoy Air|    false|2019|      2|    4|        5|Little Rock, AR|     0.0|     0.0|
+---------+---------+----+-------+-----+---------+---------------+--------+--------+
only showing top 5 rows



In [None]:
# List the column names currently present in delayDF.
delayDF.columns

In [84]:
# Flag a flight as delayed (ifDelayed = 1) when either the arrival or the
# departure 15-minute delay indicator is positive; every other row gets 0.
# `|` is the column-level OR operator, so each comparison is parenthesized.
delayDF = delayDF.withColumn(
    "ifDelayed",
    func.when(
        (func.col("ArrDel15") > 0) | (func.col("DepDel15") > 0),
        1,
    ).otherwise(0),
)
delayDF.show(n=10)

+---------+---------+----+-------+-----+---------+---------------+--------+--------+---------+
|  Airline|Cancelled|Year|Quarter|Month|DayOfWeek| OriginCityName|DepDel15|ArrDel15|ifDelayed|
+---------+---------+----+-------+-----+---------+---------------+--------+--------+---------+
|Envoy Air|    false|2019|      2|    4|        1|Little Rock, AR|     0.0|     0.0|        0|
|Envoy Air|    false|2019|      2|    4|        2|Little Rock, AR|     0.0|     0.0|        0|
|Envoy Air|    false|2019|      2|    4|        3|Little Rock, AR|     0.0|     0.0|        0|
|Envoy Air|    false|2019|      2|    4|        4|Little Rock, AR|     1.0|     1.0|        1|
|Envoy Air|    false|2019|      2|    4|        5|Little Rock, AR|     0.0|     0.0|        0|
|Envoy Air|    false|2019|      2|    4|        6|Little Rock, AR|     1.0|     1.0|        1|
|Envoy Air|    false|2019|      2|    4|        7|Little Rock, AR|     0.0|     0.0|        0|
|Envoy Air|    false|2019|      2|    4|        1|

In [85]:
# Total number of delayed flights per airline (sum of the 0/1 ifDelayed flag).
numDelayedByAirline = (
    delayDF
    .groupBy("Airline")
    .agg(func.sum("ifDelayed").alias("sumDelayedByAirline"))
)
numDelayedByAirline.show(n=30, truncate=False)

# to check that the per-airline delayed counts add up to the overall total
# sumdf = numDelayedByAirline.agg({"sumDelayedByAirline" : "sum"})
# sumdf.show()



+-----------------------------------------+-------------------+
|Airline                                  |sumDelayedByAirline|
+-----------------------------------------+-------------------+
|Endeavor Air Inc.                        |147359             |
|United Air Lines Inc.                    |510700             |
|Compass Airlines                         |31705              |
|Comair Inc.                              |188171             |
|Southwest Airlines Co.                   |1269581            |
|ExpressJet Airlines Inc.                 |79802              |
|JetBlue Airways                          |325465             |
|Empire Airlines Inc.                     |3665               |
|Envoy Air                                |205502             |
|Capital Cargo International              |68581              |
|Hawaiian Airlines Inc.                   |39344              |
|Mesa Airlines Inc.                       |155645             |
|American Airlines Inc.                 

                                                                                

In [86]:
# Total number of flights per airline (28 airlines), with the generated
# "count" column renamed to "totalFlightsByAirline".
airlineRank = (
    combinedFlightsDF
    .groupBy("Airline")
    .count()
    .withColumnRenamed("count", "totalFlightsByAirline")
)

In [None]:
# Preview the per-airline flight totals (20 rows is show()'s default).
airlineRank.show(n=20)

In [87]:
#1 add a new column with the percentage of delayed flights for each airline
# Start from the two base columns so that re-running this cell does not join
# airlineRank onto an already-joined frame; that would duplicate
# totalFlightsByAirline and make the ratio expression ambiguous.
numDelayedByAirline = (
    numDelayedByAirline
    .select("Airline", "sumDelayedByAirline")
    .join(airlineRank, "Airline")
    .withColumn(
        "delayedRatio",
        # delayed flights / all flights of the airline, as a percentage
        (func.col("sumDelayedByAirline") / func.col("totalFlightsByAirline")) * 100,
    )
)

In [88]:
#1 preview the first 10 airlines
numDelayedByAirline.show(n=10)



+--------------------+-------------------+------------+------------------+
|             Airline|sumDelayedByAirline|totalFlights|      delayedRatio|
+--------------------+-------------------+------------+------------------+
|   Endeavor Air Inc.|             147359|      998233|14.761984426481591|
|United Air Lines ...|             510700|     2354538|21.690030061099037|
|    Compass Airlines|              31705|      154985| 20.45681840178082|
|         Comair Inc.|             188171|      957220|19.658072334468564|
|Southwest Airline...|            1269581|     5474339|23.191493986762605|
|ExpressJet Airlin...|              79802|      353669|22.564035864042367|
|     JetBlue Airways|             325465|     1106079|29.425113396059416|
|Empire Airlines Inc.|               3665|       23122|15.850704956318657|
|           Envoy Air|             205502|     1072778|19.156060247320507|
|Capital Cargo Int...|              68581|      392011|17.494662139582818|
+--------------------+---

                                                                                

In [91]:
# Total number of flights per origin city, with the generated "count"
# column renamed to "totalFlightsByCity".
cityRank = (
    combinedFlightsDF
    .groupBy("OriginCityName")
    .count()
    .withColumnRenamed("count", "totalFlightsByCity")
)

In [89]:
# Total number of delayed flights per origin city (sum of the 0/1 ifDelayed flag).
numDelayedByCity = (
    delayDF
    .groupBy("OriginCityName")
    .agg(func.sum("ifDelayed").alias("sumDelayedByCity"))
)
numDelayedByCity.show(n=30, truncate=False)



+------------------------+----------------+
|OriginCityName          |sumDelayedByCity|
+------------------------+----------------+
|Gainesville, FL         |2847            |
|Richmond, VA            |19749           |
|Ontario, CA             |15887           |
|Pago Pago, TT           |113             |
|Tucson, AZ              |11284           |
|Myrtle Beach, SC        |10761           |
|Medford, OR             |5562            |
|Palm Springs, CA        |9311            |
|Durango, CO             |3502            |
|Corpus Christi, TX      |3734            |
|Mobile, AL              |4294            |
|Dubuque, IA             |442             |
|Pensacola, FL           |8954            |
|Huntsville, AL          |6572            |
|Fort Myers, FL          |31883           |
|Columbus, GA            |894             |
|Springfield, IL         |1419            |
|San Juan, PR            |27812           |
|Montrose/Delta, CO      |2692            |
|Lihue, HI               |6384  

                                                                                

In [93]:
#2 add a new column with the percentage of delayed flights for each origin city
# Start from the two base columns so that re-running this cell does not join
# cityRank onto an already-joined frame; that would duplicate
# totalFlightsByCity and make the ratio expression ambiguous.
numDelayedByCity = (
    numDelayedByCity
    .select("OriginCityName", "sumDelayedByCity")
    .join(cityRank, "OriginCityName")
    .withColumn(
        "delayedRatio",
        # delayed flights / all flights from the city, as a percentage
        (func.col("sumDelayedByCity") / func.col("totalFlightsByCity")) * 100,
    )
)

In [94]:
#2 preview the first 10 origin cities
numDelayedByCity.show(n=10)



+------------------+----------------+------------------+------------------+
|    OriginCityName|sumDelayedByCity|totalFlightsByCity|      delayedRatio|
+------------------+----------------+------------------+------------------+
|   Gainesville, FL|            2847|             16524| 17.22948438634713|
|      Richmond, VA|           19749|             96878|20.385433225293667|
|       Ontario, CA|           15887|             91776|17.310625871687588|
|     Pago Pago, TT|             113|               299| 37.79264214046823|
|        Tucson, AZ|           11284|             70842|15.928404054092205|
|  Myrtle Beach, SC|           10761|             56044|19.200984940403966|
|       Medford, OR|            5562|             34750| 16.00575539568345|
|  Palm Springs, CA|            9311|             52409|17.766032551660974|
|       Durango, CO|            3502|             16636|21.050733349362826|
|Corpus Christi, TX|            3734|             22140|16.865401987353206|
+-----------

                                                                                

In [None]:
#4 delayed flights: row counts per ifDelayed value
# 1 - Delayed, 0 - Not delayed
delayedFlights = (
    delayDF
    .groupBy("ifDelayed")
    .count()
)
delayedFlights.show()

In [67]:
#5 cancelled flights: row counts per Cancelled value (true / false)
cancelledFlights = (
    combinedFlightsDF
    .groupBy("Cancelled")
    .count()
)
cancelledFlights.show()



+---------+--------+
|Cancelled|   count|
+---------+--------+
|     true|  777267|
|    false|28416533|
+---------+--------+



                                                                                