In [81]:
import pyspark
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import functions as func
from pyspark.sql.types import StringType,FloatType
import nltk
from pyspark.sql import SparkSession


In [14]:
# reading in flight data csv
spark = SparkSession.builder.master('local[*]').appName('FlightAnalysis').getOrCreate()
combinedFlightsDF = spark.read.options(header = True, inferSchema = True).csv("FlightData")

                                                                                

In [15]:
# drops unnecessary columns
drop_cols = ("FlightDate","Origin","Dest","Diverted","DepDelayMinutes","ArrDelayMinutes","IATA_Code_Operating_Airline","DestAirportSeqID","DestCityMarketID","DOT_ID_Operating_Airline","DOT_ID_Marketing_Airline","IATA_Code_Marketing_Airline","Operated_or_Branded_Code_Share_Partners","Cancelled","CRSDepTime","DepDelay","AirTime","CRSElapsedTime", "Distance", "Quarter", "DayOfWeek", "Marketing_Airline_Network", "Flight_Number_Marketing_Airline", "Operating_Airline", "Tail_Number", "Flight_Number_Operating_Airline", "OriginAirportID",  "OriginAirportSeqID", "OriginCityMarketID",  "OriginState",  "OriginStateFips",  "OriginStateName", "OriginWac", "DestAirportID", "DestState", "DestStateFips", "DestStateName", "DestWac", "DepartureDelayGroups", "DepTimeBlk", "TaxiOut", "WheelsOff", "WheelsOn", "TaxiIn", "CRSArrTime", "ArrDelay", "ArrivalDelayGroups", "ArrTimeBlk", "DistanceGroup", "DivAirportLandings")
combinedFlightsDF = combinedFlightsDF.drop(*drop_cols)

In [16]:
combinedFlightsDF.show(5)

+---------+-------+-------+-----------------+----+-----+----------+---------------+------------+--------+--------+
|  Airline|DepTime|ArrTime|ActualElapsedTime|Year|Month|DayofMonth| OriginCityName|DestCityName|DepDel15|ArrDel15|
+---------+-------+-------+-----------------+----+-----+----------+---------------+------------+--------+--------+
|Envoy Air| 1209.0| 1350.0|            101.0|2019|    4|         1|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air| 1200.0| 1348.0|            108.0|2019|    4|         2|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air| 1203.0| 1342.0|             99.0|2019|    4|         3|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air| 1435.0| 1621.0|            106.0|2019|    4|         4|Little Rock, AR| Chicago, IL|     1.0|     1.0|
|Envoy Air| 1216.0| 1410.0|            114.0|2019|    4|         5|Little Rock, AR| Chicago, IL|     0.0|     0.0|
+---------+-------+-------+-----------------+----+-----+----------+-------------

In [5]:
#Creates csv with specified data (dont run)
combinedFlightsDF.write.option("header",True).csv("combinedFlightsSearchBar")

                                                                                

In [17]:
# groups flights by year
yearCount = combinedFlightsDF.groupBy("Year").count()
yearCount.show()



+----+-------+
|Year|  count|
+----+-------+
|2019|8091684|
|2018|5689512|
|2022|4078318|
|2020|5022397|
|2021|6311871|
+----+-------+



                                                                                

In [86]:
#28 total airlines (listed in discord)
airlineRank = combinedFlightsDF.groupBy("Airline").count()

                                                                                

In [87]:
#total flights for each of the 28 airlines
airlineRank.show(30,truncate=False)



+-----------------------------------------+-------+
|Airline                                  |count  |
+-----------------------------------------+-------+
|Endeavor Air Inc.                        |998224 |
|United Air Lines Inc.                    |2354538|
|Compass Airlines                         |154985 |
|Comair Inc.                              |957220 |
|Southwest Airlines Co.                   |5474339|
|ExpressJet Airlines Inc.                 |353669 |
|JetBlue Airways                          |1106079|
|Empire Airlines Inc.                     |23122  |
|Envoy Air                                |1072778|
|Capital Cargo International              |392011 |
|Hawaiian Airlines Inc.                   |310782 |
|Mesa Airlines Inc.                       |749216 |
|American Airlines Inc.                   |3134117|
|Republic Airlines                        |1283704|
|Spirit Air Lines                         |836694 |
|GoJet Airlines, LLC d/b/a United Express |276486 |
|Allegiant A

                                                                                

In [18]:
# creating new df to analyze delayed flights by airline and month/year
drop_cols = ("OriginCityName", "DestCityName", "DayOfMonth", "DepTime", "ArrTime", "ActualElapsedTime")
mostDelayAirline = combinedFlightsDF.drop(*drop_cols)
mostDelayAirline.show(5)

+---------+----+-----+--------+--------+
|  Airline|Year|Month|DepDel15|ArrDel15|
+---------+----+-----+--------+--------+
|Envoy Air|2019|    4|     0.0|     0.0|
|Envoy Air|2019|    4|     0.0|     0.0|
|Envoy Air|2019|    4|     0.0|     0.0|
|Envoy Air|2019|    4|     1.0|     1.0|
|Envoy Air|2019|    4|     0.0|     0.0|
+---------+----+-----+--------+--------+
only showing top 5 rows



In [20]:
mostDelayAirline.columns

['Airline', 'Year', 'Month', 'DepDel15', 'ArrDel15']

In [45]:
# uses bitwise or operator to find overall delayed flights
mostDelayAirline = mostDelayAirline.withColumn("ifDelayed", func.when((mostDelayAirline.ArrDel15 > 0) | (mostDelayAirline.DepDel15 > 0), 1).otherwise(0))
mostDelayAirline.show(10)

+---------+----+-----+--------+--------+---------+
|  Airline|Year|Month|DepDel15|ArrDel15|ifDelayed|
+---------+----+-----+--------+--------+---------+
|Envoy Air|2019|    4|     0.0|     0.0|        0|
|Envoy Air|2019|    4|     0.0|     0.0|        0|
|Envoy Air|2019|    4|     0.0|     0.0|        0|
|Envoy Air|2019|    4|     1.0|     1.0|        1|
|Envoy Air|2019|    4|     0.0|     0.0|        0|
|Envoy Air|2019|    4|     1.0|     1.0|        1|
|Envoy Air|2019|    4|     0.0|     0.0|        0|
|Envoy Air|2019|    4|     0.0|     0.0|        0|
|Envoy Air|2019|    4|     0.0|     0.0|        0|
|Envoy Air|2019|    4|     0.0|     0.0|        0|
+---------+----+-----+--------+--------+---------+
only showing top 10 rows



In [71]:
# groups delayed flights by airline
numDelayed = mostDelayAirline.groupBy("Airline").agg({"ifDelayed" : "sum"})
numDelayed = numDelayed.withColumnRenamed("sum(ifDelayed)", "sumDelayed")
numDelayed.show(30,truncate=False)

# to check if delayed flight per airline add up to original amount
# sumdf = numDelayed.agg({"sum(ifDelayed)" : "sum"})
# sumdf.show()



+-----------------------------------------+----------+
|Airline                                  |sumDelayed|
+-----------------------------------------+----------+
|Endeavor Air Inc.                        |147358    |
|United Air Lines Inc.                    |510700    |
|Compass Airlines                         |31705     |
|Comair Inc.                              |188171    |
|Southwest Airlines Co.                   |1269581   |
|ExpressJet Airlines Inc.                 |79802     |
|JetBlue Airways                          |325465    |
|Empire Airlines Inc.                     |3665      |
|Envoy Air                                |205502    |
|Capital Cargo International              |68581     |
|Hawaiian Airlines Inc.                   |39344     |
|Mesa Airlines Inc.                       |155645    |
|American Airlines Inc.                   |678132    |
|Republic Airlines                        |232202    |
|Spirit Air Lines                         |185300    |
|GoJet Air

                                                                                

In [100]:
# add a new column to show ratio of delayed flights for each airline
numDelayed = numDelayed\
    .join(airlineRank, "Airline")\
    .withColumn("delayedRatio", ((func.col("sumDelayed") / func.col("count")) * 100))\
    .drop("count")

AttributeError: 'DataFrame' object has no attribute 'col'

In [None]:
numDelayed.show(10)