In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
import pandas as pd
import numpy as np
import warnings
# Install a blanket "ignore" filter so Python warnings are not shown in the cells below.
warnings.filterwarnings("ignore")

In [2]:
# Unpack the challenge archive into airLineData/, the folder the later cells read from.
# shutil.unpack_archive (left commented out below) is an alternative that handles
# several archive formats: https://docs.python.org/3/library/shutil.html
import shutil
#shutil.unpack_archive('airLineCapitalOne.zip','newdata')
import zipfile

with zipfile.ZipFile('airLineCapitalOne.zip', mode='r') as archive:
    archive.extractall(path='airLineData')

In [3]:
# Load the metadata workbook from the extracted airLineData folder into a pandas DataFrame.
metaData = pd.read_excel("airLineData/Airline_Challenge_Metadata.xlsx")

In [4]:
# Locations of the three CSV inputs inside the extracted airLineData folder.
flightsPath = "airLineData/Flights.csv"
ticketsPath = "airLineData/Tickets.csv"
AirPortCodesPath = "airLineData/Airport_Codes.csv"

In [5]:
# Start the PySpark session used by every later cell
# (getOrCreate reuses a session that is already running instead of building a second one).
spark = SparkSession.builder.appName("CapitalOne DA").getOrCreate()

22/11/23 20:11:28 WARN Utils: Your hostname, codeStation resolves to a loopback address: 127.0.1.1; using 172.17.0.1 instead (on interface docker0)
22/11/23 20:11:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/23 20:11:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
# Create the database that will hold the saved tables (skipped when it already exists)
# and make it the current database for the SQL statements that follow.
spark.sql("CREATE DATABASE IF NOT EXISTS capitalone_DA")
spark.sql("USE capitalone_DA")

DataFrame[]

In [19]:
# Read the three CSV files into Spark DataFrames; the first row supplies the
# column names (header=True) and Spark guesses each column's type (inferSchema=True).
# NOTE(review): the schemas printed further down show AIR_TIME, DISTANCE and ITIN_FARE
# as string, so those columns presumably contain non-numeric values -- confirm
# before doing arithmetic on them.
flights= spark.read.csv(flightsPath,inferSchema=True,header=True)
tickets= spark.read.csv(ticketsPath, inferSchema=True, header=True)
airport= spark.read.csv(AirPortCodesPath, inferSchema=True, header=True)

                                                                                

In [15]:
# Alternative to the CSV read: load the three datasets from parquet files on disk.
# This rebinds the same airport/tickets/flights names that the CSV cell assigns.
# NOTE(review): these paths are presumably where the saveAsTable cell stores its
# output, in which case this cell only works after that cell has run once -- confirm.
airport = spark.read.parquet("spark-warehouse/capitalone_da.db/airport/")
tickets = spark.read.parquet("spark-warehouse/capitalone_da.db/tickets/")
flights = spark.read.parquet("spark-warehouse/capitalone_da.db/flights/")

In [21]:
# Register each DataFrame as a temporary view so it can be queried by name with
# spark.sql; SHOW TABLES later reports these views with isTemporary = true.
for view_name, frame in (("flights", flights), ("tickets", tickets), ("airport", airport)):
    frame.createOrReplaceTempView(view_name)

In [31]:
# Save the DataFrames as permanent tables in the current database.
# mode("ignore") makes this cell safe to re-run: with the default save mode
# saveAsTable raises an error once the table exists, whereas "ignore" writes
# the table only when it is missing and otherwise leaves it untouched.
flights.write.mode("ignore").saveAsTable('flights')
airport.write.mode("ignore").saveAsTable('airport')
tickets.write.mode("ignore").saveAsTable('tickets')

                                                                                

In [17]:
# Preview the first two rows of the flights data.
flights.show(2)

+----------+----------+--------+-----------------+-----------------+------+----------------+---------------+-----------+-----------------+---------+---------+---------+--------+--------+--------------+
|   FL_DATE|OP_CARRIER|TAIL_NUM|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_ID|ORIGIN|ORIGIN_CITY_NAME|DEST_AIRPORT_ID|DESTINATION|   DEST_CITY_NAME|DEP_DELAY|ARR_DELAY|CANCELLED|AIR_TIME|DISTANCE|OCCUPANCY_RATE|
+----------+----------+--------+-----------------+-----------------+------+----------------+---------------+-----------+-----------------+---------+---------+---------+--------+--------+--------------+
|2019-03-31|        UA|  N839UA|             1485|            13930|   ORD|     Chicago, IL|          11066|        CMH|     Columbus, OH|     -2.0|      2.0|      0.0|    46.0|   296.0|          0.46|
|2019-03-31|        UA|  N590UA|             1483|            11618|   EWR|      Newark, NJ|          14771|        SFO|San Francisco, CA|     -4.0|    -11.0|      0.0|   338.0|  2565.0|      

In [22]:
# List the tables and views visible to this session; the temporary views
# registered above are reported with isTemporary = true.
spark.sql("SHOW TABLES").show()

+---------+---------------+-----------+
|namespace|      tableName|isTemporary|
+---------+---------------+-----------+
|         |        airport|       true|
|         |flightcondensed|       true|
|         |        flights|       true|
|         |        tickets|       true|
+---------+---------------+-----------+



In [18]:
# Print the column names and types of the flights data.
flights.printSchema()

root
 |-- FL_DATE: string (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- TAIL_NUM: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: string (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- DESTINATION: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- DEP_DELAY: double (nullable = true)
 |-- ARR_DELAY: double (nullable = true)
 |-- CANCELLED: double (nullable = true)
 |-- AIR_TIME: string (nullable = true)
 |-- DISTANCE: string (nullable = true)
 |-- OCCUPANCY_RATE: double (nullable = true)



In [24]:
# Print the column names and types of the airport data.
airport.printSchema()

root
 |-- TYPE: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- ELEVATION_FT: double (nullable = true)
 |-- CONTINENT: string (nullable = true)
 |-- ISO_COUNTRY: string (nullable = true)
 |-- MUNICIPALITY: string (nullable = true)
 |-- IATA_CODE: string (nullable = true)
 |-- COORDINATES: string (nullable = true)



In [26]:
# Print the column names and types of the tickets data.
tickets.printSchema()

root
 |-- ITIN_ID: long (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- QUARTER: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_COUNTRY: string (nullable = true)
 |-- ORIGIN_STATE_ABR: string (nullable = true)
 |-- ORIGIN_STATE_NM: string (nullable = true)
 |-- ROUNDTRIP: double (nullable = true)
 |-- REPORTING_CARRIER: string (nullable = true)
 |-- PASSENGERS: double (nullable = true)
 |-- ITIN_FARE: string (nullable = true)
 |-- DESTINATION: string (nullable = true)



In [19]:
# Build a trimmed flights frame with the DataFrame function API: FL_DATE is
# re-rendered with the 'y-MM-dd' pattern under the name flight_date, and the
# airport-id and city-name columns are left out.
passthrough_columns = [
    'OP_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DESTINATION',
    'DEP_DELAY', 'ARR_DELAY', 'CANCELLED', 'AIR_TIME', 'DISTANCE',
    'OCCUPANCY_RATE',
]
formatted_date = date_format(col('FL_DATE'), 'y-MM-dd').alias('flight_date')
flights_condensed = flights.select(formatted_date, *passthrough_columns)
# Expose the trimmed frame to Spark SQL under the view name flightCondensed.
flights_condensed.createOrReplaceTempView("flightCondensed")

In [44]:
# Preview five rows of the flightCondensed view.
spark.sql("""SELECT * FROM flightCondensed LIMIT 5""").show()

+-----------+----------+--------+-----------------+------+-----------+---------+---------+---------+--------+--------+--------------+
|flight_date|OP_CARRIER|TAIL_NUM|OP_CARRIER_FL_NUM|ORIGIN|DESTINATION|DEP_DELAY|ARR_DELAY|CANCELLED|AIR_TIME|DISTANCE|OCCUPANCY_RATE|
+-----------+----------+--------+-----------------+------+-----------+---------+---------+---------+--------+--------+--------------+
| 2019-03-02|        WN|  N955WN|             4591|   RSW|        CLE|     -8.0|     -6.0|      0.0|   143.0|  1025.0|          0.97|
| 2019-03-02|        WN|  N8686A|             3231|   RSW|        CMH|      1.0|      5.0|      0.0|   135.0|   930.0|          0.55|
| 2019-03-02|        WN|  N201LV|             3383|   RSW|        CMH|      0.0|      4.0|      0.0|   132.0|   930.0|          0.91|
| 2019-03-02|        WN|  N413WN|             5498|   RSW|        CMH|     11.0|     14.0|      0.0|   136.0|   930.0|          0.67|
| 2019-03-02|        WN|  N7832A|             6933|   RSW|    

In [25]:
# Keep a subset of the airport columns (CONTINENT and ISO_COUNTRY are left out)
# and register the result as the temporary view airportCondensed.
airportCondensed = spark.sql("""SELECT TYPE, NAME, ELEVATION_FT, MUNICIPALITY, IATA_CODE, COORDINATES FROM airport""")
airportCondensed.createOrReplaceTempView("airportCondensed")

In [26]:
# Preview the first five rows of the airportCondensed view.
spark.sql("""SELECT * FROM airportCondensed""").show(5)

+-------------+--------------------+------------+------------+---------+--------------------+
|         TYPE|                NAME|ELEVATION_FT|MUNICIPALITY|IATA_CODE|         COORDINATES|
+-------------+--------------------+------------+------------+---------+--------------------+
|     heliport|   Total Rf Heliport|        11.0|    Bensalem|     null|-74.9336013793945...|
|small_airport|Aero B Ranch Airport|      3435.0|       Leoti|     null|-101.473911, 38.7...|
|small_airport|        Lowell Field|       450.0|Anchor Point|     null|-151.695999146, 5...|
|small_airport|        Epps Airpark|       820.0|     Harvest|     null|-86.7703018188476...|
|       closed|Newport Hospital ...|       237.0|     Newport|     null| -91.254898, 35.6087|
+-------------+--------------------+------------+------------+---------+--------------------+
only showing top 5 rows



In [28]:
# Keep a subset of the ticket columns (ORIGIN_COUNTRY and ORIGIN_STATE_NM are left out)
# and register the result as the temporary view ticketsCondensed.
ticketsCondensed = spark.sql("""SELECT ITIN_ID, YEAR, QUARTER, ORIGIN, ORIGIN_STATE_ABR, DESTINATION,
              ROUNDTRIP, REPORTING_CARRIER, PASSENGERS, ITIN_FARE FROM tickets""")
ticketsCondensed.createOrReplaceTempView("ticketsCondensed")

Analytics questions on the airline data
0) How many flights are there on each date?
1) How many unique flights are there in the dataset?
2) Which flights have travelled the longest distance?
3) Which routes have the most flights?
4) Which routes have the most round trips?
5) What is the highest airport elevation?
6) Which route has the maximum revenue?
7) Which route has the maximum number of passengers?
8) What is the number of passengers carried by the aircraft to date?

In [33]:
# Number of flights operating on each date, busiest dates first (top 5 shown).
# COUNT(flight_date) counts only non-null values, so rows whose flight_date is
# null contribute 0 to their group.

spark.sql("""SELECT flight_date, COUNT(flight_date) AS flight_counts 
            FROM flightCondensed
            GROUP BY flight_date
            ORDER BY COUNT(flight_date) DESC""").show(5)



+-----------+-------------+
|flight_date|flight_counts|
+-----------+-------------+
| 2019-03-15|        23361|
| 2019-03-14|        23305|
| 2019-03-22|        23258|
| 2019-03-29|        23250|
| 2019-03-11|        23249|
+-----------+-------------+
only showing top 5 rows



                                                                                

In [39]:
# Number of trips recorded for each aircraft (TAIL_NUM), busiest aircraft first (top 5 shown).
# This lists per-aircraft trip counts; it does not return the number of unique aircraft.
spark.sql("""SELECT TAIL_NUM, 
                COUNT(TAIL_NUM) AS tripCounts
                FROM flightCondensed
                GROUP BY TAIL_NUM
                ORDER BY COUNT(TAIL_NUM) DESC""").show(5)

+--------+----------+
|TAIL_NUM|tripCounts|
+--------+----------+
|  N485HA|       928|
|  N479HA|       882|
|  N483HA|       882|
|  N480HA|       875|
|  N491HA|       873|
+--------+----------+
only showing top 5 rows



In [42]:
# Number of trips flown by each operating carrier, busiest carrier first (top 5 shown).
# COUNT(TAIL_NUM) counts only rows with a non-null tail number.
spark.sql("""SELECT OP_CARRIER,
                COUNT(TAIL_NUM) AS tripCounts
                FROM flightCondensed
                GROUP BY OP_CARRIER
                ORDER BY COUNT(TAIL_NUM) DESC""").show(5)

+----------+----------+
|OP_CARRIER|tripCounts|
+----------+----------+
|        WN|    326093|
|        AA|    232403|
|        DL|    225391|
|        OO|    194934|
|        UA|    142826|
+----------+----------+
only showing top 5 rows



In [48]:
# How many different carriers are there in the dataset?
# Counting the distinct OP_CARRIER values gives the same number as counting the
# rows of the grouped result, without computing per-carrier trip counts or
# sorting rows that are never displayed.
spark.sql("""SELECT DISTINCT OP_CARRIER
                FROM flightCondensed""").count()

26

In [47]:
# List the distinct aircraft tail numbers operated by carrier 'WN'
# (show() prints only the first 20 rows).
spark.sql("""SELECT DISTINCT TAIL_NUM
              FROM flightCondensed
              WHERE OP_CARRIER = 'WN'""").show()

+--------+
|TAIL_NUM|
+--------+
|  N8554X|
|  N8322X|
|  N954WN|
|  N8513F|
|  N445WN|
|  N8525S|
|  N914WN|
|  N8511K|
|  N251WN|
|  N496WN|
|  N8734Q|
|  N448WN|
|  N240WN|
|  N430WN|
|  N8634A|
|  N757LV|
|  N7838A|
|  N7715E|
|  N8679A|
|  N404WN|
+--------+
only showing top 20 rows



In [47]:
# List the distinct aircraft tail numbers operated by carrier 'WN'
# (show() prints only the first 20 rows). This cell repeats the preceding query unchanged.
spark.sql("""SELECT DISTINCT TAIL_NUM
              FROM flightCondensed
              WHERE OP_CARRIER = 'WN'""").show()

+--------+
|TAIL_NUM|
+--------+
|  N8554X|
|  N8322X|
|  N954WN|
|  N8513F|
|  N445WN|
|  N8525S|
|  N914WN|
|  N8511K|
|  N251WN|
|  N496WN|
|  N8734Q|
|  N448WN|
|  N240WN|
|  N430WN|
|  N8634A|
|  N757LV|
|  N7838A|
|  N7715E|
|  N8679A|
|  N404WN|
+--------+
only showing top 20 rows



In [49]:
# Total departure delay accumulated by each aircraft (TAIL_NUM), largest total first.
# This is a sum of DEP_DELAY values, not a count of delayed flights; early
# departures carry negative DEP_DELAY (see the flights preview) and reduce the total.
spark.sql("""SELECT SUM(DEP_DELAY) AS TOTAL_DELAY, TAIL_NUM
            FROM flightCondensed
            GROUP BY TAIL_NUM
            ORDER BY SUM(DEP_DELAY) DESC""").show()



+-----------+--------+
|TOTAL_DELAY|TAIL_NUM|
+-----------+--------+
|    19178.0|  N927SW|
|    16269.0|  N786SK|
|    15404.0|  N14143|
|    15099.0|  N920SW|
|    15015.0|  N21197|
|    14694.0|  N908EV|
|    14103.0|  N11191|
|    14051.0|  N954SW|
|    13938.0|  N780SK|
|    13888.0|  N758EV|
|    13885.0|  N690CA|
|    13843.0|  N693BR|
|    13755.0|  N880AS|
|    13672.0|  N466SW|
|    13518.0|  N788SK|
|    13352.0|  N134SY|
|    13292.0|  N760SK|
|    13198.0|  N871AS|
|    13173.0|  N331CA|
|    13172.0|  N791SK|
+-----------+--------+
only showing top 20 rows



                                                                                

In [50]:
# Total departure delay accumulated by each operating carrier, largest total first.
spark.sql("""SELECT SUM(DEP_DELAY) AS TOTAL_DELAY, OP_CARRIER
            FROM flightCondensed
            GROUP BY OP_CARRIER
            ORDER BY SUM(DEP_DELAY) DESC""").show()

+-----------+----------+
|TOTAL_DELAY|OP_CARRIER|
+-----------+----------+
|  3179490.0|        WN|
|  3014699.0|        OO|
|  2251096.0|        AA|
|  1785352.0|        DL|
|  1708696.0|        UA|
|  1308290.0|        B6|
|   686371.0|        MQ|
|   623717.0|        9E|
|   603639.0|        YX|
|   590527.0|        OH|
|   569127.0|        YV|
|   520857.0|        EV|
|   401598.0|        G7|
|   389617.0|        NK|
|   361070.0|        F9|
|   350897.0|        AX|
|   348663.0|        AS|
|   335448.0|        C5|
|   326916.0|        CP|
|   243367.0|        ZW|
+-----------+----------+
only showing top 20 rows



In [51]:
# Total arrival delay accumulated by each aircraft (TAIL_NUM), largest total first.
# This is a sum of ARR_DELAY values, not a count of delayed flights; early
# arrivals carry negative ARR_DELAY (see the flights preview) and reduce the total.
spark.sql("""SELECT SUM(ARR_DELAY) AS TOTAL_DELAY, TAIL_NUM
            FROM flightCondensed
            GROUP BY TAIL_NUM
            ORDER BY SUM(ARR_DELAY) DESC""").show()

+-----------+--------+
|TOTAL_DELAY|TAIL_NUM|
+-----------+--------+
|    17831.0|  N927SW|
|    15995.0|  N786SK|
|    14752.0|  N14143|
|    14664.0|  N21197|
|    14096.0|  N920SW|
|    13809.0|  N908EV|
|    13369.0|  N780SK|
|    13311.0|  N788SK|
|    13231.0|  N11191|
|    13060.0|  N693BR|
|    12944.0|  N954SW|
|    12920.0|  N880AS|
|    12713.0|  N758EV|
|    12712.0|  N760SK|
|    12607.0|  N134SY|
|    12543.0|  N466SW|
|    12371.0|  N18120|
|    12348.0|  N791SK|
|    12254.0|  N782SK|
|    12130.0|  N14204|
+-----------+--------+
only showing top 20 rows



In [52]:
# Total arrival delay accumulated by each operating carrier, largest total first.
spark.sql("""SELECT SUM(ARR_DELAY) AS TOTAL_DELAY, OP_CARRIER
            FROM flightCondensed
            GROUP BY OP_CARRIER
            ORDER BY SUM(ARR_DELAY) DESC""").show()

+-----------+----------+
|TOTAL_DELAY|OP_CARRIER|
+-----------+----------+
|  2173234.0|        OO|
|  1228558.0|        AA|
|  1017167.0|        UA|
|   893889.0|        B6|
|   750513.0|        WN|
|   559109.0|        MQ|
|   457229.0|        EV|
|   454123.0|        YV|
|   354137.0|        YX|
|   313872.0|        C5|
|   311863.0|        AX|
|   275975.0|        G7|
|   254221.0|        OH|
|   242416.0|        CP|
|   205151.0|        AS|
|   191928.0|        9E|
|   177492.0|        F9|
|   175002.0|        QX|
|   157844.0|        ZW|
|   147277.0|        G4|
+-----------+----------+
only showing top 20 rows



In [60]:
# Total departure and arrival delay per date, ordered by date.
# tail(10) returns the last ten rows (the latest dates) as a list of Row objects.
spark.sql("""SELECT flight_date, SUM(DEP_DELAY) AS DEP_TOTAL_DELAY,
                    SUM(ARR_DELAY) AS ARR_TOTAL_DELAY
                    FROM flightCondensed
                    GROUP BY flight_date
                    ORDER BY flight_date""").tail(10)

                                                                                

[Row(flight_date='2019-03-22', DEP_TOTAL_DELAY=258541.0, ARR_TOTAL_DELAY=133729.0),
 Row(flight_date='2019-03-23', DEP_TOTAL_DELAY=116769.0, ARR_TOTAL_DELAY=-34271.0),
 Row(flight_date='2019-03-24', DEP_TOTAL_DELAY=144381.0, ARR_TOTAL_DELAY=-27920.0),
 Row(flight_date='2019-03-25', DEP_TOTAL_DELAY=183577.0, ARR_TOTAL_DELAY=47952.0),
 Row(flight_date='2019-03-26', DEP_TOTAL_DELAY=75968.0, ARR_TOTAL_DELAY=-76875.0),
 Row(flight_date='2019-03-27', DEP_TOTAL_DELAY=117087.0, ARR_TOTAL_DELAY=-39843.0),
 Row(flight_date='2019-03-28', DEP_TOTAL_DELAY=95720.0, ARR_TOTAL_DELAY=-69509.0),
 Row(flight_date='2019-03-29', DEP_TOTAL_DELAY=103217.0, ARR_TOTAL_DELAY=-57336.0),
 Row(flight_date='2019-03-30', DEP_TOTAL_DELAY=69542.0, ARR_TOTAL_DELAY=-74062.0),
 Row(flight_date='2019-03-31', DEP_TOTAL_DELAY=135289.0, ARR_TOTAL_DELAY=-3327.0)]

In [61]:
# Same per-date delay totals as the preceding cell; head(10) returns the first
# ten rows as a list of Row objects. The group whose flight_date is null sorts
# first, so it appears at the top of this result.
spark.sql("""SELECT flight_date, SUM(DEP_DELAY) AS DEP_TOTAL_DELAY,
                    SUM(ARR_DELAY) AS ARR_TOTAL_DELAY
                    FROM flightCondensed
                    GROUP BY flight_date
                    ORDER BY flight_date""").head(10)

                                                                                

[Row(flight_date=None, DEP_TOTAL_DELAY=30560.0, ARR_TOTAL_DELAY=3950.0),
 Row(flight_date='2019-01-01', DEP_TOTAL_DELAY=232562.0, ARR_TOTAL_DELAY=128217.0),
 Row(flight_date='2019-01-02', DEP_TOTAL_DELAY=291099.0, ARR_TOTAL_DELAY=193013.0),
 Row(flight_date='2019-01-03', DEP_TOTAL_DELAY=193078.0, ARR_TOTAL_DELAY=56174.0),
 Row(flight_date='2019-01-04', DEP_TOTAL_DELAY=110566.0, ARR_TOTAL_DELAY=-52259.0),
 Row(flight_date='2019-01-05', DEP_TOTAL_DELAY=159352.0, ARR_TOTAL_DELAY=30358.0),
 Row(flight_date='2019-01-06', DEP_TOTAL_DELAY=259492.0, ARR_TOTAL_DELAY=120118.0),
 Row(flight_date='2019-01-07', DEP_TOTAL_DELAY=135680.0, ARR_TOTAL_DELAY=-5475.0),
 Row(flight_date='2019-01-08', DEP_TOTAL_DELAY=65511.0, ARR_TOTAL_DELAY=-82616.0),
 Row(flight_date='2019-01-09', DEP_TOTAL_DELAY=45445.0, ARR_TOTAL_DELAY=-92416.0)]

In [73]:
# Individual flights on 2019-01-08 with their carrier and tail number,
# ordered from the largest departure delay downwards.
# The date is hard-coded; rows are per flight, not aggregated per carrier.
spark.sql("""SELECT flight_date, OP_CARRIER,DEP_DELAY,TAIL_NUM
                FROM flightCondensed
                WHERE flight_date = '2019-01-08'
                ORDER BY DEP_DELAY DESC""").show()

[Stage 108:>                                                        (0 + 4) / 4]

+-----------+----------+---------+--------+
|flight_date|OP_CARRIER|DEP_DELAY|TAIL_NUM|
+-----------+----------+---------+--------+
| 2019-01-08|        AA|   1281.0|  N905NN|
| 2019-01-08|        OO|   1014.0|  N805SK|
| 2019-01-08|        OO|   1010.0|  N805SK|
| 2019-01-08|        AA|    837.0|  N315RJ|
| 2019-01-08|        G7|    833.0|  N153GJ|
| 2019-01-08|        AX|    829.0|  N11155|
| 2019-01-08|        EV|    683.0|  N14905|
| 2019-01-08|        DL|    655.0|  N955AT|
| 2019-01-08|        9E|    654.0|  N176PQ|
| 2019-01-08|        OO|    631.0|  N703SK|
| 2019-01-08|        C5|    629.0|  N16183|
| 2019-01-08|        AA|    611.0|  N967AN|
| 2019-01-08|        G7|    599.0|  N369CA|
| 2019-01-08|        OO|    531.0|  N603SK|
| 2019-01-08|        OO|    506.0|  N873AS|
| 2019-01-08|        C5|    492.0|  N27190|
| 2019-01-08|        YV|    443.0|  N507MJ|
| 2019-01-08|        OO|    438.0|  N906SW|
| 2019-01-08|        C5|    425.0|  N12195|
| 2019-01-08|        UA|    412.

