In [2]:
# Obtain departure Delays data
delay = spark.read.csv("s3n://csed516/Flights/flights.csv", header="true", inferSchema="true")
delay.registerTempTable("departureDelays")
delay.cache()
print(delay.columns)

# Obtain airports dataset
airports = spark.read.csv("s3n://csed516/Flights/airports.csv", header="true", inferSchema="true")
airports.registerTempTable("airports")
print(airports.columns)

['FL_DATE', 'CARRIER', 'ORIGIN', 'ORIGIN_CITY_NAME', 'ORIGIN_STATE_ABR', 'DEST', 'DEST_CITY_NAME', 'DEST_STATE_ABR', 'DEP_TIME', 'DEP_DELAY', 'ARR_DELAY']
['IATA_CODE', 'AIRPORT', 'CITY', 'STATE', 'COUNTRY', 'LATITUDE', 'LONGITUDE']


In [3]:
# create trip IATA codes table
tripIATA = sqlContext.sql("""
  SELECT DISTINCT IATA FROM(
    SELECT DISTINCT ORIGIN AS iata FROM departureDelays 
    UNION ALL SELECT DISTINCT DEST AS iata FROM departureDelays) x""")
tripIATA.registerTempTable("tripIATA")

# Merge airport data with tripIATA data
airports = sqlContext.sql("""
   SELECT IATA_CODE AS IATA, City, State, Country 
   FROM airports
   JOIN tripIATA trip ON trip.IATA = airports.IATA_CODE""")
airports.registerTempTable("airports")
airports.cache()

# Build `departureDelays_geo` DataFrame
# Obtain key attributes such as Date of flight, delays, distance,
# and airport information (Origin, Destination)
departureDelays_geo = sqlContext.sql("""
    SELECT CAST(f.FL_DATE as int) as tripid,
           CAST(CONCAT(CONCAT(CONCAT(CONCAT(CONCAT(CONCAT('2017-', CONCAT(CONCAT(SUBSTR(CAST(f.FL_DATE as string), 1, 2), '-')), SUBSTR(CAST(f.FL_DATE AS string), 3, 2)), ' '), SUBSTR(CAST(f.FL_DATE AS string), 5, 2)), ':'), SUBSTR(CAST(f.FL_DATE AS string), 7, 2)), ':00') as timestamp) as localdate,
           CAST(f.DEP_DELAY as int) as delay, 
           f.ORIGIN AS src, f.DEST AS dst, o.city AS city_src,
           d.city AS city_dst, 
           o.state AS state_src,
           d.state AS state_dst
    FROM departureDelays f 
    JOIN airports o 
        ON o.IATA = f.ORIGIN 
    JOIN airports d 
        ON d.IATA = f.DEST""")
departureDelays_geo.registerTempTable("departureDelays_geo")

# Cache and Count
departureDelays_geo.cache()
departureDelays_geo.count()

1526121

In [4]:
from pyspark.sql.functions import *
from graphframes import *

tripVertices = airports.withColumnRenamed("IATA", "id").distinct()
tripEdges = departureDelays_geo.select("tripid", "delay", "src", "dst", "city_dst", "state_dst")

# Cache the graph
tripEdges.cache()
tripVertices.cache()

# Examine the vertices and edges
tripVertices.show()
tripEdges.show()

# Build the graph
tripGraph = GraphFrame(tripVertices, tripEdges)
print tripGraph

# Project columns to build the smaller datastructure
tripEdgesPrime = departureDelays_geo.select("tripid", "delay", "src", "dst")
tripGraphPrime = GraphFrame(tripVertices, tripEdgesPrime)

+---+----------------+-----+-------+
| id|            City|State|Country|
+---+----------------+-----+-------+
|FAT|          Fresno|   CA|    USA|
|CMH|        Columbus|   OH|    USA|
|PHX|         Phoenix|   AZ|    USA|
|ITH|          Ithaca|   NY|    USA|
|PAH|         Paducah|   KY|    USA|
|PSE|           Ponce|   PR|    USA|
|COS|Colorado Springs|   CO|    USA|
|RNO|            Reno|   NV|    USA|
|MYR|    Myrtle Beach|   SC|    USA|
|JMS|       Jamestown|   ND|    USA|
|SRQ|        Sarasota|   FL|    USA|
|VLD|        Valdosta|   GA|    USA|
|PSC|           Pasco|   WA|    USA|
|CAE|        Columbia|   SC|    USA|
|LAX|     Los Angeles|   CA|    USA|
|DAY|          Dayton|   OH|    USA|
|MFR|         Medford|   OR|    USA|
|JFK|        New York|   NY|    USA|
|LAS|       Las Vegas|   NV|    USA|
|BNA|       Nashville|   TN|    USA|
+---+----------------+-----+-------+
only showing top 20 rows

+----------+-----+---+---+----------+---------+
|    tripid|delay|src|dst|  city_dst|s

In [5]:
#number of vertices and edges?
print "Airports: %d" % tripGraph.vertices.count()
print "Trips: %d" % tripGraph.edges.count()

Airports: 299
Trips: 1526121


In [6]:
#Finding the longest Delay
longestDelay = tripGraph.edges.groupBy().max("delay")
longestDelay.show()
print "On-time / Early Flights: %d" % tripGraph.edges.filter("delay <= 0").count()
print "Delayed Flights: %d" % tripGraph.edges.filter("delay > 0").count()

+----------+
|max(delay)|
+----------+
|      1916|
+----------+

On-time / Early Flights: 919041
Delayed Flights: 590844


In [7]:
# Degrees
tripGraph.degrees.sort(desc("degree")).show()

+---+------+
| id|degree|
+---+------+
|ATL|193125|
|ORD|153606|
|DEN|123894|
|LAX|115020|
|DFW| 96462|
|SFO| 94164|
|SEA| 80352|
|LAS| 79362|
|PHX| 79056|
|MSP| 75747|
|MCO| 71070|
|IAH| 69042|
|BOS| 66765|
|DTW| 65658|
|SLC| 60012|
|EWR| 59940|
|CLT| 58299|
|BWI| 55560|
|JFK| 50793|
|MDW| 48510|
+---+------+
only showing top 20 rows



In [9]:
tripGraph.edges.filter("src = 'SFO' and delay > 0") \
               .groupBy("src", "dst") \
               .avg("delay") \
               .sort(desc("avg(delay)")) \
               .show()

+---+---+------------------+
|src|dst|        avg(delay)|
+---+---+------------------+
|SFO|JAC|           154.625|
|SFO|COS| 92.71428571428571|
|SFO|FAT| 91.11764705882354|
|SFO|FCA| 88.82352941176471|
|SFO|ABQ| 78.86666666666666|
|SFO|SBP|            71.125|
|SFO|ONT| 70.76923076923077|
|SFO|SAT|              68.0|
|SFO|ACV| 65.47826086956522|
|SFO|PSC|            58.625|
|SFO|OTH| 54.77777777777778|
|SFO|MSY| 54.57142857142857|
|SFO|TUS| 52.11538461538461|
|SFO|SUN|            51.375|
|SFO|LGB| 50.92063492063492|
|SFO|ORD| 50.45514950166113|
|SFO|RDD|              49.8|
|SFO|BFL|48.416666666666664|
|SFO|SMF| 48.04081632653061|
|SFO|EWR|47.767175572519086|
+---+---+------------------+
only showing top 20 rows



In [12]:
# States with the longest average delays 
# (with individual delays > 100 minutes) (origin: Seattle)
tripGraph.edges.filter("src = 'SEA' and delay > 100").groupBy('dst').avg('delay').show()

+---+------------------+
|dst|        avg(delay)|
+---+------------------+
|GEG| 280.6666666666667|
|BUR|             181.0|
|SNA|             137.0|
|EUG|             192.5|
|OAK|            157.75|
|DCA|             110.0|
|RDM|             425.0|
|KTN|             108.5|
|LIH|             124.0|
|IAH|182.57142857142858|
|HNL|             139.0|
|SJC|             188.0|
|LGB|             165.4|
|BOS|             153.5|
|EWR|133.33333333333334|
|LAS|           197.125|
|FAI|             127.0|
|DEN|             130.3|
|BOI|             223.0|
|IAD|             154.0|
+---+------------------+
only showing top 20 rows

