## Documentation: https://spark.apache.org/docs/latest/api/python/pyspark.sql.html

Initilizing SQLContext

In [52]:
import pyspark
!rm -rf metastore_db/
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

To use RDD in SQLContext, RDD lines have to be converted into a Row format

In [53]:
import re
from pyspark.sql import Row
# Read from CSV
def load_csv(line):
    return re.split("\"?,\"?", line)
        
def readInt(x):
    try:
        return int(x)
    except:
        return x

def parseElement(e):
    return Row( year=readInt(e[0]),
                month=readInt(e[1]),
                day=readInt(e[2]),
                dep_time=readInt(e[3]),
                dep_delay=readInt(e[4]),
                arr_time=readInt(e[5]),
                arr_delay=readInt(e[6]),
                cancelled=e[7],
                carrier=e[8],
                tailnum=e[9],
                flight=readInt(e[10]),
                origin=e[11],
                dest=e[12],
                air_time=readInt(e[13]),
                distance=readInt(e[14]),
                hour=readInt(e[15]),
                min=readInt(e[16])
    )

In [6]:
!wget -P /tmp https://dsr-data.s3.amazonaws.com/flights/flights14.csv
#!hadoop fs -put /tmp/flights14.csv /tmp

--2017-01-25 11:14:36--  https://dsr-data.s3.amazonaws.com/flights/flights14.csv
Resolving dsr-data.s3.amazonaws.com (dsr-data.s3.amazonaws.com)... 52.219.73.10
Connecting to dsr-data.s3.amazonaws.com (dsr-data.s3.amazonaws.com)|52.219.73.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16150465 (15M) [text/csv]
Saving to: ‘/tmp/flights14.csv.1’


2017-01-25 11:14:39 (5.32 MB/s) - ‘/tmp/flights14.csv.1’ saved [16150465/16150465]



In [54]:
flights = sc.textFile("/tmp/flights14.csv").map(load_csv).filter(lambda e: not e[0] == "year").map(parseElement).cache()

Now the schema and the RDD have to be registered with the sqlContext:

In [55]:
flightsDF = sqlContext.createDataFrame(flights)

In [56]:
flightsDF.registerTempTable("flight")

In [57]:
sqlContext.sql("select * from flight where dest = 'LAX'").take(5)

[Row(air_time=359, arr_delay=13, arr_time=1238, cancelled='0', carrier='AA', day=1, dep_delay=14, dep_time=914, dest='LAX', distance=2475, flight=1, hour=9, min=14, month=1, origin='JFK', tailnum='N338AA', year=2014),
 Row(air_time=363, arr_delay=13, arr_time=1523, cancelled='0', carrier='AA', day=1, dep_delay=-3, dep_time=1157, dest='LAX', distance=2475, flight=3, hour=11, min=57, month=1, origin='JFK', tailnum='N335AA', year=2014),
 Row(air_time=351, arr_delay=9, arr_time=2224, cancelled='0', carrier='AA', day=1, dep_delay=2, dep_time=1902, dest='LAX', distance=2475, flight=21, hour=19, min=2, month=1, origin='JFK', tailnum='N327AA', year=2014),
 Row(air_time=350, arr_delay=1, arr_time=1706, cancelled='0', carrier='AA', day=1, dep_delay=2, dep_time=1347, dest='LAX', distance=2475, flight=117, hour=13, min=47, month=1, origin='JFK', tailnum='N319AA', year=2014),
 Row(air_time=339, arr_delay=0, arr_time=2145, cancelled='0', carrier='AA', day=1, dep_delay=4, dep_time=1824, dest='LAX', d

In [58]:
flightsDF.where("origin = 'JFK' AND dest = 'MIA'").count()

2750

In [59]:
flightsDF.where("origin = 'JFK' AND dest = 'MIA'").limit(2).collect()

[Row(air_time=161, arr_delay=-17, arr_time=1828, cancelled='0', carrier='AA', day=1, dep_delay=-1, dep_time=1509, dest='MIA', distance=1089, flight=145, hour=15, min=9, month=1, origin='JFK', tailnum='N5FJAA', year=2014),
 Row(air_time=166, arr_delay=-8, arr_time=1227, cancelled='0', carrier='AA', day=1, dep_delay=7, dep_time=917, dest='MIA', distance=1089, flight=1085, hour=9, min=17, month=1, origin='JFK', tailnum='N5DWAA', year=2014)]

### Count the flights that departed early and arrived late

In [60]:
flightsDF.where("dep_delay>0 AND arr_delay>0").count()

72836

### Find the flightwith the longest arrival delay

In [74]:
sqlContext.sql("SELECT MAX(arr_delay) FROM flight").take(1)
#sqlContext.sql("SELECT * FROM flight HAVING arr_delay = MAX(arr_delay)").take(1)

[Row(max(arr_delay)=1494)]

### Find the top 10 destinations ordered by the number of flights

In [62]:
sqlContext.sql("select dest,count(dest) from flight group by dest order by count(dest) DESC").take(10)

[Row(dest='LAX', count(dest)=14434),
 Row(dest='ATL', count(dest)=12808),
 Row(dest='SFO', count(dest)=11907),
 Row(dest='MCO', count(dest)=11709),
 Row(dest='BOS', count(dest)=11609),
 Row(dest='ORD', count(dest)=11589),
 Row(dest='MIA', count(dest)=9928),
 Row(dest='CLT', count(dest)=9624),
 Row(dest='FLL', count(dest)=9471),
 Row(dest='DCA', count(dest)=6748)]

### Find top 10 destinations with the worst avg arrival delay, ignoring flights that arrived early

In [51]:
sqlContext.sql("select dest,avg(arr_delay) from flight where arr_delay >= 0 group by dest order by avg(arr_delay) DESC").take(10)

[Row(dest='EGE', avg(arr_delay)=78.03174603174604),
 Row(dest='AVP', avg(arr_delay)=67.0),
 Row(dest='CAK', avg(arr_delay)=55.86141304347826),
 Row(dest='TUL', avg(arr_delay)=55.122950819672134),
 Row(dest='MSN', avg(arr_delay)=53.32692307692308),
 Row(dest='BGR', avg(arr_delay)=52.28813559322034),
 Row(dest='OKC', avg(arr_delay)=51.3609022556391),
 Row(dest='IAD', avg(arr_delay)=51.200559049615656),
 Row(dest='JAC', avg(arr_delay)=51.142857142857146),
 Row(dest='TVC', avg(arr_delay)=49.64705882352941)]

### Take a sample of 1% of the flights and then calculate the average departure delay for that sample

In [160]:
flightsDFmini = flightsDF.sample(withReplacement=False,fraction=.01)
flightsDFmini.registerTempTable("flightmini")

In [162]:
sqlContext.sql("select avg(dep_delay) from flightmini").take(1)

[Row(avg(dep_delay)=14.335278101905873)]

### For all flights from JFK during June, show the average departure delay for each destination

In [66]:
sqlContext.sql("select dest,avg(dep_delay) from flight group by dest order by avg(dep_delay) desc").take(10)

[Row(dest='AVP', avg(dep_delay)=83.0),
 Row(dest='JAC', avg(dep_delay)=34.1),
 Row(dest='EGE', avg(dep_delay)=29.527272727272727),
 Row(dest='TUL', avg(dep_delay)=28.895522388059703),
 Row(dest='MSN', avg(dep_delay)=27.52777777777778),
 Row(dest='CAK', avg(dep_delay)=26.943502824858758),
 Row(dest='TVC', avg(dep_delay)=26.303571428571427),
 Row(dest='OKC', avg(dep_delay)=26.15695067264574),
 Row(dest='BHM', avg(dep_delay)=25.943548387096776),
 Row(dest='MTJ', avg(dep_delay)=24.5)]

### For every origin/dest pair, count the number of flights

In [72]:
sqlContext.sql("select origin,dest,count(origin) from flight group by dest, origin order by count(origin) desc").take(20)

[Row(origin='JFK', dest='LAX', count(origin)=10208),
 Row(origin='JFK', dest='SFO', count(origin)=7368),
 Row(origin='LGA', dest='ORD', count(origin)=7052),
 Row(origin='LGA', dest='ATL', count(origin)=6925),
 Row(origin='LGA', dest='MIA', count(origin)=5084),
 Row(origin='EWR', dest='SFO', count(origin)=4539),
 Row(origin='JFK', dest='MCO', count(origin)=4467),
 Row(origin='EWR', dest='BOS', count(origin)=4268),
 Row(origin='EWR', dest='LAX', count(origin)=4226),
 Row(origin='EWR', dest='ATL', count(origin)=4182),
 Row(origin='EWR', dest='MCO', count(origin)=4164),
 Row(origin='JFK', dest='BOS', count(origin)=4111),
 Row(origin='JFK', dest='SJU', count(origin)=4027),
 Row(origin='EWR', dest='CLT', count(origin)=3921),
 Row(origin='LGA', dest='DFW', count(origin)=3789),
 Row(origin='LGA', dest='DCA', count(origin)=3753),
 Row(origin='LGA', dest='DTW', count(origin)=3663),
 Row(origin='LGA', dest='CLT', count(origin)=3431),
 Row(origin='JFK', dest='LAS', count(origin)=3355),
 Row(origin