In [1]:
sc

org.apache.spark.SparkContext@5f41f8c6

In [2]:
// Data location
val airlinesPath="hdfs:///user/swethakolalapudi/flightDelaysData/airlines.csv"
val airportsPath="hdfs:///user/swethakolalapudi/flightDelaysData/airports.csv"
val flightsPath="hdfs:///user/swethakolalapudi/flightDelaysData/flights.csv"

In [3]:
// Load one dataset 
val airlines=sc.textFile(airlinesPath)

In [4]:
airlines

hdfs:///user/swethakolalapudi/flightDelaysData/airlines.csv MapPartitionsRDD[2] at textFile at <console>:23

In [6]:
// View the entire dataset
airlines.collect()

Array(Code,Description, "19031","Mackey International Inc.: MAC", "19032","Munz Northern Airlines Inc.: XY", "19033","Cochise Airlines Inc.: COC", "19034","Golden Gate Airlines Inc.: GSA", "19035","Aeromech Inc.: RZZ", "19036","Golden West Airlines Co.: GLW", "19037","Puerto Rico Intl Airlines: PRN", "19038","Air America Inc.: STZ", "19039","Swift Aire Lines Inc.: SWT", "19040","American Central Airlines: TSF", "19041","Valdez Airlines: VEZ", "19042","Southeast Alaska Airlines: WEB", "19043","Altair Airlines Inc.: AAR", "19044","Chitina Air Service: CHI", "19045","Marco Island Airways Inc.: MRC", "19046","Caribbean Air Services Inc.: OHZ", "19047","Sundance Airlines: PRO", "19048","Seair Alaska Airlines Inc.: SAI", "19049","Southeast Airlines Inc.: SLZ", "19050","A...

In [7]:
// Get the first line
airlines.first()

Code,Description

In [8]:
// View a few lines
airlines.take(10)

Array(Code,Description, "19031","Mackey International Inc.: MAC", "19032","Munz Northern Airlines Inc.: XY", "19033","Cochise Airlines Inc.: COC", "19034","Golden Gate Airlines Inc.: GSA", "19035","Aeromech Inc.: RZZ", "19036","Golden West Airlines Co.: GLW", "19037","Puerto Rico Intl Airlines: PRN", "19038","Air America Inc.: STZ", "19039","Swift Aire Lines Inc.: SWT")

In [9]:
airlines.count()

1580

In [5]:
airlines.filter(x => !x.contains("Description"))

MapPartitionsRDD[3] at filter at <console>:25

In [10]:
val airlinesWoHeader=airlines.filter(x => !x.contains("Description"))

In [11]:
airlinesWoHeader.take(10)

Array("19031","Mackey International Inc.: MAC", "19032","Munz Northern Airlines Inc.: XY", "19033","Cochise Airlines Inc.: COC", "19034","Golden Gate Airlines Inc.: GSA", "19035","Aeromech Inc.: RZZ", "19036","Golden West Airlines Co.: GLW", "19037","Puerto Rico Intl Airlines: PRN", "19038","Air America Inc.: STZ", "19039","Swift Aire Lines Inc.: SWT", "19040","American Central Airlines: TSF")

In [12]:
val airlinesParsed=airlinesWoHeader.map(_.split(','))

In [13]:
airlinesParsed.take(10)

Array(Array("19031", "Mackey International Inc.: MAC"), Array("19032", "Munz Northern Airlines Inc.: XY"), Array("19033", "Cochise Airlines Inc.: COC"), Array("19034", "Golden Gate Airlines Inc.: GSA"), Array("19035", "Aeromech Inc.: RZZ"), Array("19036", "Golden West Airlines Co.: GLW"), Array("19037", "Puerto Rico Intl Airlines: PRN"), Array("19038", "Air America Inc.: STZ"), Array("19039", "Swift Aire Lines Inc.: SWT"), Array("19040", "American Central Airlines: TSF"))

In [14]:
airlines.map(_.length).take(10)

Array(16, 40, 41, 36, 40, 28, 39, 40, 31, 36)

In [15]:
def notHeader(row: String): Boolean = {
    !row.contains("Description")
    }
airlines.filter(notHeader).take(10)

Array("19031","Mackey International Inc.: MAC", "19032","Munz Northern Airlines Inc.: XY", "19033","Cochise Airlines Inc.: COC", "19034","Golden Gate Airlines Inc.: GSA", "19035","Aeromech Inc.: RZZ", "19036","Golden West Airlines Co.: GLW", "19037","Puerto Rico Intl Airlines: PRN", "19038","Air America Inc.: STZ", "19039","Swift Aire Lines Inc.: SWT", "19040","American Central Airlines: TSF")

In [20]:
airlines.filter(notHeader).map(_.split(",")).take(10)

Array(Array("19031", "Mackey International Inc.: MAC"), Array("19032", "Munz Northern Airlines Inc.: XY"), Array("19033", "Cochise Airlines Inc.: COC"), Array("19034", "Golden Gate Airlines Inc.: GSA"), Array("19035", "Aeromech Inc.: RZZ"), Array("19036", "Golden West Airlines Co.: GLW"), Array("19037", "Puerto Rico Intl Airlines: PRN"), Array("19038", "Air America Inc.: STZ"), Array("19039", "Swift Aire Lines Inc.: SWT"), Array("19040", "American Central Airlines: TSF"))

In [37]:
airlinesWoHeader.map(_.replace("\"","")).map(_.split(',')).map(x => (x(0).toInt,x(1))).take(10)

Array((19031,Mackey International Inc.: MAC), (19032,Munz Northern Airlines Inc.: XY), (19033,Cochise Airlines Inc.: COC), (19034,Golden Gate Airlines Inc.: GSA), (19035,Aeromech Inc.: RZZ), (19036,Golden West Airlines Co.: GLW), (19037,Puerto Rico Intl Airlines: PRN), (19038,Air America Inc.: STZ), (19039,Swift Aire Lines Inc.: SWT), (19040,American Central Airlines: TSF))

In [174]:
def parseLookup(row: String): (String,String)={
 val x = row.replace("\"","").split(',')
 (x(0),x(1))
}

In [1]:
import org.joda.time._
import org.joda.time.format._
import org.joda.time.LocalTime
import org.joda.time.LocalDate

case class Flight(date: LocalDate,
                  airline: String ,
                  flightnum: String,
                  origin: String ,
                  dest: String ,
                  dep: LocalTime,
                  dep_delay: Double,
                  arv: LocalTime,
                  arv_delay: Double ,
                  airtime: Double ,
                  distance: Double
                   )



In [2]:




def parse(row: String): Flight={

  val fields = row.split(",")
  val datePattern = DateTimeFormat.forPattern("YYYY-mm-dd")
  val timePattern = DateTimeFormat.forPattern("HHmm")

  val date: LocalDate = datePattern.parseDateTime(fields(0)).toLocalDate()
  val airline: String = fields(1)
  val flightnum: String = fields(2)
  val origin: String = fields(3)
  val dest: String = fields(4)
  val dep: LocalTime = timePattern.parseDateTime(fields(5)).toLocalTime()
  val dep_delay: Double = fields(6).toDouble
  val arv: LocalTime = timePattern.parseDateTime(fields(7)).toLocalTime()
  val arv_delay: Double = fields(8).toDouble
  val airtime: Double = fields(9).toDouble
  val distance: Double = fields(10).toDouble
  
  Flight(date,airline,flightnum,origin,dest,dep,
         dep_delay,arv,arv_delay,airtime,distance)

    }



In [6]:
val flights=sc.textFile(flightsPath)

In [7]:
flights

hdfs:///user/swethakolalapudi/flightDelaysData/flights.csv MapPartitionsRDD[5] at textFile at <console>:22

In [1]:
// The total number of records 
flights.count()

476881

In [3]:
// The first row
flights.first()

2014-04-01,19805,1,JFK,LAX,0854,-6.00,1217,2.00,355.00,2475.00

In [1]:
flights.map(_.split(","))

MapPartitionsRDD[6] at map at <console>:25

In [2]:
flights.map(x => x.split(","))

MapPartitionsRDD[7] at map at <console>:25

In [3]:
val flightsParsed=flights.map(parse)

In [4]:
// Let's take a look at the data in the Parsed RDD 
flightsParsed.first()

Flight(2014-01-01,19805,1,JFK,LAX,08:54:00.000,-6.0,12:17:00.000,2.0,355.0,2475.0)

In [112]:
val totalDistance=flightsParsed.map(_.distance).reduce((x,y) => x+y)

In [115]:
val avgDistance=totalDistance/flightsParsed.count()

In [116]:
println(avgDistance)

794.8585013871385


In [117]:
// % flights with delays
flightsParsed.filter(_.dep_delay>0).count().toDouble/flightsParsed.count().toDouble

0.3753871510922012

In [118]:
flightsParsed.persist()

MapPartitionsRDD[38] at map at <console>:272

In [133]:
val sumCount=flightsParsed.map(_.dep_delay).aggregate((0.0,0))((acc, value) => (acc._1 + value, acc._2+1),
                                                           (acc1,acc2) => (acc1._1+acc2._1,acc1._2+acc2._2))



In [126]:
sumCount._1/sumCount._2

8.313877046894298

In [130]:
sumCount.getClass

class scala.Tuple2$mcDI$sp

In [140]:
// Histogram of delays
flightsParsed.map(x => (x.dep_delay/60).toInt).countByValue()

Map(0 -> 452963, 5 -> 249, 10 -> 15, 24 -> 3, 25 -> 1, 14 -> 13, 20 -> 4, 1 -> 16016, 6 -> 113, 28 -> 1, 21 -> 3, 9 -> 26, 13 -> 15, 2 -> 4893, 17 -> 2, 12 -> 9, 7 -> 66, 3 -> 1729, 11 -> 12, 8 -> 43, 4 -> 701, 15 -> 4)

In [153]:
val airportDelays = flightsParsed.map(x => (x.origin,x.dep_delay))

In [154]:
airportDelays.keys.take(10)

Array(JFK, LAX, JFK, LAX, DFW, OGG, DFW, HNL, JFK, LAX)

In [155]:
airportDelays.values.take(10)

Array(-6.0, 14.0, -6.0, 25.0, -5.0, 126.0, 125.0, 4.0, -7.0, 21.0)

In [157]:
val airportTotalDelay=airportDelays.reduceByKey((x,y) => x+y)

In [158]:
val airportCount=airportDelays.mapValues(x => 1).reduceByKey((x,y) => x+y)

In [159]:
val airportSumCount=airportTotalDelay.join(airportCount)

In [160]:
val airportAvgDelay=airportSumCount.mapValues(x => x._1/x._2.toDouble)

In [161]:
airportAvgDelay.sortBy(-_._2).take(10)

Array((PPG,56.25), (EGE,32.0), (OTH,24.533333333333335), (LAR,18.892857142857142), (RDD,18.55294117647059), (MTJ,18.363636363636363), (PUB,17.54), (EWR,16.478549005929544), (CIC,15.931034482758621), (RST,15.6993006993007))

In [163]:
val airportSumCount2=airportDelays.combineByKey(
                                            value => (value,1),
                                            (acc: (Double,Int), value) =>  (acc._1 + value, acc._2+1),
                                            (acc1: (Double,Int), acc2: (Double,Int)) => (acc1._1+acc2._1,acc1._2+acc2._2))

In [164]:
val airportAvgDelay2=airportSumCount2.mapValues(x => x._1/x._2.toDouble)

In [165]:
airportAvgDelay2.sortBy(-_._2).take(10)

Array((PPG,56.25), (EGE,32.0), (OTH,24.533333333333335), (LAR,18.892857142857142), (RDD,18.55294117647059), (MTJ,18.363636363636363), (PUB,17.54), (EWR,16.478549005929544), (CIC,15.931034482758621), (RST,15.6993006993007))

In [175]:
val airports=sc.textFile(airportsPath).filter(notHeader).map(parseLookup)

In [176]:
airports.lookup("PPG")

WrappedArray(Pago Pago)

In [178]:
val airportLookup=airports.collectAsMap

In [180]:
airportLookup("CLI")

Clintonville

In [181]:
airportAvgDelay.map(x=>(airportLookup(x._1),x._2)).take(10)

Array((Santa Maria,5.285714285714286), (Wichita Falls,8.717948717948717), (Manhattan/Ft. Riley,3.9705882352941178), (Bloomington/Normal,4.86), (Helena,-2.048076923076923), (Sun Valley/Hailey/Ketchum,-4.408163265306122), (Richmond,8.803352675693102), (Ponce,-0.8103448275862069), (Salt Lake City,3.5174873446847674), (New Bern/Morehead/Beaufort,5.660714285714286))

In [182]:
val airportBC=sc.broadcast(airportLookup)

In [183]:
airportAvgDelay.map(x => (airportBC.value(x._1),x._2)).sortBy(-_._2).take(10)

Array((Pago Pago,56.25), (Eagle,32.0), (North Bend/Coos Bay,24.533333333333335), (Laramie,18.892857142857142), (Redding,18.55294117647059), (Montrose/Delta,18.363636363636363), (Pueblo,17.54), (Newark,16.478549005929544), (Chico,15.931034482758621), (Rochester,15.6993006993007))