# Task 2

In [0]:
# Notebook configuration: these values are combined into the input CSV paths
# passed to the task functions further down.
# Directory part of every input path.
dbfs_fileStore_prefix = "/FileStore/tables"
# Common file-name prefix of the flights and airlines CSV files.
prefix = "ontimeperformance"
# Dataset-size suffix used in the flights file name ("<prefix>_flights_<size>.csv").
size = "small"
# Year passed to the task functions; they compare it against the first four
# characters of each flight's `flight_date`.
year = 2000

In [0]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
import pyspark.sql.functions as f
from pyspark.sql.functions import col, avg, min, max, abs, concat_ws, rank
from pyspark.sql.window import Window
import pandas as pd
import numpy as np
from pyspark.statcounter import StatCounter
from sparkmeasure import StageMetrics

In [0]:
def task_2(spark_session, flights_path, airlines_path, year):
  """Show departure-delay statistics per US airline for one year (DataFrame API).

  A flight counts as delayed when its actual departure is later than its
  scheduled departure. For every airline whose country is 'United States' the
  result holds the number of delayed flights and the average, minimum and
  maximum delay in minutes.

  Parameters
  ----------
  spark_session : SparkSession
      Session used to read the CSV inputs and to collect sparkmeasure stage
      metrics.
  flights_path : str
      Path of the flights CSV file (with header row).
  airlines_path : str
      Path of the airlines CSV file (with header row).
  year : int or str
      Year to keep; compared as text with the first four characters of
      `flight_date`.

  Returns
  -------
  None
      The result table is rendered with `display` and the stage-metrics report
      is printed.
  """
  flight_columns = ('carrier_code', 'flight_number', 'flight_date', 'origin',
                    'destination', 'tailnum', 'scheduled_departure_time',
                    'scheduled_arrival_time', 'actual_departure_time',
                    'actual_arrival_time', 'distance')
  minutes_per_day = 24 * 60
  # A departure that looks more than 12 hours early is assumed to have
  # actually left on the following day.
  overnight_threshold = -12 * 60

  def minutes_of_day(column_name):
    # Hours are characters 1-2 and minutes characters 4-5 of the time text
    # (Column.substr positions are 1-based). Unparsable text casts to null.
    time_text = f.col(column_name)
    return time_text.substr(1, 2).cast("int") * 60 + time_text.substr(4, 2).cast("int")

  # Measure with the session that was passed in, not the notebook-global `spark`.
  stagemetrics = StageMetrics(spark_session)
  stagemetrics.begin()
  try:
    # Read every flights column as a string; only four of them are needed.
    flights_schema = StructType([StructField(column, StringType(), True) for column in flight_columns])
    flights = (spark_session.read.format("csv")
               .option("header", "true")
               .schema(flights_schema)
               .load(flights_path)
               .select('carrier_code', 'flight_date', 'scheduled_departure_time', 'actual_departure_time')
               .na.drop())

    # US airlines only. The country filter runs before `country` is projected
    # away so it does not depend on the analyzer re-adding the column.
    us_airlines = (spark_session.read.csv(airlines_path, header="true")
                   .select('carrier_code', 'name', 'country')
                   .na.drop()
                   .filter("country='United States'")
                   .select('carrier_code', 'name')
                   .dropDuplicates())

    # Delay in minutes, shifted by one day for departures past midnight.
    raw_delay = minutes_of_day('actual_departure_time') - minutes_of_day('scheduled_departure_time')
    delay = f.when(raw_delay < overnight_threshold, raw_delay + minutes_per_day).otherwise(raw_delay)

    # Keep the requested year and only flights that left late. Rows whose
    # times could not be parsed have a null delay and are dropped here too.
    delayed_flights = (flights
                       .filter(f.col('flight_date').substr(1, 4) == str(year))
                       .select('carrier_code', delay.alias('delay'))
                       .filter(f.col('delay') > 0))

    # No intermediate result is reused, so nothing is cached.
    delay_stats = (us_airlines
                   .join(delayed_flights, on=['carrier_code'], how='inner')
                   .groupBy('name')
                   .agg(f.count(f.col('name')).alias('num_delays'),
                        f.avg(f.col('delay')).alias('average_delay'),
                        f.min(f.col('delay')).alias('min_delay'),
                        f.max(f.col('delay')).alias('max_delay'))
                   .withColumnRenamed('name', 'airline_name'))

    display(delay_stats)
  finally:
    # Always close the measurement window, even when the job fails.
    stagemetrics.end()
  stagemetrics.print_report()
  # To persist the result as tab-separated text instead of displaying it:
  #ave_max_min.write.format("com.databricks.spark.csv").option("delimiter","\t").csv("/FileStore/task2dataframe_result/resulttask2_file.csv")

In [0]:
# Run the DataFrame implementation on the flights file selected by `size`
# and on the airlines file, for the configured `year`.
task_2(spark, f"{dbfs_fileStore_prefix}/{prefix}_flights_{size}.csv", f"{dbfs_fileStore_prefix}/{prefix}_airlines.csv", year)

airline_name,num_delays,average_delay,min_delay,max_delay
United Airlines,385,35.212987012987014,1,906
Southwest Airlines Co.,279,38.71326164874552,1,189
Continental Air Lines Inc.,184,18.57608695652174,1,236
Northwest Airlines Inc.,242,21.446280991735534,1,237
US Airways,361,20.63711911357341,1,476
Alaska Airlines Inc.,58,37.87931034482759,1,120
Delta Air Lines Inc.,487,24.87679671457905,1,1375
American Airlines Inc.,310,32.07096774193548,1,1146


In [0]:
def _minutes_after_midnight(time_text):
  """Convert 'HH:MM[:SS]' (or colon-less 'HHMM') text to minutes after midnight."""
  if ':' in time_text:
    hours, minutes = time_text.split(':')[:2]
  else:
    hours, minutes = time_text[0:2], time_text[2:4]
  return int(hours) * 60 + int(minutes)


def _departure_delay(scheduled_text, actual_text):
  """Return the departure delay in minutes, or None if a time cannot be parsed.

  A departure that looks more than 12 hours early is assumed to have left on
  the following day, so 24 hours are added to it.
  """
  try:
    delay = _minutes_after_midnight(actual_text) - _minutes_after_midnight(scheduled_text)
  except ValueError:
    # Skip malformed times instead of failing the whole job; the DataFrame
    # version drops such rows as nulls.
    return None
  if delay < -12 * 60:
    delay += 24 * 60
  return delay


def task_2_RDD(spark_session, flights_path, airlines_path, year):
  """Print departure-delay statistics per US airline for one year (RDD API).

  Computes the same figures as `task_2`: for each airline whose country is
  'United States', the number of delayed flights and the average, minimum and
  maximum delay in minutes, where a delayed flight is one whose actual
  departure is later than its scheduled departure.

  Parameters
  ----------
  spark_session : SparkSession
      Session used to read the CSV inputs and to collect sparkmeasure stage
      metrics.
  flights_path : str
      Path of the flights CSV file (with header row).
  airlines_path : str
      Path of the airlines CSV file (with header row).
  year : int or str
      Year to keep; compared as text with the first four characters of
      `flight_date`.

  Returns
  -------
  None
      Prints a list of
      (airline_name, num_delays, average_delay, min_delay, max_delay) tuples
      followed by the stage-metrics report.
  """
  flight_columns = ('carrier_code', 'flight_number', 'flight_date', 'origin',
                    'destination', 'tailnum', 'scheduled_departure_time',
                    'scheduled_arrival_time', 'actual_departure_time',
                    'actual_arrival_time', 'distance')
  year_text = str(year)

  # Measure with the session that was passed in, not the notebook-global `spark`.
  stagemetrics = StageMetrics(spark_session)
  stagemetrics.begin()
  try:
    # Read every flights column as a string; only four of them are needed.
    flights_schema = StructType([StructField(column, StringType(), True) for column in flight_columns])
    flights_rdd = (spark_session.read.format("csv")
                   .option("header", "true")
                   .schema(flights_schema)
                   .load(flights_path)
                   .select('carrier_code', 'flight_date', 'scheduled_departure_time', 'actual_departure_time')
                   .dropna()
                   .rdd)
    airlines_rdd = (spark_session.read.csv(airlines_path, header="true")
                    .select('carrier_code', 'name', 'country')
                    .dropna()
                    .rdd)

    # (carrier_code, name) for US airlines; de-duplicated so repeated airline
    # rows cannot multiply the joined flights.
    us_airlines = (airlines_rdd
                   .filter(lambda row: row[2] == 'United States')
                   .map(lambda row: (row[0], row[1]))
                   .distinct())

    # (carrier_code, (scheduled_departure_time, actual_departure_time)) for the year.
    year_flights = (flights_rdd
                    .filter(lambda row: row[1][0:4] == year_text)
                    .map(lambda row: (row[0], (row[2], row[3]))))

    # Joined records look like (carrier_code, (name, (scheduled, actual))).
    delays = us_airlines.join(year_flights).map(
      lambda record: (record[1][0], _departure_delay(record[1][1][0], record[1][1][1])))

    # Keep only flights that actually left late.
    delayed = delays.filter(lambda pair: pair[1] is not None and pair[1] > 0)

    # One StatCounter pass per airline yields count, mean, min and max together.
    stats = delayed.aggregateByKey(StatCounter(), StatCounter.merge, StatCounter.mergeStats)
    summary = stats.mapValues(
      lambda s: (s.count(), float(s.mean()), int(s.min()), int(s.max())))

    print([(name,) + figures for name, figures in summary.collect()])
  finally:
    # Always close the measurement window, even when the job fails.
    stagemetrics.end()
  stagemetrics.print_report()

In [0]:
# Run the RDD implementation for the configured `year`.
# NOTE(review): the flights file name is hard-coded to the "massive" dataset
# here instead of using `size` as the task_2 call does - confirm this is intentional.
task_2_RDD(spark, f"{dbfs_fileStore_prefix}/{prefix}_flights_massive.csv", f"{dbfs_fileStore_prefix}/{prefix}_airlines.csv", year)