# Task 1

In [0]:
# Notebook-level configuration for the Task 1 runs.
dbfs_fileStore_prefix = "/FileStore/tables"  # directory part of the dataset paths built in the call cells
prefix = "ontimeperformance"  # filename prefix of the flights / aircrafts CSV files
size = "small"  # not referenced in the visible cells — presumably a dataset-size selector; TODO confirm
year = 2000  # not referenced in the visible cells — TODO confirm where this is used

In [0]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
import pyspark.sql.functions as f
from pyspark.sql.functions import col, avg, min, max, abs, concat_ws, rank
from pyspark.sql.window import Window
import pandas as pd
import numpy as np
from pyspark.statcounter import StatCounter
from sparkmeasure import StageMetrics


In [0]:
def task_1(spark_session, flights_path, aircrafts_path):
  """Display the three Cessna models with the most departing flights.

  Reads the flights and aircrafts CSV files, keeps the aircraft whose
  manufacturer is exactly "CESSNA", inner-joins them to the flights on
  ``tailnum``, reduces every model to the first three digits of its first
  run of digits, and counts the joined flights per model. The three largest
  groups are shown with ``display`` as two columns: ``models``
  ("Cessna XYZ") and ``numberOfDepartingFlights``. sparkMeasure stage
  metrics covering the whole computation are printed afterwards.

  Args:
    spark_session: SparkSession used both for reading the data and for
      collecting the stage metrics.
    flights_path: Path of the flights CSV file (first line is a header).
    aircrafts_path: Path of the aircrafts CSV file (first line is a header);
      it must provide ``tailnum``, ``manufacturer`` and ``model`` columns.

  Returns:
    None. The result is rendered with ``display`` and the metrics report is
    printed.
  """
  # Use the session that was passed in rather than a notebook-global one, so
  # the function behaves the same no matter which session the caller owns.
  stagemetrics = StageMetrics(spark_session)
  stagemetrics.begin()
  try:
    # Explicit all-string schema for the flights file. It overrides the names
    # in the CSV header (presumably because the raw header contains stray
    # whitespace — TODO confirm against the data file).
    flights_schema = StructType([
        StructField(column_name, StringType(), True)
        for column_name in (
            'carrier_code',
            'flight_number',
            'flight_date',
            'origin',
            'destination',
            'tailnum',
            'scheduled_depature_time',
            'scheduled_arrival_time',
            'actual_departure_time',
            'actual_arrival_time',
            'distance',
        )
    ])

    # No .cache(): each DataFrame feeds exactly one action, so caching would
    # only materialise (and keep pinned) data that is never read twice.
    # Only the join key is needed from the flights side.
    flights = (spark_session.read.format("csv")
               .option("header", "true")
               .schema(flights_schema)
               .load(flights_path)
               .select('tailnum'))
    aircrafts = spark_session.read.csv(aircrafts_path, header="true")

    cessna_aircrafts = aircrafts.filter(f.col('manufacturer') == "CESSNA")

    # First run of digits in the model name, truncated to three characters
    # (Spark string positions are 1-based).
    model_digits = f.regexp_extract(f.col('model'), "\\d+", 0).substr(1, 3)

    top_models = (
        cessna_aircrafts.join(flights, on=['tailnum'], how='inner')
        # "CESSNA" -> "Cessna" for the output label.
        .withColumn('manufacturer', f.initcap(f.lower(f.col('manufacturer'))))
        .withColumn('model', model_digits)
        .groupBy('manufacturer', 'model')
        .count()
        # Secondary sort key makes the top three deterministic when counts tie.
        .sort(f.col('count').desc(), f.col('model').asc())
        .limit(3)
    )

    result = top_models.select(
        f.format_string('%s %s', f.col('manufacturer'), f.col('model')).alias('models'),
        f.col('count').alias('numberOfDepartingFlights'),
    )
    display(result)
  finally:
    # Always close the measurement window, even when the Spark job fails.
    stagemetrics.end()
  stagemetrics.print_report()
  

In [0]:
# Run the DataFrame implementation on the small flights file.
# `spark` is not defined in the visible cells — presumably the session supplied by the notebook runtime.
task_1(spark, f"{dbfs_fileStore_prefix}/{prefix}_flights_small.csv", f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv")

models,numberOfDepartingFlights
Cessna 172,57
Cessna 210,48
Cessna 421,47


In [0]:
# Run the DataFrame implementation on the massive flights file (same aircrafts file as the small run).
task_1(spark, f"{dbfs_fileStore_prefix}/{prefix}_flights_massive.csv", f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv")

models,numberOfDepartingFlights
Cessna 210,37771
Cessna 172,32853
Cessna 421,32817


In [0]:
def task_1_RDD1(spark_session, flights_path, aircrafts_path):
  """Print the three Cessna models with the most departing flights (RDD API).

  RDD counterpart of ``task_1``: the CSV files are loaded through the
  DataFrame reader, converted to RDDs, and the filtering, join, counting and
  ranking are done with RDD operations. The result is printed as a list of
  up to three ``("Cessna XYZ", count)`` tuples, followed by the sparkMeasure
  stage metrics report.

  The model label is derived from the ``model`` column by name, as the first
  three digits of its first run of digits. The previous implementation
  sliced the string form of the row tuple at fixed character offsets, which
  silently broke whenever a preceding field changed width or was null and
  did not follow the same rule as ``task_1``.

  Args:
    spark_session: SparkSession used both for reading the data and for
      collecting the stage metrics.
    flights_path: Path of the flights CSV file (first line is a header).
    aircrafts_path: Path of the aircrafts CSV file (first line is a header);
      it must provide ``tailnum``, ``manufacturer`` and ``model`` columns.

  Returns:
    None. The top three are printed.
  """
  import re  # local import keeps this cell runnable on its own

  stagemetrics = StageMetrics(spark_session)
  stagemetrics.begin()
  try:
    # Explicit all-string schema for the flights file. It overrides the names
    # in the CSV header (presumably because the raw header contains stray
    # whitespace — TODO confirm against the data file).
    flights_schema = StructType([
        StructField(column_name, StringType(), True)
        for column_name in (
            'carrier_code',
            'flight_number',
            'flight_date',
            'origin',
            'destination',
            'tailnum',
            'scheduled_depature_time',
            'scheduled_arrival_time',
            'actual_departure_time',
            'actual_arrival_time',
            'distance',
        )
    ])

    flights_df = (spark_session.read.format("csv")
                  .option("header", "true")
                  .schema(flights_schema)
                  .load(flights_path))
    aircrafts_df = spark_session.read.csv(aircrafts_path, header="true")

    # Project to the columns that are actually used before dropping to the
    # RDD API, so only those fields are shipped to the Python workers.
    flights_rdd = flights_df.select('tailnum').rdd
    aircrafts_rdd = aircrafts_df.select('tailnum', 'manufacturer', 'model').rdd

    def model_label(model):
      """Return "Cessna XYZ", XYZ being the first three digits found in model."""
      # [0-9] rather than \d so only ASCII digits match.
      digits = re.search(r"[0-9]+", model or "")
      return 'Cessna ' + (digits.group(0)[:3] if digits else '')

    # (tailnum, "Cessna XYZ") for every aircraft manufactured by CESSNA.
    cessna_by_tailnum = (
        aircrafts_rdd
        .filter(lambda row: row["manufacturer"] == 'CESSNA')
        .map(lambda row: (row["tailnum"], model_label(row["model"])))
    )

    # (tailnum, 1) for every flight that has a tail number.
    flights_by_tailnum = (
        flights_rdd
        .filter(lambda row: row["tailnum"] is not None)
        .map(lambda row: (row["tailnum"], 1))
    )

    # Join on tailnum, then count the joined flights per model label.
    flights_per_model = (
        cessna_by_tailnum.join(flights_by_tailnum)
        .map(lambda pair: (pair[1][0], 1))
        .reduceByKey(lambda left, right: left + right)
    )

    # takeOrdered avoids a full distributed sort just to keep three rows;
    # the label is a secondary key so ties are resolved deterministically.
    top_three = flights_per_model.takeOrdered(3, key=lambda pair: (-pair[1], pair[0]))
    print(top_three)
  finally:
    # Always close the measurement window, even when the Spark job fails.
    stagemetrics.end()
  stagemetrics.print_report()

In [0]:
# Run the RDD implementation on the small flights file.
task_1_RDD1(spark, f"{dbfs_fileStore_prefix}/{prefix}_flights_small.csv", f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv")

In [0]:
# Run the RDD implementation on the massive flights file (same aircrafts file as the small run).
task_1_RDD1(spark, f"{dbfs_fileStore_prefix}/{prefix}_flights_massive.csv", f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv")