# Flights Dataset Preprocessing and Feature Engineering

In [2]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Starting download from file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar
Finished download of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [3]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader}
import com.flightdelay.data.loaders.FlightDataLoader

// Environment configuration: "jupyter" is the single argument handed to the configuration loader.
val args: Array[String] = Array("jupyter")

// Build (or reuse) a SparkSession from the configuration of `sc`.
// NOTE(review): `sc` is not defined in the visible cells — presumably the SparkContext
// supplied by the notebook kernel; confirm.
val spark: SparkSession = SparkSession.builder()
  .config(sc.getConf)
  .getOrCreate()

// Make the Spark session and the application configuration implicit.
// Implicit definitions carry an explicit type so implicit resolution does not depend on inference.
implicit val session: SparkSession = spark
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)

// NOTE(review): despite its name, `rawFlightsPath` points at the *processed* flights parquet
// under the configured output directory; the name is kept because later cells may use it.
val rawFlightsPath = s"${configuration.common.output.basePath}/common/data/processed_flights.parquet"
val flightsDF = spark.read.parquet(rawFlightsPath)

args = Array(jupyter)
spark = org.apache.spark.sql.SparkSession@16b09d4a
session = org.apache.spark.sql.SparkSession@16b09d4a
configuration = AppConfiguration(local,CommonConfig(42,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/201201hourly.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileCo...


AppConfiguration(local,CommonConfig(42,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/201201hourly.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileCo...

In [4]:
// Number of rows in the loaded flights DataFrame.
flightsDF.count()

403884

In [5]:
// Print the column names, types and nullability of flightsDF.
flightsDF.printSchema

root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- ARR_DELAY_NEW: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- WEATHER_DELAY: double (nullable = true)
 |-- NAS_DELAY: double (nullable = true)
 |-- ORIGIN_WBAN: string (nullable = true)
 |-- ORIGIN_TIMEZONE: integer (nullable = true)
 |-- DEST_WBAN: string (nullable = true)
 |-- DEST_TIMEZONE: integer (nullable = true)
 |-- UTC_CRS_DEP_TIME: string (nullable = true)
 |-- UTC_ARR_TIME: string (nullable = true)
 |-- UTC_ARR_DATE: date (nullable = true)
 |-- CRS_ARR_TIME: string (nullable = true)
 |-- CRS_ARR_DATE: date (nullable = true)
 |-- feature_arrival_hour: integer (nullable = true)
 |-- feature_utc_arrival_hour: integer (nullable = true)
 |-- feature_utc_arrival

In [6]:
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoder, VectorAssembler}
import scala.util.Try

In [7]:
// Re-run of the row count from the earlier cell; flightsDF is a val and has not been reassigned.
flightsDF.count()

403884

# Flight Delay Metrics

In [8]:
import org.apache.spark.sql.functions._

// Columns shown when inspecting a single flight: identifiers, the raw delay measurements and
// a few label_* / feature_* columns.
// Disabled candidates (names carried over from the original cell, not checked against the schema):
//   label_is_early, label_is_on_time,
//   label_arr_delay_filled, label_nas_delay_filled, label_weather_delay_filled,
//   label_weather_delay_was_missing, label_nas_delay_was_missing, label_arr_delay_was_missing,
//   label_has_nas_delay, label_has_weather_delay, label_has_any_weather_nas_delay,
//   label_is_delayed_30min, label_is_delayed_60min, label_is_delayed_45min, label_is_delayed_90min,
//   feature_avg_delay, feature_num_previous_flights, feature_stddev_delay,
//   feature_max_delay, feature_min_delay,
//   feature_proportion_delayed_30min, feature_proportion_delayed_60min,
//   feature_proportion_delayed_45min, feature_proportion_delayed_90min
val delayColumns = Seq(
  "FL_DATE", "OP_CARRIER_FL_NUM", "ORIGIN_AIRPORT_ID",
  "WEATHER_DELAY", "NAS_DELAY", "ARR_DELAY_NEW",
  "label_total_weather_nas_delay",
  "label_is_delayed_15min",
  "feature_proportion_delayed_15min"
)

// Flight 3508 on 2013-03-11, kept only when both the weather and NAS delays are
// present and strictly positive.
val df = {
  // Predicate: the named column is non-null and greater than zero.
  def recordedAndPositive(name: String) = col(name).isNotNull && col(name) > 0

  flightsDF
    .select(delayColumns.map(col): _*)
    .where(col("FL_DATE") === "2013-03-11")
    .where(col("OP_CARRIER_FL_NUM") === 3508)
    .where(recordedAndPositive("WEATHER_DELAY"))
    .where(recordedAndPositive("NAS_DELAY"))
}

df.show(5)


+----------+-----------------+-----------------+-------------+---------+-------------+-----------------------------+----------------------+--------------------------------+
|   FL_DATE|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_ID|WEATHER_DELAY|NAS_DELAY|ARR_DELAY_NEW|label_total_weather_nas_delay|label_is_delayed_15min|feature_proportion_delayed_15min|
+----------+-----------------+-----------------+-------------+---------+-------------+-----------------------------+----------------------+--------------------------------+
|2013-03-11|             3508|            10397|          3.0|     11.0|         18.0|                         14.0|                     1|             0.03508771929824561|
+----------+-----------------+-----------------+-------------+---------+-------------+-----------------------------+----------------------+--------------------------------+



delayColumns = List(FL_DATE, OP_CARRIER_FL_NUM, ORIGIN_AIRPORT_ID, WEATHER_DELAY, NAS_DELAY, ARR_DELAY_NEW, label_total_weather_nas_delay, label_is_delayed_15min, feature_proportion_delayed_15min)
df = [FL_DATE: date, OP_CARRIER_FL_NUM: int ... 7 more fields]


[FL_DATE: date, OP_CARRIER_FL_NUM: int ... 7 more fields]

# BTS Data

In [9]:
// Date pattern used when reading the BTS extract. Pattern letters are case-sensitive:
// `MM` is month-of-year while `mm` is minute-of-hour, so the former "yyyymmdd" did not
// describe a calendar date. "yyyy-MM-dd" is the value shown in this cell's recorded output.
// NOTE(review): confirm the pattern against the FL_DATE values in the CSV before enabling
// the `to_date` conversion below.
val DEFAULT_DATE_FORMAT = "yyyy-MM-dd"
val filePath = s"${configuration.common.data.basePath}/BTS-201303/T_ONTIME_REPORTING_201303.csv"

// Load the BTS on-time reporting CSV with a header row. No schema is supplied and
// schema inference is not enabled, so the columns are read as strings.
val btsDF = spark.read.format("csv")
  .option("header", "true")
  //.schema(expectedSchema)
  // NOTE(review): a date pattern is passed as `timestampFormat`; presumably it only matters
  // once a schema with timestamp columns is applied — confirm whether `dateFormat` was intended.
  .option("timestampFormat", DEFAULT_DATE_FORMAT)
  .option("multiline", "true")
  .option("escape", "\"")
  .load(filePath)
  //.withColumn("FL_DATE", to_date(col("FL_DATE"), DEFAULT_DATE_FORMAT))

DEFAULT_DATE_FORMAT = yyyy-MM-dd
filePath = /home/jovyan/work/data/BTS-201303/T_ONTIME_REPORTING_201303.csv
btsDF = [ORIGIN_AIRPORT_ID: string, ORIGIN_AIRPORT_SEQ_ID: string ... 36 more fields]


[ORIGIN_AIRPORT_ID: string, ORIGIN_AIRPORT_SEQ_ID: string ... 36 more fields]

In [10]:
// Print the schema of the BTS DataFrame (the reader was given no schema, so columns are strings).
btsDF.printSchema

root
 |-- ORIGIN_AIRPORT_ID: string (nullable = true)
 |-- ORIGIN_AIRPORT_SEQ_ID: string (nullable = true)
 |-- ORIGIN_CITY_MARKET_ID: string (nullable = true)
 |-- DEST_AIRPORT_ID: string (nullable = true)
 |-- DEST_AIRPORT_SEQ_ID: string (nullable = true)
 |-- DEST_CITY_MARKET_ID: string (nullable = true)
 |-- CRS_DEP_TIME: string (nullable = true)
 |-- DEP_TIME: string (nullable = true)
 |-- DEP_DELAY: string (nullable = true)
 |-- DEP_DELAY_NEW: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_DELAY_GROUP: string (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- TAXI_OUT: string (nullable = true)
 |-- WHEELS_OFF: string (nullable = true)
 |-- WHEELS_ON: string (nullable = true)
 |-- TAXI_IN: string (nullable = true)
 |-- CRS_ARR_TIME: string (nullable = true)
 |-- ARR_TIME: string (nullable = true)
 |-- ARR_DELAY: string (nullable = true)
 |-- ARR_DELAY_NEW: string (nullable = true)
 |-- ARR_DEL15: string (nullable = true)
 |-- ARR_DELAY_G