# Flight Preprocessing - Data Generation

In [1]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Starting download from file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar
Finished download of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [2]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader}
import com.flightdelay.data.loaders.FlightDataLoader

// Environment configuration: these arguments are passed to ConfigurationLoader below.
// NOTE(review): "local" presumably selects the local environment profile — confirm.
val args: Array[String] = Array("local")

// Build (or reuse) a SparkSession configured from the kernel-provided SparkContext `sc`.
val spark = SparkSession.builder()
  .config(sc.getConf)
  .getOrCreate()

// Make the Spark session and the application configuration available implicitly.
// Implicit definitions carry an explicit type so implicit resolution does not depend
// on type inference.
implicit val session: SparkSession = spark
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)

// Test: load the raw flights dataset from Parquet (path is relative to the kernel's
// working directory).
val rawParquetPath = "../../output/data/raw_flights.parquet"
val flightData = spark.read.parquet(rawParquetPath)

args = Array(local)
spark = org.apache.spark.sql.SparkSession@6b440030
session = org.apache.spark.sql.SparkSession@6b440030
configuration = AppConfiguration(local,DataConfig(/data,FileConfig(/data/FLIGHT-3Y/Flights/201201.csv),FileConfig(/data/FLIGHT-3Y/Weather/201201hourly.txt),FileConfig(/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/output,FileConfig(/output/data),FileConfig(/output/model)))
rawParquetPath = ../../output/data/raw_flights.parquet


flightData: org.apache.spark.sql.Da...


../../output/data/raw_flights.parquet

In [3]:
import com.flightdelay.data.utils.DataQualityMetrics

// Compute data-quality metrics for the raw flight data and print them.
// The resulting table has one row per input column with the fields
// name, origType, colType, compRatio and nbDistinctValues (see the output below).
val flightDataMetrics = DataQualityMetrics.metrics(flightData)
flightDataMetrics.show()

+--------------------+-----------+-------+----------+----------------+
|                name|   origType|colType| compRatio|nbDistinctValues|
+--------------------+-----------+-------+----------+----------------+
|             FL_DATE|   DateType|   date|       1.0|              31|
|OP_CARRIER_AIRLIN...|IntegerType|numeric|       1.0|              15|
|   OP_CARRIER_FL_NUM|IntegerType|numeric|       1.0|            6237|
|   ORIGIN_AIRPORT_ID|IntegerType|numeric|       1.0|             287|
|     DEST_AIRPORT_ID|IntegerType|numeric|       1.0|             287|
|        CRS_DEP_TIME|IntegerType|numeric|       1.0|            1153|
|       ARR_DELAY_NEW| DoubleType|numeric| 0.9833214|             549|
|           CANCELLED| DoubleType|numeric|       1.0|               2|
|            DIVERTED| DoubleType|numeric|       1.0|               2|
|    CRS_ELAPSED_TIME| DoubleType|numeric|       1.0|             419|
|       WEATHER_DELAY| DoubleType|numeric|0.14586131|             288|
|     

flightDataMetrics = [name: string, origType: string ... 3 more fields]


[name: string, origType: string ... 3 more fields]

In [4]:
import com.flightdelay.data.preprocessing.FlightDataCleaner

// Run the flight cleaning pipeline on the raw data. Per the log it prints, the phases are:
// duplicate removal, filtering (cancelled/diverted flights, invalid departure times and
// airports), type conversion, and an outlier-related phase (log truncated in this export).
val flightCleanedData = FlightDataCleaner.preprocess(flightData)




----------------------------------------------------------------------------------------------------------
--> [FlightDataCleaner] Flight Data Cleaner - Start ...
----------------------------------------------------------------------------------------------------------
Data Original Count: 486133

Phase 1: Basic Cleaning - Remove Duplicates
Current Count : 486133

Phase 2: Filter Flights
- Filter Cancelled and Diverted Flights
Suppression des valeurs spécifiques: Map(CANCELLED -> List(1.0), DIVERTED -> List(1.0))
Nombre de lignes avant: 486133
Nombre de lignes après suppression des valeurs spécifiques: 478025
- Filter Invalid departure time
- Filter Invalid airports
Current Count : 478025

Phase 3: Types Conversion
Conversion des types de données: NAS_DELAY, OP_CARRIER_AIRLINE_ID, OP_CARRIER_FL_NUM, WEATHER_DELAY, DEST_AIRPORT_ID, ORIGIN_AIRPORT_ID, CRS_ELAPSED_TIME, FL_DATE, CRS_DEP_TIME, ARR_DELAY_NEW
- Filter Invalid flight date formats
Current Count : 478025

Phase 4: Filter Outl

flightCleanedData = [FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 8 more fields]


[FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 8 more fields]

In [5]:
import com.flightdelay.data.utils.DataQualityMetrics

// Recompute the data-quality metrics on the cleaned data so they can be compared with the
// metrics of the raw data (same table layout: name, origType, colType, compRatio,
// nbDistinctValues).
val flightCleanedDataMetrics = DataQualityMetrics.metrics(flightCleanedData)
flightCleanedDataMetrics.show()

+--------------------+-----------+-------+----------+----------------+
|                name|   origType|colType| compRatio|nbDistinctValues|
+--------------------+-----------+-------+----------+----------------+
|             FL_DATE| StringType|textual|       1.0|              31|
|OP_CARRIER_AIRLIN...|IntegerType|numeric|       1.0|              15|
|   OP_CARRIER_FL_NUM| StringType|textual|       1.0|            6237|
|   ORIGIN_AIRPORT_ID|IntegerType|numeric|       1.0|             287|
|     DEST_AIRPORT_ID|IntegerType|numeric|       1.0|             287|
|        CRS_DEP_TIME|IntegerType|numeric|       1.0|            1153|
|       ARR_DELAY_NEW| DoubleType|numeric|       1.0|             486|
|    CRS_ELAPSED_TIME| DoubleType|numeric|       1.0|             418|
|       WEATHER_DELAY| DoubleType|numeric|0.14821951|             278|
|           NAS_DELAY| DoubleType|numeric|0.14821951|             282|
+--------------------+-----------+-------+----------+----------------+



flightCleanedDataMetrics = [name: string, origType: string ... 3 more fields]


[name: string, origType: string ... 3 more fields]

In [6]:
import com.flightdelay.data.preprocessing.FlightDataGenerator

// Phase 1 of feature generation: add the temporal feature_* columns (flight date parts and
// departure-time parts, as listed in the log below) to the cleaned flight data.
val withTemporalFeatures = FlightDataGenerator.addTemporalFeatures(flightCleanedData)




Phase 1: Add Temporal Features
- Add feature_flight_timestamp
- Add feature_flight_year
- Add feature_flight_month
- Add feature_flight_quarter
- Add feature_flight_day_of_month
- Add feature_flight_day_of_week
- Add feature_flight_day_of_year
- Add feature_flight_week_of_year
- Add feature_departure_hour
- Add feature_departure_minute
- Add feature_departure_hour_decimal
- Add feature_departure_quarter_day
- Add feature_departure_quarter_name
- Add feature_departure_time_period
- Add feature_minutes_since_midnight


withTemporalFeatures = [FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 24 more fields]


Temporal features added: 16
root
 |-- FL_DATE: string (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER_FL_NUM: string (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- ARR_DELAY_NEW: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- WEATHER_DELAY: double (nullable = true)
 |-- NAS_DELAY: double (nullable = true)
 |-- feature_departure_minute: integer (nullable = true)
 |-- feature_flight_day_of_week: integer (nullable = true)
 |-- feature_departure_hour_decimal: double (nullable = true)
 |-- feature_flight_timestamp: timestamp (nullable = true)
 |-- feature_flight_quarter_name: string (nullable = false)
 |-- feature_departure_hour: integer (nullable = true)
 |-- feature_minutes_since_midnight: double (nullable = true)
 |-- feature_departure_quarter_day: integer (nullable = false)
 |-- feature_flight_month:

[FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 24 more fields]

In [7]:
// Temporal features handled as numerical columns.
// NOTE(review): feature_flight_timestamp is reported as TimestampType by the metrics
// output further down — confirm it really belongs in the numerical group.
val temporalNumericalColumns = Seq(
  "feature_flight_timestamp",
  "feature_flight_year",
  "feature_flight_month",
  "feature_flight_quarter",
  "feature_flight_day_of_month",
  "feature_flight_day_of_week",
  "feature_flight_day_of_year",
  "feature_flight_week_of_year",
  "feature_departure_hour",
  "feature_departure_minute",
  "feature_departure_hour_decimal",
  "feature_departure_quarter_day",
  "feature_minutes_since_midnight"
)

// Temporal features handled as categorical (string-valued) columns.
val temporalCategoricalColumns = Seq(
  "feature_departure_quarter_name",
  "feature_departure_time_period"
)

// All temporal feature columns, numerical first, without duplicates.
val temporalColumns = (temporalNumericalColumns ++ temporalCategoricalColumns).distinct

// Preview of the categorical temporal features only.
// select(String, String*) takes the first column name separately, hence head/tail.
val df = withTemporalFeatures
  .select(temporalCategoricalColumns.head, temporalCategoricalColumns.tail: _*)
df.show(5)

// Preview of the numerical temporal features only.
val df2 = withTemporalFeatures
  .select(temporalNumericalColumns.head, temporalNumericalColumns.tail: _*)
df2.show(5)

// Preview of every temporal feature. This previously displayed df2 a second time,
// so df3 was built but never shown.
val df3 = withTemporalFeatures
  .select(temporalColumns.head, temporalColumns.tail: _*)
df3.show(5)


+------------------------------+-----------------------------+
|feature_departure_quarter_name|feature_departure_time_period|
+------------------------------+-----------------------------+
|                       Morning|                 Late_Morning|
|                     Afternoon|               Late_Afternoon|
|                       Evening|                      Evening|
|                       Evening|                        Night|
|                     Afternoon|               Late_Afternoon|
+------------------------------+-----------------------------+
only showing top 5 rows

+------------------------+-------------------+--------------------+----------------------+---------------------------+--------------------------+--------------------------+---------------------------+----------------------+------------------------+------------------------------+-----------------------------+------------------------------+
|feature_flight_timestamp|feature_flight_year|feature_flight_month|

temporalNumericalColumns = List(feature_flight_timestamp, feature_flight_year, feature_flight_month, feature_flight_quarter, feature_flight_day_of_month, feature_flight_day_of_week, feature_flight_day_of_year, feature_flight_week_of_year, feature_departure_hour, feature_departure_minute, feature_departure_hour_decimal, feature_departure_quarter_day, feature_minutes_since_midnight)
temporalCategoricalColumns = List(feature_departure_quarter_name, feature_departure_time_period)
temporalColumns = List(feature_flight_timestamp, feature_flight_year, feature_flight_month, feature_flight_quarter, feature_flight_day_of_month, feature_flight_day_of_week, feature_flight_day_of_year, feature_flight_week_of_year, feature_departure_hour, feature_departure_minut...


List(feature_flight_timestamp, feature_flight_year, feature_flight_month, feature_flight_quarter, feature_flight_day_of_month, feature_flight_day_of_week, feature_flight_day_of_year, feature_flight_week_of_year, feature_departure_hour, feature_departure_minut...

In [8]:
import com.flightdelay.data.utils.DataQualityMetrics

// Data-quality metrics restricted to the temporal feature columns.
// `df3` is the projection of all temporal columns built in the previous cell.
val temporalfeatureMetric = DataQualityMetrics.metrics(df3)
temporalfeatureMetric.show()

temporalfeatureMetric = [name: string, origType: string ... 3 more fields]


+--------------------+-------------+-------+---------+----------------+
|                name|     origType|colType|compRatio|nbDistinctValues|
+--------------------+-------------+-------+---------+----------------+
|feature_flight_ti...|TimestampType|   date|      1.0|              31|
| feature_flight_year|  IntegerType|numeric|      1.0|               1|
|feature_flight_month|  IntegerType|numeric|      1.0|               1|
|feature_flight_qu...|  IntegerType|numeric|      1.0|               1|
|feature_flight_da...|  IntegerType|numeric|      1.0|              31|
|feature_flight_da...|  IntegerType|numeric|      1.0|               7|
|feature_flight_da...|  IntegerType|numeric|      1.0|              31|
|feature_flight_we...|  IntegerType|numeric|      1.0|               6|
|feature_departure...|  IntegerType|numeric|      1.0|              24|
|feature_departure...|  IntegerType|numeric|      1.0|              60|
|feature_departure...|   DoubleType|numeric|      1.0|          

[name: string, origType: string ... 3 more fields]

In [8]:
import com.flightdelay.data.preprocessing.FlightDataGenerator

// Phase 2 of feature generation: add the flight-characteristic feature_* columns
// (unique id, distance category/score, route id, ... as listed in the log below)
// on top of the temporal features.
val withFlightFeatures = FlightDataGenerator.addFlightCharacteristics(withTemporalFeatures)


Phase 2: Add Flight Characteristics
- Add feature_flight_unique_id 
- Add feature_distance_category (short, medium, long, very_long) 
- Add feature_distance_score 
- Add feature_is_likely_domestic 
- Add feature_carrier_hash 
- Add feature_route_id 
- Add feature_is_roundtrip_candidate 
Added Flight features: 7


withFlightFeatures = [FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 31 more fields]


[FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 31 more fields]

In [9]:
// Names of the columns added by the flight-characteristics phase, in display order.
val filgthCarateristiquesColumns = Seq(
  "feature_flight_unique_id", "feature_distance_category", "feature_distance_score",
  "feature_is_likely_domestic", "feature_carrier_hash", "feature_route_id",
  "feature_is_roundtrip_candidate"
)

// Keep only those columns. select(String, String*) takes the first name separately,
// so the list is split into its head and its tail.
val df = {
  val firstColumn = filgthCarateristiquesColumns.head
  val remainingColumns = filgthCarateristiquesColumns.tail
  withFlightFeatures.select(firstColumn, remainingColumns: _*)
}
df.show(5)

+------------------------+-------------------------+----------------------+--------------------------+--------------------+----------------+------------------------------+
|feature_flight_unique_id|feature_distance_category|feature_distance_score|feature_is_likely_domestic|feature_carrier_hash|feature_route_id|feature_is_roundtrip_candidate|
+------------------------+-------------------------+----------------------+--------------------------+--------------------+----------------+------------------------------+
|    2012-01-01_20366_...|                   medium|   0.26166666666666666|                         1|         -2080468873|     10397_11618|                             1|
|    2012-01-01_20366_...|                   medium|   0.23666666666666666|                         1|         -2080468873|     11618_13930|                             1|
|    2012-01-01_20366_...|                    short|                 0.135|                         1|         -2080468873|     10397_11617|

filgthCarateristiquesColumns = List(feature_flight_unique_id, feature_distance_category, feature_distance_score, feature_is_likely_domestic, feature_carrier_hash, feature_route_id, feature_is_roundtrip_candidate)
df = [feature_flight_unique_id: string, feature_distance_category: string ... 5 more fields]


[feature_flight_unique_id: string, feature_distance_category: string ... 5 more fields]

In [10]:
import com.flightdelay.data.utils.DataQualityMetrics

// Data-quality metrics for the flight-characteristic features.
// NOTE(review): `df` is redefined in several cells; this relies on it still being the
// flight-characteristics projection from the previous cell (the 7 rows of the output
// below match those columns) — re-run that cell first if cells were executed out of order.
val fightFeaturesMetric = DataQualityMetrics.metrics(df)
fightFeaturesMetric.show()

fightFeaturesMetric = [name: string, origType: string ... 3 more fields]


+--------------------+-----------+-------+---------+----------------+
|                name|   origType|colType|compRatio|nbDistinctValues|
+--------------------+-----------+-------+---------+----------------+
|feature_flight_un...| StringType|textual|      1.0|          477960|
|feature_distance_...| StringType|textual|      1.0|               4|
|feature_distance_...| DoubleType|numeric|      1.0|             417|
|feature_is_likely...|IntegerType|numeric|      1.0|               2|
|feature_carrier_hash|IntegerType|numeric|      1.0|              15|
|    feature_route_id| StringType|textual|      1.0|            2033|
|feature_is_roundt...|IntegerType|numeric|      1.0|               2|
+--------------------+-----------+-------+---------+----------------+



[name: string, origType: string ... 3 more fields]

In [11]:
// Flight-characteristic features reported with a numeric type by the metrics table above.
// NOTE(review): feature_carrier_hash is an integer hash — confirm that treating it as a
// numeric feature is intended.
val filgthCarateristiquesNumericColumns = Seq(
  "feature_distance_score", "feature_is_likely_domestic",
  "feature_carrier_hash", "feature_is_roundtrip_candidate"
)

// Flight-characteristic features reported with a textual type by the metrics table above.
// NOTE(review): feature_flight_unique_id has 477960 distinct values in that table, so it
// looks like an identifier rather than a category — confirm before encoding it.
val filgthCarateristiquesCategoricalColumns = Seq(
  "feature_flight_unique_id", "feature_distance_category", "feature_route_id"
)

filgthCarateristiquesNumericColumns = List(feature_distance_score, feature_is_likely_domestic, feature_carrier_hash, feature_is_roundtrip_candidate)
filgthCarateristiquesCategoricalColumns = List(feature_flight_unique_id, feature_distance_category, feature_route_id)


List(feature_flight_unique_id, feature_distance_category, feature_route_id)

In [12]:
import com.flightdelay.data.preprocessing.FlightDataGenerator

// Phase 3 of feature generation: add the feature_is_* period indicator columns
// (weekend/weekday, season, holiday season, time-of-day and month start/end flags,
// as listed in the log below).
val withPeriodIndicators = FlightDataGenerator.addPeriodIndicators(withFlightFeatures)


Phase 3: Add Period <indicator
- Add feature_is_weekend, feature_is_friday, feature_is_monday
- Add feature_is_summer, feature_is_winter, feature_is_spring, feature_is_fall 
- Add feature_is_holiday_season (approximative)
- Add feature_is_early_morning 
- Add feature_is_morning_rush 
- Add feature_is_business_hours 
- Add feature_is_evening_rush 
- Add feature_is_night_flight 
- Add feature_is_month_start 
- Add feature_is_month_end 
- Add feature_is_extended_weekend 


withPeriodIndicators = [FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 47 more fields]


Added Flight features: 16


[FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 47 more fields]

In [13]:
// Names of the period indicator columns, grouped by theme and kept in display order.
val filgthPeriodIndicatorsColumns = Seq(
  // day of week
  "feature_is_weekend", "feature_is_friday", "feature_is_monday",
  // season
  "feature_is_summer", "feature_is_winter", "feature_is_spring", "feature_is_fall",
  "feature_is_holiday_season",
  // time of day
  "feature_is_early_morning", "feature_is_morning_rush", "feature_is_business_hours",
  "feature_is_evening_rush", "feature_is_night_flight",
  // position in the month / extended weekend
  "feature_is_month_start", "feature_is_month_end", "feature_is_extended_weekend"
)

// Keep only the indicator columns. select(String, String*) takes the first name
// separately, so the list is split into its head and its tail.
val df = {
  val firstColumn = filgthPeriodIndicatorsColumns.head
  val remainingColumns = filgthPeriodIndicatorsColumns.tail
  withPeriodIndicators.select(firstColumn, remainingColumns: _*)
}
df.show(5)

+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+---------------+-------------------------+------------------------+-----------------------+-------------------------+-----------------------+-----------------------+----------------------+--------------------+---------------------------+
|feature_is_weekend|feature_is_friday|feature_is_monday|feature_is_summer|feature_is_winter|feature_is_spring|feature_is_fall|feature_is_holiday_season|feature_is_early_morning|feature_is_morning_rush|feature_is_business_hours|feature_is_evening_rush|feature_is_night_flight|feature_is_month_start|feature_is_month_end|feature_is_extended_weekend|
+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+---------------+-------------------------+------------------------+-----------------------+-------------------------+-----------------------+-----------------------+----------------------+------

filgthPeriodIndicatorsColumns = List(feature_is_weekend, feature_is_friday, feature_is_monday, feature_is_summer, feature_is_winter, feature_is_spring, feature_is_fall, feature_is_holiday_season, feature_is_early_morning, feature_is_morning_rush, feature_is_business_hours, feature_is_evening_rush, feature_is_night_flight, feature_is_month_start, feature_is_month_end, feature_is_extended_weekend)
df = [feature_is_weekend: int, feature_is_friday: int ... 14 more fields]


[feature_is_weekend: int, feature_is_friday: int ... 14 more fields]

In [14]:
import com.flightdelay.data.preprocessing.FlightDataGenerator

// Phase 4 of feature generation: add the geographic feature_* columns (major-hub flags,
// complexity scores, timezone-difference proxy, east/westbound flags, as listed in the
// log below).
val withGeographicFeatures = FlightDataGenerator.addGeographicFeatures(withPeriodIndicators)


Phase 4: Add Geographical Features
- Add feature_origin_is_major_hub (10397, 11298, 12266, 13930, 14107, 14771, 15016  // Principaux hubs US)
- Add feature_dest_is_major_hub  (10397, 11298, 12266, 13930, 14107, 14771, 15016  // Principaux hubs US)
- Add feature_is_hub_to_hub
- Add feature_flight_quarter
- Add feature_origin_complexity_score
- Add feature_dest_complexity_score
- Add feature_timezone_diff_proxy
- Add feature_flight_week_of_year
- Add feature_is_eastbound
- Add feature_is_westbound
Added Flight features: 8


withGeographicFeatures = [FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 55 more fields]


[FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 55 more fields]

In [15]:
// Names of the columns listed by the geographic phase log, in display order.
// NOTE(review): feature_flight_quarter and feature_flight_week_of_year are also part of
// the temporal feature list used earlier in this notebook — confirm that the geographic
// phase is meant to (re)produce them.
val filgthGeographicFeaturesColumns = Seq(
  "feature_origin_is_major_hub", "feature_dest_is_major_hub", "feature_is_hub_to_hub",
  "feature_flight_quarter",
  "feature_origin_complexity_score", "feature_dest_complexity_score",
  "feature_timezone_diff_proxy",
  "feature_flight_week_of_year",
  "feature_is_eastbound", "feature_is_westbound"
)

// Keep only those columns. select(String, String*) takes the first name separately,
// so the list is split into its head and its tail.
val df = {
  val firstColumn = filgthGeographicFeaturesColumns.head
  val remainingColumns = filgthGeographicFeaturesColumns.tail
  withGeographicFeatures.select(firstColumn, remainingColumns: _*)
}
df.show(5)

+---------------------------+-------------------------+---------------------+----------------------+-------------------------------+-----------------------------+---------------------------+---------------------------+--------------------+--------------------+
|feature_origin_is_major_hub|feature_dest_is_major_hub|feature_is_hub_to_hub|feature_flight_quarter|feature_origin_complexity_score|feature_dest_complexity_score|feature_timezone_diff_proxy|feature_flight_week_of_year|feature_is_eastbound|feature_is_westbound|
+---------------------------+-------------------------+---------------------+----------------------+-------------------------------+-----------------------------+---------------------------+---------------------------+--------------------+--------------------+
|                          0|                        1|                    0|                     1|                           0.18|                         0.97|                          1|                         52

filgthGeographicFeaturesColumns = List(feature_origin_is_major_hub, feature_dest_is_major_hub, feature_is_hub_to_hub, feature_flight_quarter, feature_origin_complexity_score, feature_dest_complexity_score, feature_timezone_diff_proxy, feature_flight_week_of_year, feature_is_eastbound, feature_is_westbound)
df = [feature_origin_is_major_hub: int, feature_dest_is_major_hub: int ... 8 more fields]


[feature_origin_is_major_hub: int, feature_dest_is_major_hub: int ... 8 more fields]

In [16]:
import com.flightdelay.data.preprocessing.FlightDataGenerator

// Phase 5 of feature generation: add the aggregated feature_* columns (route, carrier and
// origin-airport counts plus popularity/size categories, as listed in the log below).
val withAggregatedFeatures = FlightDataGenerator.addAggregatedFeatures(withGeographicFeatures)


Phase 5 : Add Aggregated Features
- Add feature_flights_on_route
- Add feature_carrier_flight_count
- Add feature_origin_airport_traffic
- Add feature_route_popularity_score
- Add feature_carrier_size_category


withAggregatedFeatures = [FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 60 more fields]


Added Flight features: 5


[FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 60 more fields]

In [17]:
// Names of the columns added by the aggregated-features phase, in display order.
val filgthAggregatedFeaturesColumns = Seq(
  "feature_flights_on_route", "feature_carrier_flight_count",
  "feature_origin_airport_traffic", "feature_route_popularity_score",
  "feature_carrier_size_category"
)

// Keep only those columns. select(String, String*) takes the first name separately,
// so the list is split into its head and its tail.
val df = {
  val firstColumn = filgthAggregatedFeaturesColumns.head
  val remainingColumns = filgthAggregatedFeaturesColumns.tail
  withAggregatedFeatures.select(firstColumn, remainingColumns: _*)
}
df.show(5)

+------------------------+----------------------------+------------------------------+------------------------------+-----------------------------+
|feature_flights_on_route|feature_carrier_flight_count|feature_origin_airport_traffic|feature_route_popularity_score|feature_carrier_size_category|
+------------------------+----------------------------+------------------------------+------------------------------+-----------------------------+
|                      59|                       33946|                           245|                        medium|                        major|
|                      59|                       33946|                           245|                        medium|                        major|
|                      59|                       33946|                           245|                        medium|                        major|
|                      59|                       33946|                           245|                        me

filgthAggregatedFeaturesColumns = List(feature_flights_on_route, feature_carrier_flight_count, feature_origin_airport_traffic, feature_route_popularity_score, feature_carrier_size_category)
df = [feature_flights_on_route: bigint, feature_carrier_flight_count: bigint ... 3 more fields]


[feature_flights_on_route: bigint, feature_carrier_flight_count: bigint ... 3 more fields]