# Weather Dataset Preprocessing and feature Engineering

In [1]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Starting download from file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar
Finished download of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [2]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader}
import com.flightdelay.data.loaders.FlightDataLoader

//Env Configuration
val args: Array[String] = Array("jupyter")

val spark = SparkSession.builder()
  .config(sc.getConf)
  .getOrCreate()

// Rendre la session Spark implicite
implicit val session = spark
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)

val rawFlightsPath = s"${configuration.common.output.basePath}/common/data/processed_flights.parquet"
val flightsDF = spark.read.parquet(rawFlightsPath)

args = Array(jupyter)
spark = org.apache.spark.sql.SparkSession@2f55094a
session = org.apache.spark.sql.SparkSession@2f55094a
configuration = AppConfiguration(local,CommonConfig(42,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/201201hourly.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileCo...


AppConfiguration(local,CommonConfig(42,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/201201hourly.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileCo...

In [3]:
flightsDF.count()

6680864

In [13]:
flightsDF.printSchema

root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- ARR_DELAY_NEW: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- WEATHER_DELAY: double (nullable = true)
 |-- NAS_DELAY: double (nullable = true)
 |-- ORIGIN_WBAN: string (nullable = true)
 |-- ORIGIN_TIMEZONE: integer (nullable = true)
 |-- DEST_WBAN: string (nullable = true)
 |-- DEST_TIMEZONE: integer (nullable = true)
 |-- UTC_CRS_DEP_TIME: string (nullable = true)
 |-- UTC_FL_DATE: date (nullable = true)
 |-- UTC_ARR_TIME: string (nullable = true)
 |-- UTC_ARR_DATE: date (nullable = true)
 |-- CRS_ARR_TIME: string (nullable = true)
 |-- CRS_ARR_DATE: date (nullable = true)
 |-- feature_arrival_hour: integer (nullable = true)
 |-- feature_utc_arrival_hour: integer (

In [14]:
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoder, VectorAssembler}
import scala.util.Try

In [15]:
flightsDF.count()

6680864

# feature_flight_year, feature_flight_month

In [23]:
import org.apache.spark.sql.functions._

val delaysByYearMonth =
  flightsDF
    .groupBy("feature_flight_year", "feature_flight_month")
    .agg(
      count(lit(1)).as("total_flights"),
      sum(col("label_is_delayed_15min").cast("long")).as("delayed_flights")
    )
    .orderBy(col("feature_flight_year"), col("feature_flight_month"))

delaysByYearMonth.show(100, truncate = false)

+-------------------+--------------------+-------------+---------------+
|feature_flight_year|feature_flight_month|total_flights|delayed_flights|
+-------------------+--------------------+-------------+---------------+
|2012               |1                   |161441       |67314          |
|2012               |2                   |147540       |55616          |
|2012               |3                   |180437       |81774          |
|2012               |4                   |160444       |60248          |
|2012               |5                   |175965       |76465          |
|2012               |6                   |187409       |89829          |
|2012               |7                   |210827       |115081         |
|2012               |8                   |197975       |99103          |
|2012               |9                   |167554       |73446          |
|2012               |10                  |179675       |82520          |
|2012               |11                  |156825   

delaysByYearMonth = [feature_flight_year: int, feature_flight_month: int ... 2 more fields]


[feature_flight_year: int, feature_flight_month: int ... 2 more fields]

# feature_departure_hour_rounded

In [10]:
import org.apache.spark.sql.functions._

val delaysByYearMonth =
  flightsDF
    .groupBy("feature_departure_hour_rounded")
    .agg(
      count(lit(1)).as("total_flights"),
      sum(col("label_is_delayed_15min").cast("long")).as("delayed_flights")
    )
    .withColumn(
      "delay_rate_percent",
      round(col("delayed_flights") / col("total_flights") * 100, 2)
    )
    .orderBy(col("feature_departure_hour_rounded"))

delaysByYearMonth.show(100, truncate = false)

+------------------------------+-------------+---------------+------------------+
|feature_departure_hour_rounded|total_flights|delayed_flights|delay_rate_percent|
+------------------------------+-------------+---------------+------------------+
|0                             |6182         |3388           |54.8              |
|100                           |2329         |1000           |42.94             |
|200                           |598          |267            |44.65             |
|300                           |213          |89             |41.78             |
|400                           |90           |46             |51.11             |
|500                           |3744         |960            |25.64             |
|600                           |78261        |22376          |28.59             |
|700                           |102107       |32599          |31.93             |
|800                           |106678       |39691          |37.21             |
|900            

delaysByYearMonth = [feature_departure_hour_rounded: bigint, total_flights: bigint ... 2 more fields]


[feature_departure_hour_rounded: bigint, total_flights: bigint ... 2 more fields]

# feature_flight_day_of_week

In [11]:
import org.apache.spark.sql.functions._

val delaysByYearMonth =
  flightsDF
    .groupBy("feature_flight_day_of_week")
    .agg(
      count(lit(1)).as("total_flights"),
      sum(col("label_is_delayed_15min").cast("long")).as("delayed_flights")
    )
    .withColumn(
      "delay_rate_percent",
      round(col("delayed_flights") / col("total_flights") * 100, 2)
    )
    .orderBy(col("feature_flight_day_of_week"))

delaysByYearMonth.show(100, truncate = false)

+--------------------------+-------------+---------------+------------------+
|feature_flight_day_of_week|total_flights|delayed_flights|delay_rate_percent|
+--------------------------+-------------+---------------+------------------+
|1                         |273298       |135352         |49.53             |
|2                         |291734       |147433         |50.54             |
|3                         |266570       |125973         |47.26             |
|4                         |270726       |129345         |47.78             |
|5                         |297300       |156540         |52.65             |
|6                         |304749       |165419         |54.28             |
|7                         |223230       |103350         |46.3              |
+--------------------------+-------------+---------------+------------------+



delaysByYearMonth = [feature_flight_day_of_week: int, total_flights: bigint ... 2 more fields]


[feature_flight_day_of_week: int, total_flights: bigint ... 2 more fields]

# feature_flight_day_of_month

In [12]:
import org.apache.spark.sql.functions._

val delaysByYearMonth =
  flightsDF
    .groupBy("feature_flight_day_of_month")
    .agg(
      count(lit(1)).as("total_flights"),
      sum(col("label_is_delayed_15min").cast("long")).as("delayed_flights")
    )
    .withColumn(
      "delay_rate_percent",
      round(col("delayed_flights") / col("total_flights") * 100, 2)
    )
    .orderBy(col("feature_flight_day_of_month"))

delaysByYearMonth.show(100, truncate = false)

delaysByYearMonth = [feature_flight_day_of_month: int, total_flights: bigint ... 2 more fields]


+---------------------------+-------------+---------------+------------------+
|feature_flight_day_of_month|total_flights|delayed_flights|delay_rate_percent|
+---------------------------+-------------+---------------+------------------+
|1                          |63167        |32243          |51.04             |
|2                          |60678        |28491          |46.95             |
|3                          |62384        |31057          |49.78             |
|4                          |59765        |28159          |47.12             |
|5                          |59412        |27036          |45.51             |
|6                          |57738        |24615          |42.63             |
|7                          |58557        |26139          |44.64             |
|8                          |61322        |29588          |48.25             |
|9                          |64820        |32696          |50.44             |
|10                         |64912        |33597    

[feature_flight_day_of_month: int, total_flights: bigint ... 2 more fields]

# feature_flight_day_of_year

In [14]:
import org.apache.spark.sql.functions._

val delaysByYearMonth =
  flightsDF
    .groupBy("feature_flight_day_of_year")
    .agg(
      count(lit(1)).as("total_flights"),
      sum(col("label_is_delayed_15min").cast("long")).as("delayed_flights")
    )
    .withColumn(
      "delay_rate_percent",
      round(col("delayed_flights") / col("total_flights") * 100, 2)
    )
    .orderBy(col("delay_rate_percent").desc)

delaysByYearMonth.show(100, truncate = false)

+--------------------------+-------------+---------------+------------------+
|feature_flight_day_of_year|total_flights|delayed_flights|delay_rate_percent|
+--------------------------+-------------+---------------+------------------+
|361                       |9288         |7527           |81.04             |
|356                       |8696         |6727           |77.36             |
|362                       |8821         |6803           |77.12             |
|223                       |8290         |6065           |73.16             |
|355                       |7829         |5695           |72.74             |
|364                       |6495         |4718           |72.64             |
|21                        |6025         |4374           |72.6              |
|55                        |7764         |5563           |71.65             |
|201                       |7957         |5659           |71.12             |
|208                       |7883         |5538           |70.25 

delaysByYearMonth = [feature_flight_day_of_year: int, total_flights: bigint ... 2 more fields]


[feature_flight_day_of_year: int, total_flights: bigint ... 2 more fields]

# feature_flight_week_of_year

In [15]:
import org.apache.spark.sql.functions._

val delaysByYearMonth =
  flightsDF
    .groupBy("feature_flight_week_of_year")
    .agg(
      count(lit(1)).as("total_flights"),
      sum(col("label_is_delayed_15min").cast("long")).as("delayed_flights")
    )
    .withColumn(
      "delay_rate_percent",
      round(col("delayed_flights") / col("total_flights") * 100, 2)
    )
    .orderBy(col("delay_rate_percent").desc)

delaysByYearMonth.show(100, truncate = false)

+---------------------------+-------------+---------------+------------------+
|feature_flight_week_of_year|total_flights|delayed_flights|delay_rate_percent|
+---------------------------+-------------+---------------+------------------+
|52                         |52340        |35956          |68.7              |
|51                         |47054        |31185          |66.27             |
|28                         |46947        |29160          |62.11             |
|29                         |46961        |28921          |61.59             |
|30                         |45910        |27775          |60.5              |
|31                         |45438        |26959          |59.33             |
|32                         |44354        |25804          |58.18             |
|33                         |41919        |23832          |56.85             |
|26                         |43491        |24602          |56.57             |
|3                          |37162        |20633    

delaysByYearMonth = [feature_flight_week_of_year: int, total_flights: bigint ... 2 more fields]


[feature_flight_week_of_year: int, total_flights: bigint ... 2 more fields]

# AvgDelayFeature

In [10]:
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window

object AvgDelayFeature {

  /**
   * Version optimisée du calcul de l'avg_delay
   * Optimisations:
   * - Window functions au lieu de self-join
   * - Calcul des timestamps une seule fois
   * - Partitionnement intelligent
   * - Cache des données intermédiaires
   */
  def enrichFlightsWithAvgDelay(flightData: DataFrame)(implicit spark: SparkSession): DataFrame = {
    
    import spark.implicits._
    
    // Étape 1 : Préparation une seule fois avec cache
    val flightWithTimestamps = flightData
      .withColumn("utc_departure_timestamp", 
        to_timestamp(
          concat(
            col("UTC_FL_DATE"), 
            lit(" "), 
            lpad(col("feature_utc_departure_hour_rounded"), 2, "0"),
            lit(":00:00")
          )
        )
      )
      .withColumn("6_hour_before_departure", 
        col("utc_departure_timestamp") - expr("INTERVAL 6 HOURS")
      )
      .withColumn("2_hour_before_departure", 
        col("utc_departure_timestamp") - expr("INTERVAL 2 HOURS")
      )
      .repartition(col("ORIGIN_AIRPORT_ID")) // Partitionnement par aéroport
      .cache() // Cache pour éviter recalcul
    
    // Étape 2 : Utiliser Window Functions (plus efficace que self-join)
    // Créer une fenêtre glissante par aéroport
    val windowSpec = Window
      .partitionBy("ORIGIN_AIRPORT_ID")
      .orderBy(col("utc_departure_timestamp").cast("long"))
      .rangeBetween(-6 * 3600, -2 * 3600) // En secondes: -6h à -2h
    
    // Étape 3 : Calculer les statistiques avec window functions
    val enrichedFlights = flightWithTimestamps
      .withColumn("avg_delay", 
        avg(col("label_arr_delay_filled")).over(windowSpec)
      )
      .withColumn("num_previous_flights", 
        count(col("label_arr_delay_filled")).over(windowSpec)
      )
      .withColumn("proportion_delayed_15min", 
        avg(col("label_is_delayed_15min").cast("double")).over(windowSpec)
      )
      .withColumn("stddev_delay", 
        stddev(col("label_arr_delay_filled")).over(windowSpec)
      )
      .withColumn("max_delay", 
        max(col("label_arr_delay_filled")).over(windowSpec)
      )
      .withColumn("min_delay", 
        min(col("label_arr_delay_filled")).over(windowSpec)
      )
      // Remplacer les null par 0 pour les vols sans historique
      .na.fill(0.0, Seq("avg_delay", "proportion_delayed_15min", "stddev_delay", "max_delay", "min_delay"))
      .na.fill(0, Seq("num_previous_flights"))
      // Nettoyer les colonnes intermédiaires
      .drop("6_hour_before_departure", "2_hour_before_departure")
    
    // Unpersist pour libérer la mémoire
    flightWithTimestamps.unpersist()
    
    enrichedFlights
  }
  
  /**
   * Version alternative avec broadcast join si peu d'aéroports
   * Utilise un self-join optimisé avec broadcast
   */
  def enrichFlightsWithAvgDelayBroadcast(flightData: DataFrame)(implicit spark: SparkSession): DataFrame = {
    
    import spark.implicits._
    import org.apache.spark.sql.functions.broadcast
    
    // Préparation des timestamps
    val flightWithTimestamps = flightData
      .withColumn("utc_departure_timestamp", 
        to_timestamp(
          concat(
            col("UTC_FL_DATE"), 
            lit(" "), 
            lpad(col("feature_utc_departure_hour_rounded"), 2, "0"),
            lit(":00:00")
          )
        )
      )
      .withColumn("dep_ts_long", col("utc_departure_timestamp").cast("long"))
      .repartition(200, col("ORIGIN_AIRPORT_ID")) // Augmenter le parallélisme
      .cache()
    
    // Créer un DataFrame agrégé plus petit (candidat pour broadcast)
    val delayStats = flightWithTimestamps
      .groupBy("ORIGIN_AIRPORT_ID", "dep_ts_long")
      .agg(
        avg("label_arr_delay_filled").alias("instant_avg_delay"),
        count("*").alias("flights_at_this_time"),
        avg(col("label_is_delayed_15min").cast("double")).alias("instant_proportion_delayed")
      )
    
    // Self-join optimisé avec range condition
    val enrichedFlights = flightWithTimestamps.as("f1")
      .join(
        delayStats.as("f2"),
        col("f1.ORIGIN_AIRPORT_ID") === col("f2.ORIGIN_AIRPORT_ID") &&
        col("f2.dep_ts_long") >= col("f1.dep_ts_long") - 6 * 3600 &&
        col("f2.dep_ts_long") <= col("f1.dep_ts_long") - 2 * 3600,
        "left"
      )
      .groupBy("f1.*")
      .agg(
        avg("f2.instant_avg_delay").alias("avg_delay"),
        sum("f2.flights_at_this_time").alias("num_previous_flights"),
        avg("f2.instant_proportion_delayed").alias("proportion_delayed_15min")
      )
      .na.fill(0.0, Seq("avg_delay", "proportion_delayed_15min"))
      .na.fill(0, Seq("num_previous_flights"))
      .drop("dep_ts_long")
    
    flightWithTimestamps.unpersist()
    
    enrichedFlights
  }
  
}

defined object AvgDelayFeature


In [17]:
val startTime = System.nanoTime()
var avgDelayEnrichedFlightsDF = AvgDelayFeature.enrichFlightsWithAvgDelayBroadcast(flightsDF)
val endTime = System.nanoTime()
val durationMs = (endTime - startTime) / 1000000.0

println(f"Temps d'exécution: $durationMs%.2f ms")
println(f"Temps d'exécution: ${durationMs/1000}%.2f secondes")

avgDelayEnrichedFlightsDF.select(
  "FL_DATE", 
  "label_is_delayed_15min",
  "utc_departure_timestamp",
  "avg_delay",
  "num_previous_flights",
  "proportion_delayed_15min"/**,
  "stddev_delay",
  "max_delay",
  "min_delay"                  **/             
).show(100)

org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `f1`.`*` cannot be resolved. Did you mean one of the following? [`FL_DATE`, `OP_CARRIER_AIRLINE_ID`, `OP_CARRIER_FL_NUM`, `ORIGIN_AIRPORT_ID`, `DEST_AIRPORT_ID`, `CRS_DEP_TIME`, `ARR_DELAY_NEW`, `CRS_ELAPSED_TIME`, `WEATHER_DELAY`, `NAS_DELAY`, `ORIGIN_WBAN`, `ORIGIN_TIMEZONE`, `DEST_WBAN`, `DEST_TIMEZONE`, `UTC_CRS_DEP_TIME`, `UTC_FL_DATE`, `UTC_ARR_TIME`, `UTC_ARR_DATE`, `CRS_ARR_TIME`, `CRS_ARR_DATE`, `feature_arrival_hour`, `feature_utc_arrival_hour`, `feature_utc_arrival_hour_rounded`, `feature_utc_arrival_date`, `feature_crosses_midnight_local`, `feature_crosses_midnight_utc`, `feature_flight_days_span`, `feature_timezone_difference`, `feature_flies_eastward`, `feature_flies_westward`, `feature_departure_minute`, `feature_arrival_minutes_total`, `feature_crosses_midnight`, `feature_flight_day_of_week`, `feature_departure_hour_decimal`, `feature_flight_timestamp`, `feature_utc_departure_hour`, `feature_flight_quarter_name`, `feature_utc_departure_hour_decimal`, `feature_departure_hour`, `feature_minutes_since_midnight`, `feature_departure_quarter_day`, `feature_flight_month`, `feature_utc_departure_hour_rounded`, `feature_arrival_minute`, `feature_flight_year`, `feature_flight_day_of_year`, `feature_arrival_time`, `feature_flight_quarter`, `feature_arrival_hour_decimal`, `feature_flight_week_of_year`, `feature_flight_day_of_month`, `feature_departure_quarter_name`, `feature_departure_time_period`, `feature_departure_hour_rounded`, `feature_departure_minutes_total`, `feature_arrival_hour_rounded`, `feature_arrival_date`, `feature_distance_score`, `feature_distance_category`, `feature_carrier_hash`, `feature_route_id`, `feature_is_likely_domestic`, `feature_is_roundtrip_candidate`, `feature_flight_unique_id`, `feature_is_summer`, `feature_is_monday`, `feature_is_evening_rush`, `feature_is_month_start`, `feature_is_extended_weekend`, `feature_is_winter`, `feature_is_friday`, `feature_is_spring`, `feature_is_weekend`, `feature_is_holiday_season`, `feature_is_fall`, `feature_is_early_morning`, `feature_is_morning_rush`, `feature_is_business_hours`, `feature_is_night_flight`, `feature_is_month_end`, `feature_origin_complexity_score`, `feature_is_westbound`, `feature_dest_complexity_score`, `feature_dest_is_major_hub`, `feature_is_eastbound`, `feature_timezone_diff_proxy`, `feature_origin_is_major_hub`, `feature_is_hub_to_hub`, `feature_flights_on_route`, `feature_carrier_flight_count`, `feature_origin_airport_traffic`, `feature_route_popularity_score`, `feature_carrier_size_category`, `AIRCRAFT_ID`, `PREV_AIRCRAFT_ARR_DELAY`, `IS_PREV_AIRCRAFT_LATE`, `label_arr_delay_filled`, `label_nas_delay_filled`, `label_weather_delay_filled`, `label_weather_delay_was_missing`, `label_nas_delay_was_missing`, `label_arr_delay_was_missing`, `label_is_early`, `label_total_weather_nas_delay`, `label_is_delayed_15min`, `label_has_nas_delay`, `label_is_delayed_30min`, `label_is_delayed_60min`, `label_is_delayed_45min`, `label_is_delayed_90min`, `label_has_weather_delay`, `label_is_on_time`, `label_has_any_weather_nas_delay`, `utc_departure_timestamp`, `dep_ts_long`, `ORIGIN_AIRPORT_ID`, `dep_ts_long`, `instant_avg_delay`, `flights_at_this_time`, `instant_proportion_delayed`].