# Data Jointure V2

## Configuration

In [1]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Starting download from file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar
Finished download of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [2]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader, ExperimentConfig}
import com.flightdelay.data.loaders.FlightDataLoader

// Environment configuration: load the application configuration for the
// "jupyter" environment and use its first experiment for the rest of the notebook.
val args: Array[String] = Array("jupyter")
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)
// Fail with an explicit message instead of an index error when no experiment is configured.
implicit val experiment: ExperimentConfig = configuration.experiments.headOption.getOrElse(
  throw new IllegalStateException("No experiment defined in the loaded configuration")
)

// Build (or reuse) the session from the kernel's existing SparkContext configuration.
// NOTE(review): `sc` already exists when this runs, so the event-log settings below
// may not take effect on the running context — confirm against the Spark UI/history.
val spark = SparkSession.builder()
  .config(sc.getConf)
  .config("spark.eventLog.enabled", "true")
  .config("spark.eventLog.dir", "file:///home/jovyan/work/spark-events")
  .getOrCreate()

// Make the Spark session available implicitly (explicit type keeps implicit
// resolution predictable).
implicit val session: SparkSession = spark



args = Array(jupyter)
configuration = AppConfiguration(local,CommonConfig(42,true,debug,false,false,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/20101*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileConfig(/home/jovyan/work/output/model)),MLFlowConfig(false,http://localhost:5555)),Stream(ExperimentConfig(Experience-local,Baseline Random ...


AppConfiguration(local,CommonConfig(42,true,debug,false,false,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/20101*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileConfig(/home/jovyan/work/output/model)),MLFlowConfig(false,http://localhost:5555)),Stream(ExperimentConfig(Experience-local,Baseline Random ...

## Chargement des données

In [3]:
// Load the pre-processed flights dataset from the configured output directory.
val flightDFPath = s"${configuration.common.output.basePath}/common/data/processed_flights.parquet"
val flightDF = spark.read.parquet(flightDFPath)

// Use interpolation: println with two arguments is auto-tupled and prints "(label,count)".
println(s"Flight DF Count: ${flightDF.count()}")

(Flight DF Count: ,3908461)


flightDFPath = /home/jovyan/work/output/common/data/processed_flights.parquet
flightDF = [OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 28 more fields]


[OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 28 more fields]

In [4]:
// Load the pre-processed weather dataset from the configured output directory.
val weatherDFPath = s"${configuration.common.output.basePath}/common/data/processed_weather.parquet"
val weatherDF = spark.read.parquet(weatherDFPath)

// Use interpolation: println with two arguments is auto-tupled and prints "(label,count)".
println(s"Weather DF Count: ${weatherDF.count()}")

weatherDFPath = /home/jovyan/work/output/common/data/processed_weather.parquet
weatherDF = [WindDirection: double, PressureChange: double ... 63 more fields]


(Weather DF Count: ,1549320)


[WindDirection: double, PressureChange: double ... 63 more fields]

## Jointure

In [5]:
// Inputs of the join. Both are currently plain aliases of the loaded DataFrames
// (no coverage filtering or column pruning is applied in this cell).
val flightDF_mCovered = flightDF
val weatherDF_pruned = weatherDF

// Use interpolation: println with two arguments is auto-tupled and prints "(label,count)".
println(s"Flight for join count -> ${flightDF_mCovered.count()}")
println(s"Weather for join count -> ${weatherDF_pruned.count()}")

(Flight for join count ->,3908461)


flightDF_mCovered = [OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 28 more fields]
weatherDF_pruned = [WindDirection: double, PressureChange: double ... 63 more fields]


(Weather for join count ->,1549320)


[WindDirection: double, PressureChange: double ... 63 more fields]

In [7]:
// =====================================================================
// Flight × Weather en DataFrame (Map → Hash partition → Reduce)
// - Fenêtre 12h avant départ (Wo_*) et 12h avant arrivée (Wd_*)
// - Gestion veille via relHour (duplication J et J+1)
// - Un seul job global avec Metrics.withJob
// =====================================================================
import org.apache.spark.sql.{DataFrame, Row, Column}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// ------------------------
// 0) Objet Metrics (inchangé)
// ------------------------
/**
 * Timing helpers that tag Spark work with a job group and print `[METRIC]` lines.
 *
 * Relies on the notebook-level `spark` session being in scope.
 */
object Metrics {
  import org.apache.spark.storage.StorageLevel

  /**
   * Runs `body` under the Spark job group `id`/`desc` and prints its wall-clock duration.
   *
   * The job group is cleared even when `body` throws, so a failed cell does not
   * leave its group attached to jobs started afterwards. The duration line is
   * printed only when `body` completes normally.
   *
   * @param id   job group id, also used as the metric tag
   * @param desc human-readable description of the work
   * @param body code to execute and time (evaluated exactly once)
   * @return the value produced by `body`
   */
  def withJob[T](id: String, desc: String)(body: => T): T = {
    val sc = spark.sparkContext
    sc.setJobGroup(id, desc, interruptOnCancel = false)
    val t0 = System.nanoTime()
    try {
      val res = body
      val dtMs = (System.nanoTime() - t0) / 1e6
      println(f"[METRIC][$id] $desc took ${dtMs}%.2f ms")
      res
    } finally {
      sc.clearJobGroup()
    }
  }

  /**
   * Persists `rdd` at `lvl`, counts it under [[withJob]] and prints the record
   * and partition counts.
   *
   * @return the number of records in `rdd`
   */
  def persistCount(id: String, desc: String, rdd: org.apache.spark.rdd.RDD[_], lvl: StorageLevel = StorageLevel.MEMORY_ONLY): Long = {
    rdd.persist(lvl)
    val n = withJob(id, desc) { rdd.count() }
    println(s"[METRIC][$id] records=$n partitions=${rdd.getNumPartitions}")
    n
  }

  /**
   * Persists `df` at `lvl`, counts it under [[withJob]] and prints the record
   * and partition counts.
   *
   * @return the number of rows in `df`
   */
  def persistCountDF(id: String, desc: String, df: DataFrame, lvl: StorageLevel = StorageLevel.MEMORY_ONLY): Long = {
    df.persist(lvl)
    val n = withJob(id, desc) { df.count() }
    println(s"[METRIC][$id] records=$n partitions=${df.rdd.getNumPartitions}")
    n
  }
}

// =====================================================================
// 1) Jointure complète exécutée sous un SEUL job
// =====================================================================


// ------------------------
// Paramètres de partitions "reducers"
// ------------------------

// Flight x Weather join, timed as a single Metrics.withJob call.
// For every flight it attaches the 12 hourly weather observations preceding the
// departure at the origin station (Wo_h1..Wo_h12) and the 12 preceding the
// arrival at the destination station (Wd_h1..Wd_h12), then prints the row counts.
Metrics.withJob("flight-joins-df2", "Build flights joins DataFrame2") {
    // =====================================================================
    // Flight x Weather join, written to stay as close as possible to
    // Belcastro et al.
    // - DataFrame implementation, Map/Reduce-like
    // - get_hourly_observations is emulated by pre-computing hourly buckets
    // =====================================================================

    import org.apache.spark.sql.{DataFrame, Column}
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.types._
    import org.apache.spark.sql.expressions.Window

    // ---------------------------------------------------------------------
    // 0) Shared helpers
    // ---------------------------------------------------------------------

    // HHMM (or "HH:MM") -> hour of day: strips ':', left-pads to 4 characters,
    // reads the first two characters as an int and wraps with % 24.
    def hhmmHourCol(c: Column): Column = {
      val s  = regexp_replace(c.cast("string"), ":", "")
      val p4 = lpad(s, 4, "0")
      (substring(p4, 1, 2).cast("int") % 24)
    }

    // (date, HHMM) -> timestamp parsed with the pattern "yyyy-MM-dd HH:mm"
    def dateTimeFromHHMM(dateCol: Column, hhmmCol: Column): Column = {
      val s  = regexp_replace(hhmmCol.cast("string"), ":", "")
      val p4 = lpad(s, 4, "0")
      val hh = substring(p4, 1, 2)
      val mm = substring(p4, 3, 2)

      val tsStr = concat(
        date_format(dateCol, "yyyy-MM-dd"), lit(" "),
        hh, lit(":"), mm
      )
      to_timestamp(tsStr, "yyyy-MM-dd HH:mm")
    }

    // Scheduled flight timestamp built from (DATE, HHMM); thin alias of dateTimeFromHHMM.
    def flightTsCol(dateCol: Column, hhmmCol: Column): Column =
      dateTimeFromHHMM(dateCol, hhmmCol)

    // ---------------------------------------------------------------------
    // 1) Base DataFrames (Map)
    // ---------------------------------------------------------------------

    // Join keys and time columns are exposed under fixed names; col("*") keeps
    // every original flight column alongside them.
    val flightsBase = flightDF_mCovered.select(
      col("feature_flight_unique_id").as("flightId"),
      col("ORIGIN_WBAN"),
      col("DEST_WBAN"),
      col("UTC_FL_DATE").cast(DateType).as("DEP_DATE"),
      col("UTC_ARR_DATE").cast(DateType).as("ARR_DATE"),
      col("UTC_CRS_DEP_TIME").as("DEP_TIME_HHMM"),
      col("UTC_ARR_TIME").as("ARR_TIME_HHMM"),
      col("*")
    )

    // Weather columns carried into the join; rows without a station id or date are dropped.
    val weatherBase = weatherDF_pruned
      .select(
        trim(col("WBAN")).as("WBAN"),
        col("Date").cast(DateType).as("WDATE"),
        col("Time").as("WTIME_HHMM"),
        col("WindSpeed").as("ws"),
        col("WindDirection").as("wd"),
        col("DryBulbCelsius").as("tempC"),
        col("SeaLevelPressure").as("slp"),
        col("HourlyPrecip").as("precip"),
        col("feature_weather_severity_index"),
        col("feature_flight_category_ordinal")
      )
      .where(col("WBAN").isNotNull && length(col("WBAN")) > 0 && col("WDATE").isNotNull)

    // ---------------------------------------------------------------------
    // 2) "AO" step: for each (station, date, hour h) keep the observation
    //    closest to "date h:00" (get_hourly_observations)
    // ---------------------------------------------------------------------

    // 2.1. Actual timestamp of each weather measurement; unparsable times are dropped.
    val weatherWithTs = weatherBase
      .withColumn("w_ts", dateTimeFromHHMM(col("WDATE"), col("WTIME_HHMM")))
      .na.drop(Seq("w_ts"))

    // 2.2. For each (WBAN, WDATE), generate the 24 target instants h:00
    val buckets = weatherWithTs
      .select("WBAN", "WDATE")
      .distinct()
      .withColumn("hourBucket", explode(sequence(lit(0), lit(23)))) // 0..23
      .withColumn(
        "target_ts",
        to_timestamp(
          concat(
            date_format(col("WDATE"), "yyyy-MM-dd"), lit(" "),
            lpad(col("hourBucket").cast("string"), 2, "0"), lit(":00")
          ),
          "yyyy-MM-dd HH:mm"
        )
      )

    // 2.3. For each (WBAN, WDATE, hourBucket), pick the closest observation:
    //      dist is the absolute gap in seconds between observation and target.
    val bucketsJoined = buckets
      .join(
        weatherWithTs,
        Seq("WBAN", "WDATE"),
        "inner"
      )
      .withColumn(
        "dist",
        abs(col("w_ts").cast("long") - col("target_ts").cast("long"))
      )

    val w = Window
      .partitionBy(col("WBAN"), col("WDATE"), col("hourBucket"))
      .orderBy(col("dist").asc_nulls_last)

    val weatherHourlyBest = bucketsJoined
      .withColumn("rn", row_number().over(w))
      .where(col("rn") === 1)
      .drop("dist", "rn")

    // At this point weatherHourlyBest holds one row per (WBAN, WDATE, hourBucket):
    //   (WBAN, WDATE, hourBucket, target_ts, w_ts, ws, wd, tempC, ...)
    // i.e. the hourly grid that get_hourly_observations is meant to produce.

    // ---------------------------------------------------------------------
    // 3) Day D / D+1 handling through relHour
    // ---------------------------------------------------------------------

    val weatherBucketsStruct = struct(
      col("hourBucket").as("hour"),
      col("WBAN"),
      col("WDATE"),
      col("target_ts"),
      col("w_ts"),
      col("ws"), col("wd"),
      col("tempC"), col("slp"), col("precip"),
      col("feature_weather_severity_index"),
      col("feature_flight_category_ordinal")
    )

    // Same day: relHour = 0..23, DATE = WDATE
    val meteoSameDay = weatherHourlyBest
      .withColumn("relHour", col("hourBucket"))
      .withColumn("DATE", col("WDATE"))
      .select(
        col("WBAN"), col("DATE"),
        col("relHour"),
        weatherBucketsStruct.as("wobs")
      )

    // Re-keyed on the following day: relHour = hourBucket - 24 => [-24 .. -1],
    // DATE = WDATE + 1, so a flight on DATE can look back into the previous day.
    val meteoNextDay = weatherHourlyBest
      .withColumn("relHour", col("hourBucket") - lit(24))
      .withColumn("DATE", date_add(col("WDATE"), 1))
      .select(
        col("WBAN"), col("DATE"),
        col("relHour"),
        weatherBucketsStruct.as("wobs")
      )

    val weatherRel = meteoSameDay.unionByName(meteoNextDay)
      .filter(col("relHour").between(-24, 23))

    // 3.1. Reduce weather by key (WBAN, DATE) -> map relHour -> observation struct
    val weatherByKey = weatherRel
      .groupBy(col("WBAN"), col("DATE"))
      .agg(
        map_from_entries(
          collect_list(struct(col("relHour"), col("wobs")))
        ).as("wmap")
      )
      // hash partitioning on the join key (Map/Partition)
      .repartition(col("WBAN"), col("DATE"))
      .persist()

    // ---------------------------------------------------------------------
    // 4) JOIN #1 — ORIGIN: ALG.1 applied to the origin airport
    // ---------------------------------------------------------------------

    // NOTE(review): this is a session-wide setting and stays in effect after the
    // cell finishes; later cells will also shuffle with 64 partitions.
    val numParts = 64
    spark.conf.set("spark.sql.shuffle.partitions", numParts)

    // 4.1. Departure timestamp and hour (floored to the hour).
    // NOTE(review): a null depHour is replaced by 0, i.e. treated as midnight —
    // confirm this is intended for flights with a missing departure time.
    val flightsDep = flightsBase
      .withColumn("DEP_TS", flightTsCol(col("DEP_DATE"), col("DEP_TIME_HHMM")))
      .withColumn("depHour", hhmmHourCol(col("DEP_TIME_HHMM")))
      .na.fill(Map("depHour" -> 0))

    // 4.2. Left join on (origin station, departure date); flights without weather are kept.
    val originPre = flightsDep
      .repartition(numParts, col("ORIGIN_WBAN"), col("DEP_DATE"))
      .join(
        weatherByKey.hint("shuffle_hash"),
        col("ORIGIN_WBAN") === weatherByKey("WBAN") &&
        col("DEP_DATE")    === weatherByKey("DATE"),
        "left"
      )
      .drop(weatherByKey("WBAN"))
      .drop(weatherByKey("DATE"))

    // 4.3. get_hourly_observations(AO, f_tsd) via wmap[depHour - i], i = 1..12
    val originWithWoArr = originPre
      .withColumn(
        "Wo",
        expr("transform(sequence(1, 12), i -> element_at(wmap, depHour - i))")
      )
      .drop("wmap")

    // Flatten the 12-element array into Wo_h1..Wo_h12 (Wo_h1 = one hour before departure).
    val woCols = (1 to 12).map(i => col("Wo").getItem(i - 1).as(s"Wo_h$i"))

    val originDF = originWithWoArr
      .select(col("*") +: woCols: _*)
      .drop("Wo")
      .persist()

    // ---------------------------------------------------------------------
    // 5) JOIN #2 — DESTINATION: same algorithm applied to the destination airport
    // ---------------------------------------------------------------------

    val flightsArr = originDF
      .withColumn("ARR_TS", flightTsCol(col("ARR_DATE"), col("ARR_TIME_HHMM")))
      .withColumn("arrHour", hhmmHourCol(col("ARR_TIME_HHMM")))
      .na.fill(Map("arrHour" -> 0))

    val destPre = flightsArr
      .repartition(numParts, col("DEST_WBAN"), col("ARR_DATE"))
      .join(
        weatherByKey.hint("shuffle_hash"),
        col("DEST_WBAN") === weatherByKey("WBAN") &&
        col("ARR_DATE")  === weatherByKey("DATE"),
        "left"
      )
      .drop(weatherByKey("WBAN"))
      .drop(weatherByKey("DATE"))

    val destWithWdArr = destPre
      .withColumn(
        "Wd",
        expr("transform(sequence(1, 12), i -> element_at(wmap, arrHour - i))")
      )
      .drop("wmap")

    // Flatten the 12-element array into Wd_h1..Wd_h12 (Wd_h1 = one hour before arrival).
    val wdCols = (1 to 12).map(i => col("Wd").getItem(i - 1).as(s"Wd_h$i"))

    val joinedDF = destWithWdArr
      .select(col("*") +: wdCols: _*)
      .drop("Wd")
      .persist()

    // ---------------------------------------------------------------------
    // 6) Final action (triggers execution)
    // ---------------------------------------------------------------------

    try {
      println(
        s"Rows after ORIGIN join: ${originDF.count()}, rows after DEST join: ${joinedDF.count()}"
      )
    } finally {
      // None of the cached DataFrames is reachable once this block returns, so
      // release them here instead of leaving them cached for the rest of the session.
      Seq(joinedDF, originDF, weatherByKey).foreach(_.unpersist())
    }

}

defined object Metrics


Rows after ORIGIN join: 3908461, rows after DEST join: 3908461
[METRIC][flight-joins-df2] Build flights joins DataFrame2 took 11004.70 ms


In [6]:
import com.flightdelay.features.joiners.FlightWeatherDataJoiner


// =====================================================================
// Flight × Weather join delegated to FlightWeatherDataJoiner
// - Weather window depth before departure / arrival comes from the experiment
//   configuration (weatherOriginDepthHours / weatherDestinationDepthHours)
// - Previous-day handling via relHour (J / J+1 duplication) is presumably done
//   inside the joiner — it is not visible in this cell
// - The whole cell is timed with a single Metrics.withJob call
// =====================================================================
import org.apache.spark.sql.{DataFrame, Row, Column}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// ------------------------
// 0) Objet Metrics (inchangé)
// ------------------------
/**
 * Timing helpers that tag Spark work with a job group and print `[METRIC]` lines.
 *
 * Relies on the notebook-level `spark` session being in scope.
 */
object Metrics {
  import org.apache.spark.storage.StorageLevel

  /**
   * Runs `body` under the Spark job group `id`/`desc` and prints its wall-clock duration.
   *
   * The job group is cleared even when `body` throws, so a failed cell does not
   * leave its group attached to jobs started afterwards. The duration line is
   * printed only when `body` completes normally.
   *
   * @param id   job group id, also used as the metric tag
   * @param desc human-readable description of the work
   * @param body code to execute and time (evaluated exactly once)
   * @return the value produced by `body`
   */
  def withJob[T](id: String, desc: String)(body: => T): T = {
    val sc = spark.sparkContext
    sc.setJobGroup(id, desc, interruptOnCancel = false)
    val t0 = System.nanoTime()
    try {
      val res = body
      val dtMs = (System.nanoTime() - t0) / 1e6
      println(f"[METRIC][$id] $desc took ${dtMs}%.2f ms")
      res
    } finally {
      sc.clearJobGroup()
    }
  }

  /**
   * Persists `rdd` at `lvl`, counts it under [[withJob]] and prints the record
   * and partition counts.
   *
   * @return the number of records in `rdd`
   */
  def persistCount(id: String, desc: String, rdd: org.apache.spark.rdd.RDD[_], lvl: StorageLevel = StorageLevel.MEMORY_ONLY): Long = {
    rdd.persist(lvl)
    val n = withJob(id, desc) { rdd.count() }
    println(s"[METRIC][$id] records=$n partitions=${rdd.getNumPartitions}")
    n
  }

  /**
   * Persists `df` at `lvl`, counts it under [[withJob]] and prints the record
   * and partition counts.
   *
   * @return the number of rows in `df`
   */
  def persistCountDF(id: String, desc: String, df: DataFrame, lvl: StorageLevel = StorageLevel.MEMORY_ONLY): Long = {
    df.persist(lvl)
    val n = withJob(id, desc) { df.count() }
    println(s"[METRIC][$id] records=$n partitions=${df.rdd.getNumPartitions}")
    n
  }
}

// =====================================================================
// 1) Jointure complète exécutée sous un SEUL job
// =====================================================================


// ------------------------
// Paramètres de partitions "reducers"
// ------------------------

// Runs the project's FlightWeatherDataJoiner on the same inputs as the inline
// join, timed as a single Metrics.withJob call, and prints the resulting row count.
Metrics.withJob("flight-joins-df-In-FlightWeatherDataJoiner", "Build flights joins DataFrame in class FlightWeatherDataJoiner") {
    // Column names to keep, taken from the keys of the configured feature maps.
    val flightCols = experiment.featureExtraction.flightSelectedFeatures.map(_.keys.toSeq)
    val weatherCols = experiment.featureExtraction.weatherSelectedFeatures.map(_.keys.toSeq)

    println(flightCols)

    // Window depths (hours of weather before departure / arrival) come from the
    // experiment configuration.
    // NOTE(review): the meaning of the positional `true` flag is not visible here —
    // confirm against FlightWeatherDataJoiner.joinFlightsWithWeather and pass it by name.
    val joinedDF = FlightWeatherDataJoiner.joinFlightsWithWeather(
        flightDF_mCovered,
        weatherDF_pruned,
        experiment.featureExtraction.weatherOriginDepthHours,
        experiment.featureExtraction.weatherDestinationDepthHours,
        true,
        flightCols,
        weatherCols
    )

    // The first count is the un-joined input, not an intermediate join result,
    // so it is labelled as such.
    println(
      s"Input flight rows: ${flightDF_mCovered.count()}, rows after ORIGIN+DEST joins: ${joinedDF.count()}"
    )
}    

Some(Vector(OP_CARRIER_AIRLINE_ID, DEST_AIRPORT_ID, ORIGIN_AIRPORT_ID, CRS_ELAPSED_TIME, feature_arrival_time_period, CRS_DEP_TIME, feature_flight_week_of_year, feature_departure_time_period, feature_departure_hour_rounded))
[INFO] [FlightWeatherJoiner] Starting origin join with 3 hours depth
[INFO] [FlightWeatherJoiner] Weather bucketing completed for origin
[INFO] [FlightWeatherJoiner] Colonnes ajoutées automatiquement pour origin: ORIGIN_WBAN, feature_utc_departure_hour_rounded, UTC_ARR_DATE, DEST_WBAN, UTC_FL_DATE, feature_utc_arrival_hour_rounded
[INFO] [FlightWeatherJoiner] Completed origin join
[INFO] [FlightWeatherJoiner] Starting destination join with 3 hours depth
[INFO] [FlightWeatherJoiner] Weather bucketing completed for destination
[INFO] [FlightWeatherJoiner] Colonnes ajoutées automatiquement pour destination: ORIGIN_WBAN, feature_utc_departure_hour_rounded, UTC_ARR_DATE, DEST_WBAN, UTC_FL_DATE, feature_utc_arrival_hour_rounded
[INFO] [FlightWeatherJoiner] Completed dest

org.apache.spark.SparkException: Multiple failures in stage materialization.