# Data Jointure V2

## Configuration

In [1]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Using cached version of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [2]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader, ExperimentConfig}
import com.flightdelay.data.loaders.FlightDataLoader

// Environment configuration: the single "jupyter" argument is handed to the project's
// configuration loader, and the first configured experiment is used for this notebook.
val args: Array[String] = Array("jupyter")
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)
// Fail with an explicit message (instead of a bare index error) when no experiment is configured.
implicit val experiment: ExperimentConfig = configuration.experiments.headOption.getOrElse(
  throw new IllegalStateException("No experiment found in the loaded configuration")
)

// Reuse the kernel-provided SparkContext configuration and request event logging.
val spark = SparkSession.builder()
  .config(sc.getConf)
  .config("spark.eventLog.enabled", "true")
  .config("spark.eventLog.dir", "file:///home/jovyan/work/spark-events")
  .getOrCreate()

// Expose the Spark session implicitly; the explicit type keeps implicit resolution predictable.
implicit val session: SparkSession = spark



args = Array(jupyter)
configuration = AppConfiguration(local,CommonConfig(42,true,debug,false,false,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/20101*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileConfig(/home/jovyan/work/output/model)),MLFlowConfig(false,http://localhost:5555)),Stream(ExperimentConfig(Experience-jupyter,Baseline Rando...


AppConfiguration(local,CommonConfig(42,true,debug,false,false,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/20101*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileConfig(/home/jovyan/work/output/model)),MLFlowConfig(false,http://localhost:5555)),Stream(ExperimentConfig(Experience-jupyter,Baseline Rando...

## Chargement des données

In [3]:
// Load the pre-processed flights dataset from the configured output directory.
val flightDFPath = s"${configuration.common.output.basePath}/common/data/processed_flights.parquet"
val flightDF = spark.read.parquet(flightDFPath)

// Interpolate the count: println with two arguments is auto-tupled and prints "(label,value)".
println(s"Flight DF Count: ${flightDF.count()}")

(Flight DF Count: ,3908461)


flightDFPath = /home/jovyan/work/output/common/data/processed_flights.parquet
flightDF = [OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 28 more fields]


[OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 28 more fields]

In [4]:
// Load the pre-processed weather dataset from the configured output directory.
val weatherDFPath = s"${configuration.common.output.basePath}/common/data/processed_weather.parquet"
val weatherDF = spark.read.parquet(weatherDFPath)

// Interpolate the count: println with two arguments is auto-tupled and prints "(label,value)".
println(s"Weather DF Count: ${weatherDF.count()}")

weatherDFPath = /home/jovyan/work/output/common/data/processed_weather.parquet
weatherDF = [WindDirection: double, PressureChange: double ... 63 more fields]


(Weather DF Count: ,1549320)


[WindDirection: double, PressureChange: double ... 63 more fields]

## Jointure

In [5]:
// Inputs of the join. Both are currently plain aliases of the loaded DataFrames
// (no filtering or pruning is applied in this cell).
val flightDF_mCovered = flightDF
val weatherDF_pruned = weatherDF

// Interpolate the counts: println with two arguments is auto-tupled and prints "(label,value)".
println(s"Flight for join count -> ${flightDF_mCovered.count()}")
println(s"Weather for join count -> ${weatherDF_pruned.count()}")

(Flight for join count ->,3908461)
(Weather for join count ->,1549320)


flightDF_mCovered = [OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 28 more fields]
weatherDF_pruned = [WindDirection: double, PressureChange: double ... 63 more fields]


[WindDirection: double, PressureChange: double ... 63 more fields]

In [8]:
// =====================================================================
// Flight × Weather en DataFrame (Map → Hash partition → Reduce)
// - Fenêtre 12h avant départ (Wo_*) et 12h avant arrivée (Wd_*)
// - Gestion veille via relHour (duplication J et J+1)
// - Un seul job global avec Metrics.withJob
// =====================================================================
import org.apache.spark.sql.{DataFrame, Row, Column}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// ------------------------
// 0) Objet Metrics (inchangé)
// ------------------------
/** Small timing helpers that tag Spark work with a job group and print wall-clock metrics. */
object Metrics {
  import org.apache.spark.storage.StorageLevel

  /**
   * Runs `body` inside the Spark job group `id`, prints the elapsed wall-clock time on
   * success, and returns the result of `body`.
   *
   * The job group is cleared in a `finally` block, so a failing `body` does not leave
   * later, unrelated jobs tagged with this group. Exceptions from `body` are rethrown.
   *
   * @param id   job-group identifier, also used as the metric tag in the printed line
   * @param desc human-readable description attached to the job group
   * @param body computation to time; evaluated exactly once
   */
  def withJob[T](id: String, desc: String)(body: => T): T = {
    val sc = spark.sparkContext
    sc.setJobGroup(id, desc, interruptOnCancel = false)
    val t0 = System.nanoTime()
    try {
      val res = body
      val dtMs = (System.nanoTime() - t0) / 1e6
      println(f"[METRIC][$id] $desc took ${dtMs}%.2f ms")
      res
    } finally {
      sc.clearJobGroup()
    }
  }

  /**
   * Marks `rdd` as persisted at `lvl`, counts it under a timed job group, then prints the
   * record count and the number of partitions.
   *
   * @return the number of records in `rdd`
   */
  def persistCount(id: String, desc: String, rdd: org.apache.spark.rdd.RDD[_], lvl: StorageLevel = StorageLevel.MEMORY_ONLY): Long = {
    rdd.persist(lvl)
    val n = withJob(id, desc) { rdd.count() }
    println(s"[METRIC][$id] records=$n partitions=${rdd.getNumPartitions}")
    n
  }

  /**
   * DataFrame variant of [[persistCount]]: marks `df` as persisted at `lvl`, counts it under
   * a timed job group, then prints the record count and the number of partitions.
   *
   * @return the number of rows in `df`
   */
  def persistCountDF(id: String, desc: String, df: DataFrame, lvl: StorageLevel = StorageLevel.MEMORY_ONLY): Long = {
    df.persist(lvl)
    val n = withJob(id, desc) { df.count() }
    println(s"[METRIC][$id] records=$n partitions=${df.rdd.getNumPartitions}")
    n
  }
}

// =====================================================================
// 1) Jointure complète exécutée sous un SEUL job
// =====================================================================


// ------------------------
// Paramètres de partitions "reducers"
// ------------------------

// Builds the Flight x Weather join under a single job group and returns the joined,
// cached DataFrame so that it stays reachable (and can be unpersisted) from later cells.
// For each flight, the 12 hourly weather observations preceding departure at the origin
// station (Wo_h1..Wo_h12) and preceding arrival at the destination station
// (Wd_h1..Wd_h12) are attached as struct columns.
val joinedDF: DataFrame = Metrics.withJob("flight-joins-df", "Build flights joins DataFrame") {
  // ------------------------
  // "Reducer" partition counts: scaled from the default parallelism, clamped to [minAbs, maxAbs]
  // ------------------------
  val cores = spark.sparkContext.defaultParallelism
  def pickParts(mult: Double, minAbs: Int, maxAbs: Int): Int =
    math.min(maxAbs, math.max(minAbs, math.round(cores * mult).toInt))

  val numPartsOrigin = pickParts(3.3, 32, 128)
  val numPartsDest   = pickParts(5.2, 48, 192)
  // Upper bound for DataFrame shuffles. NOTE: this is a session-wide setting and is
  // intentionally left in place after the block, as in the original cell.
  spark.conf.set("spark.sql.shuffle.partitions", numPartsDest)

  // ------------------------
  // Selection of the useful columns (Map)
  // ------------------------
  // ORIGIN_WBAN / DEST_WBAN are selected explicitly up front, so they are excluded from
  // the pass-through list; selecting them again through "*" would yield duplicate column
  // names, which breaks e.g. writing the result to Parquet.
  val explicitKeys = Seq("ORIGIN_WBAN", "DEST_WBAN")
  val passThroughCols = flightDF_mCovered.columns
    .filterNot(name => explicitKeys.exists(_.equalsIgnoreCase(name)))
    .map(name => col(s"`$name`"))

  val leadingCols = Seq(
    col("feature_flight_unique_id").as("flightId"),
    col("ORIGIN_WBAN"), col("DEST_WBAN"),
    col("UTC_FL_DATE").cast(DateType).as("DEP_DATE"),
    col("UTC_ARR_DATE").cast(DateType).as("ARR_DATE"),
    col("UTC_CRS_DEP_TIME").as("DEP_TIME_HHMM"),
    col("UTC_ARR_TIME").as("ARR_TIME_HHMM")
  )
  val flights = flightDF_mCovered.select(leadingCols ++ passThroughCols: _*)

  val weather = weatherDF_pruned.select(
    trim(col("WBAN")).as("WBAN"),
    col("Date").cast(DateType).as("WDATE"),
    col("Time").as("WTIME_HHMM"),
    col("WindSpeed").as("ws"),
    col("WindDirection").as("wd"),
    col("DryBulbCelsius").as("tempC"),
    col("SeaLevelPressure").as("slp"),
    col("HourlyPrecip").as("precip"),
    col("feature_weather_severity_index"),
    col("feature_flight_category_ordinal")
  ).where(col("WBAN").isNotNull && length(col("WBAN")) > 0 && col("WDATE").isNotNull)

  // ------------------------
  // HHMM -> hour of day, without a UDF (Map)
  // ------------------------
  // Strips ":" and left-pads to 4 characters, then takes the first two as the hour (mod 24).
  // Yields null when the value cannot be cast to an integer.
  def hhmmHourCol(c: Column): Column = {
    val s  = regexp_replace(c.cast("string"), ":", "")
    val p4 = lpad(s, 4, "0")
    (substring(p4, 1, 2).cast("int") % 24)
  }

  // ------------------------
  // Weather preparation with relHour + duplication on day D and D+1 (Map)
  // ------------------------
  // Observations whose hour cannot be parsed are dropped. Filling them with -1 (as before)
  // made them pass the relHour filter below and appear as "23h of the previous day".
  val weatherWithHour = weather
    .withColumn("hour", hhmmHourCol(col("WTIME_HHMM")))
    .filter(col("hour").between(0, 23))

  // Same-day view: relHour in [0, 23], keyed by the observation date.
  val meteoSameDay = weatherWithHour
    .withColumn("relHour", col("hour"))
    .withColumn("DATE", col("WDATE"))

  // Next-day view: the same observation seen from D+1, relHour in [-24, -1].
  val meteoNextDay = weatherWithHour
    .withColumn("relHour", col("hour") - lit(24))
    .withColumn("DATE", date_add(col("WDATE"), 1))

  val weatherRel = meteoSameDay.unionByName(meteoNextDay)
    .filter(col("relHour").between(-24, 23))

  // ------------------------
  // Reduce weather per key -> map relHour -> struct (Reduce)
  // ------------------------
  val weatherStruct =
    struct(
      col("relHour").as("hour"),
      col("WBAN"), col("WDATE"), col("WTIME_HHMM"),
      col("ws"), col("wd"),
      col("tempC"), col("slp"), col("precip"),
      col("feature_weather_severity_index"),
      col("feature_flight_category_ordinal")
    )

  // (WBAN, DATE, wmap: map<int, struct{hour,WBAN,WDATE,WTIME_HHMM,...}>)
  // NOTE(review): assumes at most one observation per (WBAN, DATE, relHour). How duplicate
  // map keys are handled depends on the Spark version/configuration — confirm upstream dedup.
  val weatherByKey: DataFrame =
    weatherRel
      .groupBy(col("WBAN"), col("DATE"))
      .agg(
        map_from_entries(
          collect_list(struct(col("relHour"), weatherStruct))
        ).as("wmap")
      )

  // ------------------------
  // Shared join step: hash-partition flights on (station, date), left-join the per-day
  // weather map, then expand the 12 hours preceding `hourCol` into <prefix>_h1..<prefix>_h12.
  // ------------------------
  def attachWeatherWindow(
      df: DataFrame,
      numParts: Int,
      wbanCol: String,
      dateCol: String,
      hourCol: String,
      prefix: String
  ): DataFrame = {
    val withWindow = df
      .repartition(numParts, col(wbanCol), col(dateCol)) // explicit hash partitioning
      .join(
        weatherByKey.hint("shuffle_hash"),
        col(wbanCol) === weatherByKey("WBAN") &&
        col(dateCol) === weatherByKey("DATE"),
        "left"
      )
      .drop(weatherByKey("WBAN")).drop(weatherByKey("DATE"))
      .withColumn(prefix, expr(s"transform(sequence(1, 12), i -> element_at(wmap, $hourCol - i))"))
      .drop("wmap")

    val hourlyCols = (0 until 12).map(i => col(prefix).getItem(i).as(s"${prefix}_h${i + 1}"))
    withWindow
      .select(col("*") +: hourlyCols: _*)
      .drop(prefix)
  }

  // ------------------------
  // JOIN #1 — ORIGIN (partition = hash(ORIGIN_WBAN, DEP_DATE))
  // ------------------------
  val flightsDep = flights
    .withColumn("depHour", coalesce(hhmmHourCol(col("DEP_TIME_HHMM")), lit(0)))

  val originDF = attachWeatherWindow(
    flightsDep, numPartsOrigin, "ORIGIN_WBAN", "DEP_DATE", "depHour", "Wo"
  ).persist()

  // ------------------------
  // JOIN #2 — DEST (partition = hash(DEST_WBAN, ARR_DATE))
  // ------------------------
  val flightsArr = originDF
    .withColumn("arrHour", coalesce(hhmmHourCol(col("ARR_TIME_HHMM")), lit(0)))

  val joined = attachWeatherWindow(
    flightsArr, numPartsDest, "DEST_WBAN", "ARR_DATE", "arrHour", "Wd"
  ).persist()

  // ------------------------
  // Final actions (trigger execution and fill both caches)
  // ------------------------
  val originRows = originDF.count()
  val joinedRows = joined.count()
  println(s"Rows after ORIGIN join: $originRows, rows after DEST join: $joinedRows")

  // The intermediate cache is only needed to build `joined`, which is now materialised;
  // release it, since `originDF` is not reachable outside this block.
  originDF.unpersist()

  joined
}

Rows after ORIGIN join: 3908461, rows after DEST join: 3908461
[METRIC][flight-joins-df] Build flights joins DataFrame took 33418.68 ms


lastException = null
defined object Metrics


null

In [13]:
// Display a single row vertically (one line per column), truncating values to 1000 characters.
joinedDF.show(numRows = 1, truncate = 1000, vertical = true)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------
 flightId                           | 2013-06-18_19393_3747_12191_12892                                                           
 ORIGIN_WBAN                        | 12918                                                                                       
 DEST_WBAN                          | 23174                                                                                       
 DEP_DATE                           | 2013-06-18                                                                                  
 ARR_DATE                           | 2013-06-19                                                                                  
 DEP_TIME_HHMM                      | 2110                                                                                        
 ARR_TIME_HHMM                      | 0036                                         