# Data Jointure V4

## Configuration

In [1]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Starting download from file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar
Finished download of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [2]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader, ExperimentConfig}
import com.flightdelay.data.loaders.FlightDataLoader

// Environment configuration: load the application configuration for the "jupyter"
// environment and expose it (and its first experiment) as implicits.
val args: Array[String] = Array("jupyter")
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)
// headOption instead of experiments(0): an empty experiment list now fails with an
// explicit message rather than an index error.
implicit val experimentConfig: ExperimentConfig =
  configuration.experiments.headOption.getOrElse(
    throw new IllegalStateException("No experiment is defined in the loaded configuration")
  )

// NOTE(review): `sc` already exists when this runs, so getOrCreate() presumably returns the
// kernel's existing session. The spark.eventLog.* options are read when the SparkContext
// starts, so the two settings below may have no effect here - confirm in the Spark UI.
val spark = SparkSession.builder()
  .config(sc.getConf)
  .config("spark.eventLog.enabled", "true")
  .config("spark.eventLog.dir", s"${configuration.common.output.basePath}/spark-events")  // e.g. "file:/tmp/spark-events" or "hdfs:///spark-events"
  .getOrCreate()

// Make the Spark session available implicitly (explicit type: implicit definitions
// should always be annotated).
implicit val session: SparkSession = spark



args = Array(jupyter)
configuration = AppConfiguration(local,CommonConfig(42,true,debug,false,false,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/20101*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileConfig(/home/jovyan/work/output/model),None),MLFlowConfig(false,http://localhost:5555),/scripts),Stream(ExperimentConfig(Experience-local,Ba...


AppConfiguration(local,CommonConfig(42,true,debug,false,false,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/20101*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileConfig(/home/jovyan/work/output/model),None),MLFlowConfig(false,http://localhost:5555),/scripts),Stream(ExperimentConfig(Experience-local,Ba...

## Chargement des données

In [3]:
// Load the pre-processed flights dataset written under the configured output directory.
val flightDFPath = s"${configuration.common.output.basePath}/common/data/processed_flights.parquet"
val flightData = spark.read.parquet(flightDFPath)

// Interpolate the count: println with two arguments is auto-tupled and prints
// "(Flight DF Count: ,N)" instead of a readable message.
println(s"Flight DF Count: ${flightData.count()}")

(Flight DF Count: ,3908458)


flightDFPath = /home/jovyan/work/output/common/data/processed_flights.parquet
flightData = [OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 32 more fields]


[OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 32 more fields]

In [4]:
// Load the pre-processed weather dataset written under the configured output directory.
val weatherDFPath = s"${configuration.common.output.basePath}/common/data/processed_weather.parquet"
val weatherData = spark.read.parquet(weatherDFPath)

// Interpolate the count: println with two arguments is auto-tupled and prints
// "(Weather DF Count: ,N)" instead of a readable message.
println(s"Weather DF Count: ${weatherData.count()}")

weatherDFPath = /home/jovyan/work/output/common/data/processed_weather.parquet
weatherData = [RelativeHumidity: double, feature_visibility_category: string ... 16 more fields]


(Weather DF Count: ,1549320)


[RelativeHumidity: double, feature_visibility_category: string ... 16 more fields]

## Préparation

In [5]:
import com.flightdelay.features.balancer.DelayBalancedDatasetBuilder
import com.flightdelay.utils.DebugUtils._
import com.flightdelay.utils.MetricsUtils

// Weather look-back depths read from the experiment's feature-extraction settings.
val weatherOriginDepthHours = experimentConfig.featureExtraction.weatherOriginDepthHours
val weatherDestinationDepthHours = experimentConfig.featureExtraction.weatherDestinationDepthHours

// Flights prepared by the project's balanced-dataset builder, using the configured dxCol.
val labeledFlightData = DelayBalancedDatasetBuilder.prepareLabeledDataset(
  df = flightData,
  dxCol = experimentConfig.featureExtraction.dxCol
)

// Names of the selected flight features (if any are configured), with the experiment
// target appended when the selection does not already list it.
val flightFeaturesWithTarget =
  experimentConfig.featureExtraction.flightSelectedFeatures.map { selected =>
    val target = experimentConfig.target
    selected.keys.toSeq match {
      case names if names.contains(target) =>
        names
      case names =>
        info(s"  - Automatically adding target '${experimentConfig.target}' to flight features")
        names :+ target
    }
  }

// Names of the selected weather features (if any are configured).
val weatherFeatures =
  experimentConfig.featureExtraction.weatherSelectedFeatures.map(selected => selected.keys.toSeq)

weatherOriginDepthHours = 3
weatherDestinationDepthHours = 3
labeledFlightData = [OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 33 more fields]
flightFeaturesWithTarget = Some(Vector(OP_CARRIER_AIRLINE_ID, DEST_AIRPORT_ID, ORIGIN_AIRPORT_ID, feature_departure_hour_rounded_cos, CRS_ELAPSED_TIME, feature_departure_hour_rounded_sin, feature_arrival_time_period, feature_flight_week_of_year_cos, CRS_DEP_TIME, feature_flight_week_of_year_sin, feature_departure_time_period, is_delayed))
weatherFeatures = Some(Vector(RelativeHumidity, feature_visibility_c...


[INFO] 25/12/11 21:06:50   - Automatically adding target 'is_delayed' to flight features


Some(Vector(RelativeHumidity, feature_visibility_c...

In [6]:
// =====================================================================
// Flight × Weather as DataFrames (Map → Hash partition → Reduce)
// - Window: the 3 hourly slots before departure (Wo_h1..Wo_h3) and before arrival (Wd_h1..Wd_h3)
// - Previous-day lookups handled through relHour (each observation is duplicated on day D and D+1)
// - The whole pipeline runs inside a single MetricsUtils.withUiLabels block
// =====================================================================
import org.apache.spark.sql.{DataFrame, Row, Column}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._


// ------------------------
// "Reducer" partition-count parameters
// ------------------------

// Default parallelism reported by the SparkContext; the partition counts below scale with it.
val cores = spark.sparkContext.defaultParallelism

/** Scales `cores` by `mult`, rounds to the nearest integer, then clamps the result
  * to the range [`minAbs`, `maxAbs`].
  */
def pickParts(mult: Double, minAbs: Int, maxAbs: Int): Int = {
  val scaled = math.round(cores * mult).toInt
  scaled.max(minAbs).min(maxAbs)
}

val numPartsOrigin = pickParts(3.3, 32, 128) // 46 on the recorded 14-core run
val numPartsDest   = pickParts(5.2, 48, 192) // 73 on the recorded 14-core run
spark.conf.set("spark.sql.shuffle.partitions", numPartsDest) // shuffle partition count for DataFrame operations

// ------------------------
// Helper: HHMM -> hour column, built from Spark column functions only (no UDF) (Map)
// ------------------------
/** Derives an hour column from an HHMM-style column: the value is cast to string,
  * ':' characters are removed, the text is left-padded with '0' to 4 characters,
  * and the first two characters are cast to int and taken modulo 24.
  */
def hhmmHourCol(c: Column): Column = {
  val digits   = lpad(regexp_replace(c.cast("string"), ":", ""), 4, "0")
  val hourPart = substring(digits, 1, 2).cast("int")
  hourPart % 24
}



cores = 14
numPartsOrigin = 46
numPartsDest = 73


pickParts: (mult: Double, minAbs: Int, maxAbs: Int)Int
hhmmHourCol: (c: org.apache.spark.sql.Column)org.apache.spark.sql.Column


73

In [7]:

import com.flightdelay.utils.MetricsUtils

// Joins the labeled flights with hourly weather twice - once on (ORIGIN_WBAN, UTC_FL_DATE)
// for the hours before departure, once on (DEST_WBAN, UTC_ARR_DATE) for the hours before
// arrival - and prints the row count after each join.
MetricsUtils.withUiLabels(
  groupId = "NoteBook.FeaturePipeline.join",
  desc    = "NoteBook.FeaturePipeline.join",
  tags    = "sampling,split,balance"
) {
    // NOTE(review): every val declared inside these braces is local to the block, yet a
    // later cell calls joinedDF.printSchema at top level - confirm how joinedDF is meant
    // to be exposed to the rest of the notebook.

    // Look-back depths come from the experiment configuration instead of a hard-coded 3.
    val originDepth = weatherOriginDepthHours
    val destDepth   = weatherDestinationDepthHours
    // wmap only holds relHour keys in [-24, 23]; a deeper look-back would silently yield
    // null structs. A depth of 0 simply produces no weather columns.
    require(originDepth >= 0 && originDepth <= 24,
      s"weatherOriginDepthHours must be in [0, 24], got $originDepth")
    require(destDepth >= 0 && destDepth <= 24,
      s"weatherDestinationDepthHours must be in [0, 24], got $destDepth")

    // ------------------------
    // Column selection (Map)
    // ------------------------
    val flights = labeledFlightData

    // Keep all weather columns, plus a typed date (WDATE) and the raw time (WTIME_HHMM);
    // rows without a usable WBAN or date are discarded.
    val weather = weatherData.select(
        col("Date").cast(DateType).as("WDATE"),
        col("Time").as("WTIME_HHMM"),
        col("*")
    ).where(col("WBAN").isNotNull && length(col("WBAN")) > 0 && col("WDATE").isNotNull)

    // ------------------------
    // Weather preparation: relHour + duplication on day D and D+1 (Map)
    // ------------------------
    // Rows whose time cannot be parsed are dropped. Filling them with -1 (as before) let
    // them pass the [-24, 23] filter below and collide with the genuine relHour = -1 key,
    // i.e. the 23h observation of the previous day.
    val weatherWithHour = weather
      .withColumn("hour", hhmmHourCol(col("WTIME_HHMM")))
      .filter(col("hour").isNotNull)

    // Observation seen from its own day: relHour in [0, 23].
    val meteoSameDay = weatherWithHour
      .withColumn("relHour", col("hour"))
      .withColumn("DATE", col("WDATE"))

    // Same observation seen from the following day: relHour in [-24, -1].
    val meteoNextDay = weatherWithHour
      .withColumn("relHour", col("hour") - lit(24))
      .withColumn("DATE", date_add(col("WDATE"), 1))

    val weatherRel = meteoSameDay.unionByName(meteoNextDay)
      .filter(col("relHour").between(-24, 23))

    // ------------------------
    // Reduce weather per key: Map relHour -> struct (Reduce)
    // ------------------------
    // Fixed fields of the per-hour struct.
    val staticCols = Seq(
      col("relHour").as("hour"),
      col("WBAN"),
      col("WDATE"),
      col("WTIME_HHMM")
    )

    // Configured feature columns, minus those already present as fixed fields.
    val staticNames = Set("relHour", "WBAN", "WDATE", "WTIME_HHMM")

    val allWeatherCols: Seq[String] =
      weatherFeatures.getOrElse(Seq.empty[String])

    val dynamicFeatureCols =
      allWeatherCols
        .filterNot(staticNames.contains)
        .map(c => col(c).alias(c))

    // Final struct: [hour, WBAN, WDATE, WTIME_HHMM, <features>...]
    val weatherStruct =
      struct((staticCols ++ dynamicFeatureCols): _*)

    // One row per (WBAN, DATE) carrying a map relHour -> struct(...).
    // NOTE(review): assumes at most one observation per (WBAN, DATE, relHour); duplicate
    // map keys are rejected or ambiguous depending on the Spark version - confirm.
    val weatherByKey: DataFrame =
      weatherRel
        .groupBy(col("WBAN"), col("DATE"))
        .agg(
          map_from_entries(
            collect_list(struct(col("relHour"), weatherStruct))
          ).as("wmap")
        )

    // ------------------------
    // JOIN #1 - ORIGIN (partition = hash(ORIGIN_WBAN, UTC_FL_DATE))
    // ------------------------
    // A missing or unparseable departure time falls back to hour 0.
    val flightsDep = flights
      .withColumn("depHour", coalesce(hhmmHourCol(col("UTC_CRS_DEP_TIME")), lit(0)))

    val originPre = flightsDep
      .repartition(numPartsOrigin, col("ORIGIN_WBAN"), col("UTC_FL_DATE")) // explicit hash partitioning
      .join(
        weatherByKey.hint("shuffle_hash"),
        col("ORIGIN_WBAN") === weatherByKey("WBAN") &&
        col("UTC_FL_DATE")    === weatherByKey("DATE"),
        "left"
      )
      .drop(weatherByKey("WBAN")).drop(weatherByKey("DATE"))

    // Wo(i-1) = weather struct found i hours before the departure hour (null when absent).
    val originWithWoArr = originPre
      .withColumn("Wo", expr(s"transform(sequence(1, $originDepth), i -> element_at(wmap, depHour - i))"))
      .drop("wmap")

    val woCols = (1 to originDepth).map(i => col("Wo").getItem(i - 1).as(s"Wo_h$i"))
    val originDF = originWithWoArr
      .select(col("*") +: woCols: _*)
      .drop("Wo")
      .persist()

    // ------------------------
    // JOIN #2 - DEST (partition = hash(DEST_WBAN, UTC_ARR_DATE))
    // ------------------------
    // A missing or unparseable arrival time falls back to hour 0.
    val flightsArr = originDF
      .withColumn("arrHour", coalesce(hhmmHourCol(col("UTC_ARR_TIME")), lit(0)))

    val destPre = flightsArr
      .repartition(numPartsDest, col("DEST_WBAN"), col("UTC_ARR_DATE"))     // explicit hash partitioning
      .join(
        weatherByKey.hint("shuffle_hash"),
        col("DEST_WBAN") === weatherByKey("WBAN") &&
        col("UTC_ARR_DATE")  === weatherByKey("DATE"),
        "left"
      )
      .drop(weatherByKey("WBAN")).drop(weatherByKey("DATE"))

    // Wd(i-1) = weather struct found i hours before the arrival hour (null when absent).
    val destWithWdArr = destPre
      .withColumn("Wd", expr(s"transform(sequence(1, $destDepth), i -> element_at(wmap, arrHour - i))"))
      .drop("wmap")

    val wdCols = (1 to destDepth).map(i => col("Wd").getItem(i - 1).as(s"Wd_h$i"))

    val baseCols: Seq[org.apache.spark.sql.Column] =
      originDF.columns.map(col).toSeq

    // Selecting only baseCols and the Wd_h* columns already leaves out the helper
    // columns arrHour and Wd, so no further drop is needed.
    val joinedDF = destWithWdArr
      .select( (baseCols ++ wdCols): _* )
      .persist()

    // ------------------------
    // Final action (triggers execution)
    // ------------------------
    println(s"Rows after ORIGIN join: ${originDF.count()}, rows after DEST join: ${joinedDF.count()}")
}    


Rows after ORIGIN join: 3908458, rows after DEST join: 3908458


flights = [OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 33 more fields]
weather = [WDATE: date, WTIME_HHMM: string ... 18 more fields]
weatherWithHour = [WDATE: date, WTIME_HHMM: string ... 19 more fields]
meteoSameDay = [WDATE: date, WTIME_HHMM: string ... 20 more fields]
meteoNextDay = [WDATE: date, WTIME_HHMM: string ... 20 more fields]
weatherRel = [WDATE: date, WTIME_HHMM: string ... 20 more fields]
staticCols = List(relHour AS hour, WBAN, WDATE, WTIME_HHMM)


staticNames: scala.collection.immutable.Set[Str...


List(relHour AS hour, WBAN, WDATE, WTIME_HHMM)

In [22]:
// Print the schema of the joined flight × weather DataFrame.
joinedDF.printSchema()

root
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- feature_arrival_time_period: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- feature_flight_week_of_year: integer (nullable = true)
 |-- feature_departure_time_period: string (nullable = true)
 |-- feature_departure_hour_rounded: long (nullable = true)
 |-- ARR_DELAY_NEW: double (nullable = true)
 |-- D1: integer (nullable = true)
 |-- D2_15: integer (nullable = true)
 |-- D2_30: integer (nullable = true)
 |-- D2_45: integer (nullable = true)
 |-- D2_60: integer (nullable = true)
 |-- D2_90: integer (nullable = true)
 |-- D3: integer (nullable = true)
 |-- D4: integer (nullable = true)
 |-- UTC_FL_DATE: date (nullable = true)
 |-- feature_utc_departure_hour_rounded: long (nullable = true)
 |-- feature_utc_arrival_hour_rounded: string (nullable = true)
 