# Data Jointure V4

## Configuration

In [1]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Starting download from file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar
Finished download of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [2]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader, ExperimentConfig}
import com.flightdelay.data.loaders.FlightDataLoader

// Environment configuration: load the application configuration for the "jupyter" argument.
val args: Array[String] = Array("jupyter")
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)

// Use the first declared experiment; fail with an explicit message instead of a bare
// index error when the loaded configuration declares none.
implicit val experimentConfig: ExperimentConfig =
  configuration.experiments.headOption.getOrElse(
    throw new IllegalStateException("No experiment is defined in the loaded configuration")
  )

// Build (or reuse) the session on top of the kernel-provided SparkContext configuration.
// NOTE(review): `sc` already exists when this cell runs, so getOrCreate() presumably reuses
// the running context; core settings such as spark.eventLog.* may then be ignored — confirm
// in the Spark UI / logs, or set them in the kernel's Spark options instead.
val spark: SparkSession = SparkSession.builder()
  .config(sc.getConf)
  .config("spark.eventLog.enabled", "true")
  .config("spark.eventLog.dir", s"${configuration.common.output.basePath}/spark-events")  // e.g. "file:/tmp/spark-events" or "hdfs:///spark-events"
  .getOrCreate()

// Expose the Spark session implicitly; the explicit type keeps implicit resolution predictable.
implicit val session: SparkSession = spark



args = Array(jupyter)
configuration = AppConfiguration(local,CommonConfig(42,true,debug,false,false,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/20101*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileConfig(/home/jovyan/work/output/model),None),MLFlowConfig(false,http://localhost:5555),/scripts),Stream(ExperimentConfig(Experience-local,Ba...


AppConfiguration(local,CommonConfig(42,true,debug,false,false,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/20101*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileConfig(/home/jovyan/work/output/model),None),MLFlowConfig(false,http://localhost:5555),/scripts),Stream(ExperimentConfig(Experience-local,Ba...

## Chargement des données

In [3]:
// Read the processed flights Parquet dataset located under the configured output base path.
val flightDFPath = s"${configuration.common.output.basePath}/common/data/processed_flights.parquet"
val flightData = spark.read.parquet(flightDFPath)

// Interpolate the count into the message: println("label", n) is auto-tupled by the
// compiler and prints "(label,n)" instead of a readable line.
println(s"Flight DF Count: ${flightData.count()}")

(Flight DF Count: ,3908458)


flightDFPath = /home/jovyan/work/output/common/data/processed_flights.parquet
flightData = [OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 32 more fields]


[OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 32 more fields]

In [4]:
// Read the processed weather Parquet dataset located under the configured output base path.
val weatherDFPath = s"${configuration.common.output.basePath}/common/data/processed_weather.parquet"
val weatherData = spark.read.parquet(weatherDFPath)

// Interpolate the count into the message: println("label", n) is auto-tupled by the
// compiler and prints "(label,n)" instead of a readable line.
println(s"Weather DF Count: ${weatherData.count()}")

weatherDFPath = /home/jovyan/work/output/common/data/processed_weather.parquet
weatherData = [RelativeHumidity: double, feature_visibility_category: string ... 16 more fields]


(Weather DF Count: ,1549320)


[RelativeHumidity: double, feature_visibility_category: string ... 16 more fields]

## Préparation

In [5]:
import com.flightdelay.features.balancer.DelayBalancedDatasetBuilder
import com.flightdelay.utils.DebugUtils._
import com.flightdelay.utils.MetricsUtils

// Weather look-back depths (in hours) read from the experiment configuration.
val weatherOriginDepthHours = experimentConfig.featureExtraction.weatherOriginDepthHours
val weatherDestinationDepthHours = experimentConfig.featureExtraction.weatherDestinationDepthHours


// Flights passed through the project's labeled-dataset preparation, driven by the configured dxCol.
val labeledFlightData = DelayBalancedDatasetBuilder.prepareLabeledDataset(
  df = flightData,
  dxCol = experimentConfig.featureExtraction.dxCol
)

// Names of the selected flight features, with the target column appended when the
// configuration does not already list it (an info line is logged in that case).
val flightFeaturesWithTarget = experimentConfig.featureExtraction.flightSelectedFeatures.map { selected =>
  val names = selected.keys.toSeq
  val targetAlreadyListed = names.contains(experimentConfig.target)
  if (!targetAlreadyListed) {
    info(s"  - Automatically adding target '${experimentConfig.target}' to flight features")
  }
  if (targetAlreadyListed) names else names :+ experimentConfig.target
}

// Names of the selected weather features (None when the configuration selects none).
val weatherFeatures =
  experimentConfig.featureExtraction.weatherSelectedFeatures.map(selected => selected.keys.toSeq)

weatherOriginDepthHours = 3
weatherDestinationDepthHours = 3
labeledFlightData = [OP_CARRIER_AIRLINE_ID: int, DEST_AIRPORT_ID: int ... 33 more fields]
flightFeaturesWithTarget = Some(Vector(OP_CARRIER_AIRLINE_ID, DEST_AIRPORT_ID, ORIGIN_AIRPORT_ID, feature_departure_hour_rounded_cos, CRS_ELAPSED_TIME, feature_departure_hour_rounded_sin, feature_arrival_time_period, feature_flight_week_of_year_cos, CRS_DEP_TIME, feature_flight_week_of_year_sin, feature_departure_time_period, is_delayed))
weatherFeatures = Some(Vector(RelativeHumidity, feature_visibility_c...


[INFO] 25/12/11 22:09:31   - Automatically adding target 'is_delayed' to flight features


Some(Vector(RelativeHumidity, feature_visibility_c...

In [6]:
// =====================================================================
// Flight × Weather as DataFrames (Map → Hash partition → Reduce)
// - Look-back window: the 3 hours before departure (Wo_*) and before arrival (Wd_*)
// - Previous-day observations handled via relHour (each row emitted for day D and D+1)
// - The whole join is wrapped in a single MetricsUtils.withUiLabels block
// =====================================================================
import org.apache.spark.sql.{DataFrame, Row, Column}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._


// ------------------------
// Reducer-side partition counts
// ------------------------


// Default parallelism reported by the Spark context (14 in the recorded run).
val cores = spark.sparkContext.defaultParallelism

/** Scales `cores` by `mult`, rounds to the nearest integer and clamps the result into [minAbs, maxAbs]. */
def pickParts(mult: Double, minAbs: Int, maxAbs: Int): Int = {
  val scaled = math.round(cores * mult).toInt
  val atLeastMin = math.max(minAbs, scaled)
  math.min(maxAbs, atLeastMin)
}

val numPartsOrigin = pickParts(3.3, 32, 128) // 46 with the 14 cores of the recorded run
val numPartsDest   = pickParts(5.2, 48, 192) // 73 with the 14 cores of the recorded run
spark.conf.set("spark.sql.shuffle.partitions", numPartsDest) // upper bound used for DataFrame shuffles

// ------------------------
// HHMM -> hour of day, expressed with Column functions only (no UDF) (Map)
// ------------------------
/**
 * Extracts the hour from a time column written as HHMM or HH:MM.
 *
 * The value is cast to string, colons are removed, the result is left-padded with
 * zeros to four characters, and the first two characters are read as an integer
 * modulo 24 (so "2400" yields 0). The cast yields null when those two characters
 * are not numeric.
 *
 * NOTE(review): lpad(_, 4, _) also truncates strings longer than four characters, so a
 * timestamp-like value would be misread — confirm the callers pass HHMM-style values.
 *
 * @param c column holding the time value
 * @return integer column with the extracted hour
 */
def hhmmHourCol(c: Column): Column = {
  val fourDigits = lpad(regexp_replace(c.cast("string"), ":", ""), 4, "0")
  val hourPart = substring(fourDigits, 1, 2).cast("int")
  hourPart % 24
}



cores = 14
numPartsOrigin = 46
numPartsDest = 73


pickParts: (mult: Double, minAbs: Int, maxAbs: Int)Int
hhmmHourCol: (c: org.apache.spark.sql.Column)org.apache.spark.sql.Column


73

In [8]:
import org.apache.spark.sql.{DataFrame, Column}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.storage.StorageLevel
import com.flightdelay.utils.MetricsUtils

// Joins every labeled flight with the weather observations of its origin station
// (columns Wo_h1..Wo_hN) and of its destination station (columns Wd_h1..Wd_hN) for the
// N hours preceding the departure / arrival hour, then prints the resulting row counts.
//
// NOTE(review): every val declared inside this block (including joinedDF) is local to it;
// a later cell can only see joinedDF if the block's result is bound to a top-level val,
// which depends on what MetricsUtils.withUiLabels returns — confirm before relying on it.
MetricsUtils.withUiLabels(
  groupId = "NoteBook.FeaturePipeline.join",
  desc    = "NoteBook.FeaturePipeline.join",
  tags    = "sampling,split,balance"
) {

  // Look-back depths come from the experiment configuration instead of a hard-coded 3.
  // relHour only spans [-24, 23], so a depth above 24 could never be satisfied, and a
  // depth below 1 would make sequence(1, depth) count downwards.
  val originDepth = weatherOriginDepthHours
  val destDepth   = weatherDestinationDepthHours
  require(originDepth >= 1 && originDepth <= 24,
    s"weatherOriginDepthHours must be within [1, 24], got $originDepth")
  require(destDepth >= 1 && destDepth <= 24,
    s"weatherDestinationDepthHours must be within [1, 24], got $destDepth")

  // ------------------------
  // Column selection (MAP)
  // ------------------------
  val flights = labeledFlightData

  // Keep only observations with a usable station id (WBAN) and a parseable date.
  val weather = weatherData
    .select(
      col("Date").cast(DateType).as("WDATE"),
      col("Time").as("WTIME_HHMM"),
      col("*")
    )
    .where(
      col("WBAN").isNotNull &&
      length(col("WBAN")) > 0 &&
      col("WDATE").isNotNull
    )

  // ------------------------
  // Weather with relHour + duplication on day D / D+1 (MAP)
  // ------------------------
  // Rows whose hour cannot be parsed are dropped. Filling them with a -1 sentinel would
  // give them relHour = -1 on the same day, i.e. the key that legitimately means
  // "23h of the previous day", producing duplicate / wrong entries in the per-day map.
  val weatherWithHour = weather
    .withColumn("hour", hhmmHourCol(col("WTIME_HHMM")))
    .filter(col("hour").isNotNull)
    .persist(StorageLevel.MEMORY_AND_DISK)

  try {
    // Materialize the weather cache (the parquet source is read once for both branches below).
    weatherWithHour.count()

    // Observation seen from its own day: relHour in [0, 23].
    val meteoSameDay = weatherWithHour
      .withColumn("relHour", col("hour"))
      .withColumn("DATE", col("WDATE"))

    // Same observation seen from the following day: relHour in [-24, -1].
    val meteoNextDay = weatherWithHour
      .withColumn("relHour", col("hour") - lit(24))
      .withColumn("DATE", date_add(col("WDATE"), 1))

    val weatherRel = meteoSameDay
      .unionByName(meteoNextDay)
      .filter(col("relHour").between(-24, 23))

    // ------------------------
    // Reduce weather per key -> Map relHour -> struct (REDUCE)
    // ------------------------
    val staticCols = Seq(
      col("relHour").as("hour"),
      col("WBAN"),
      col("WDATE"),
      col("WTIME_HHMM")
    )

    val staticNames = Set("relHour", "WBAN", "WDATE", "WTIME_HHMM")

    val allWeatherCols: Seq[String] =
      weatherFeatures.getOrElse(Seq.empty[String])

    // Selected weather features, minus the ones already carried as static columns.
    val dynamicFeatureCols =
      allWeatherCols
        .filterNot(staticNames.contains)
        .map(c => col(c).alias(c))

    val weatherStruct = struct((staticCols ++ dynamicFeatureCols): _*)

    // One row per (station, date) holding a map relHour -> observation struct.
    // NOTE(review): several observations of one station within the same hour still share a
    // map key; confirm the processed weather data holds at most one row per station and hour.
    val weatherByKey: DataFrame =
      weatherRel
        .groupBy(col("WBAN"), col("DATE"))
        .agg(
          map_from_entries(
            collect_list(struct(col("relHour"), weatherStruct))
          ).as("wmap")
        )
        .persist(StorageLevel.MEMORY_AND_DISK)

    try {
      // Materialize the aggregated weather so both joins below reuse it.
      weatherByKey.count()

      // The per-observation cache is only needed to build weatherByKey; release it before
      // the large joins to lower memory pressure (the recorded run failed with a heap OOM).
      weatherWithHour.unpersist()

      // ------------------------
      // JOIN #1 — ORIGIN
      // ------------------------
      val flightsDep = flights
        .withColumn(
          "depHour",
          coalesce(hhmmHourCol(col("UTC_CRS_DEP_TIME")), lit(0))
        )

      val originPre = flightsDep
        .join(
          // keep a shuffle-hash join: safer than a broadcast when memory is limited
          weatherByKey.hint("shuffle_hash"),
          col("ORIGIN_WBAN") === weatherByKey("WBAN") &&
          col("UTC_FL_DATE") === weatherByKey("DATE"),
          "left"   // use "inner" to drop flights without weather
        )
        .drop(weatherByKey("WBAN"))
        .drop(weatherByKey("DATE"))

      // Wo(i - 1) = observation found i hours before the departure hour (i = 1..originDepth).
      val originWithWoArr = originPre
        .withColumn(
          "Wo",
          expr(s"transform(sequence(1, $originDepth), i -> element_at(wmap, depHour - i))")
        )
        .drop("wmap")

      val woCols = (1 to originDepth).map(h => col("Wo").getItem(h - 1).as(s"Wo_h$h"))

      // Not persisted: very large dataset with a single real reuse.
      val originDF = originWithWoArr
        .select(col("*") +: woCols: _*)
        .drop("Wo")

      // ------------------------
      // JOIN #2 — DEST
      // ------------------------
      val flightsArr = originDF
        .withColumn(
          "arrHour",
          coalesce(hhmmHourCol(col("UTC_ARR_TIME")), lit(0))
        )

      val destPre = flightsArr
        .join(
          weatherByKey.hint("shuffle_hash"),
          col("DEST_WBAN") === weatherByKey("WBAN") &&
          col("UTC_ARR_DATE") === weatherByKey("DATE"),
          "left"   // same here: use "inner" to drop flights without weather
        )
        .drop(weatherByKey("WBAN"))
        .drop(weatherByKey("DATE"))

      // Wd(i - 1) = observation found i hours before the arrival hour (i = 1..destDepth).
      val destWithWdArr = destPre
        .withColumn(
          "Wd",
          expr(s"transform(sequence(1, $destDepth), i -> element_at(wmap, arrHour - i))")
        )
        .drop("wmap")

      val wdCols = (1 to destDepth).map(h => col("Wd").getItem(h - 1).as(s"Wd_h$h"))

      // Keep exactly the columns of the origin-side result, then append the Wd_* columns.
      val baseCols: Seq[Column] =
        originDF.columns.map(col).toSeq

      // Not persisted either: very large DataFrame, only counted below.
      val joinedDF = destWithWdArr
        .select((baseCols ++ wdCols): _*)
        .drop("Wd")

      // ------------------------
      // Final actions (they trigger the execution)
      // ------------------------
      println(
        s"Rows after ORIGIN join: ${originDF.count()}, " +
          s"rows after DEST join: ${joinedDF.count()}"
      )
    } finally {
      // Nothing outside this block can reach the cached data, so always release it,
      // including when a job fails.
      weatherByKey.unpersist()
    }
  } finally {
    // Covers failures that happen before weatherByKey is materialized; a second
    // unpersist on an already released dataset is harmless.
    weatherWithHour.unpersist()
  }
}

[INFO] 25/12/11 22:13:11 NoteBook.FeaturePipeline.join → Starting job: NoteBook.FeaturePipeline.join
Exception in thread "Executor task launch worker for task 3.0 in stage 40.0 (TID 532)" java.lang.SecurityException: Not allowed to invoke System.exit!
	at org.apache.toree.security.KernelSecurityManager.checkExit(KernelSecurityManager.scala:133)
	at java.base/java.lang.Runtime.halt(Runtime.java:277)
	at org.apache.spark.util.SparkUncaughtExceptionHandler.uncaughtException(SparkUncaughtExceptionHandler.scala:76)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:826)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)


lastException = null


org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 40.0 failed 1 times, most recent failure: Lost task 3.0 in stage 40.0 (TID 532) (jupyter-spark executor driver): java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:61)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:348)
	at org.apache.spark.sql.execution.columnar.BasicColumnBuilder.build(ColumnBuilder.scala:81)
	at org.apache.spark.sql.execution.columnar.ComplexColumnBuilder.org$apache$spark$sql$execution$columnar$NullableColumnBuilder$$super$build(ColumnBuilder.scala:93)
	at org.apache.spark.sql.execution.columnar.NullableColumnBuilder.build(NullableColumnBuilder.scala:67)
	at org.apache.spark.sql.execution.columnar.NullableColumnBuilder.build$(NullableColumnBuilder.scala:66)
	at org.apache.spark.sql.execution.columnar.ComplexColumnBuilder.build(ColumnBuilder.scala:93)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.$anonfun$next$4(InMemoryRelation.scala:115)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1$$Lambda$5956/0x0000000801fc9840.apply(Unknown Source)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.TraversableLike$$Lambda$151/0x00000008002d6840.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.next(InMemoryRelation.scala:114)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.next(InMemoryRelation.scala:80)
	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder$$anon$2.next(InMemoryRelation.scala:290)
	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder$$anon$2.next(InMemoryRelation.scala:287)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:224)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:302)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1597)
	at org.apache.spark.storage.BlockManager$$Lambda$3380/0x0000000801498040.apply(Unknown Source)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1524)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1588)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1389)
	at org.apache.spark.storage.BlockManager.getOrElseUpdateRDDBlock(BlockManager.scala:1343)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:379)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)

Driver stacktrace:

In [9]:
// Print the schema of the joined dataset (side-effecting call, hence the explicit parentheses).
// NOTE(review): joinedDF is declared inside the withUiLabels { ... } block of the join cell,
// so this only compiles if a top-level joinedDF exists from another cell/run — confirm.
joinedDF.printSchema()

root
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- feature_departure_hour_rounded_cos: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- feature_departure_hour_rounded_sin: double (nullable = true)
 |-- feature_arrival_time_period: string (nullable = true)
 |-- feature_flight_week_of_year_cos: double (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- feature_flight_week_of_year_sin: double (nullable = true)
 |-- feature_departure_time_period: string (nullable = true)
 |-- ARR_DELAY_NEW: double (nullable = true)
 |-- D1: integer (nullable = true)
 |-- D2_15: integer (nullable = true)
 |-- D2_30: integer (nullable = true)
 |-- D2_45: integer (nullable = true)
 |-- D2_60: integer (nullable = true)
 |-- D2_90: integer (nullable = true)
 |-- D3: integer (nullable = true)
 |-- D4: integer (nullable = true)
 |-- UTC_FL_DATE: date (nullable