# Data Jointure V2

## Configuration

In [1]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Starting download from file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar
Finished download of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [2]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader, ExperimentConfig}
import com.flightdelay.data.loaders.FlightDataLoader

//Env Configuration
val args: Array[String] = Array("jupyter")

val spark = SparkSession.builder()
  .config(sc.getConf)
  .getOrCreate()

// Rendre la session Spark implicite
implicit val session = spark
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)
implicit val experiment: ExperimentConfig = configuration.experiments(0)

args = Array(jupyter)
spark = org.apache.spark.sql.SparkSession@53a15a09
session = org.apache.spark.sql.SparkSession@53a15a09
configuration = AppConfiguration(local,CommonConfig(42,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/20101*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output...


AppConfiguration(local,CommonConfig(42,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/20101*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output...

## Chargement des données

In [3]:
val flightDFPath = s"${configuration.common.output.basePath}/common/data/processed_flights.parquet"
val flightDF = spark.read.parquet(flightDFPath)

println("Flight DF Count: ", flightDF.count())

(Flight DF Count: ,1928396)


flightDFPath = /home/jovyan/work/output/common/data/processed_flights.parquet
flightDF = [DEST_WBAN: string, ORIGIN_WBAN: string ... 118 more fields]


[DEST_WBAN: string, ORIGIN_WBAN: string ... 118 more fields]

In [4]:
val weatherDFPath = s"${configuration.common.output.basePath}/common/data/processed_weather.parquet"
val weatherDF = spark.read.parquet(weatherDFPath)

println("Weather DF Count: ", weatherDF.count())

(Weather DF Count: ,755247)


weatherDFPath = /home/jovyan/work/output/common/data/processed_weather.parquet
weatherDF = [WBAN: string, Date: date ... 92 more fields]


[WBAN: string, Date: date ... 92 more fields]

## Nettoyage

### On garde les vols pour lesquels nous avons de la météo à l'aéroport de départ et d'arrivée 

In [5]:
// Spark 3.5.3 / Scala 2.12
import org.apache.spark.sql.functions._

/** 1) WBAN valides côté météo (distinct, non nuls, nettoyés) */
val weatherStations = weatherDF
  .select(trim(col("WBAN")).as("WBAN"))
  .where(col("WBAN").isNotNull && length(col("WBAN")) > 0)
  .distinct()
  .repartition(200)        // optionnel: ajuste selon cluster
  .cache()

/** 2) Prépare les colonnes WBAN côté vols (nettoyage basique) */
val flightsWBAN = flightDF
  .withColumn("ORIGIN_WBAN", trim(col("ORIGIN_WBAN")))
  .withColumn("DEST_WBAN",   trim(col("DEST_WBAN")))

/** 3) Comptage avant filtrage (optionnel) */
val countBefore = flightsWBAN.count()

/** 4) Garde uniquement les vols dont ORIGIN_WBAN existe dans la météo */
val originStations = weatherStations
  .select(col("WBAN").as("ORIGIN_WBAN"))

val flightsHasOrigin = flightsWBAN
  .join(originStations, Seq("ORIGIN_WBAN"), "left_semi")

/** 5) Puis garde uniquement ceux dont DEST_WBAN existe aussi */
val destStations = weatherStations
  .select(col("WBAN").as("DEST_WBAN"))

val flightDF_filtered = flightsHasOrigin
  .join(destStations, Seq("DEST_WBAN"), "left_semi")
  .cache()

/** 6) Comptage après filtrage et petit bilan */
val countAfter  = flightDF_filtered.count()
println(s"[WBAN filter] Flights before: $countBefore, after: $countAfter, removed: ${countBefore - countAfter}")

/** 7) (Optionnel) Sauvegarde */
val outPath = s"${configuration.common.output.basePath}/common/data/flight_filtered_has_weather.parquet"
flightDF_filtered
  .repartition(400)        // optionnel: selon volume/cluster
  .write.mode("overwrite")
  .parquet(outPath)
println(s"Saved filtered flights to: $outPath")

[WBAN filter] Flights before: 1928396, after: 1928396, removed: 0
Saved filtered flights to: /home/jovyan/work/output/common/data/flight_filtered_has_weather.parquet


weatherStations = [WBAN: string]
flightsWBAN = [DEST_WBAN: string, ORIGIN_WBAN: string ... 118 more fields]
countBefore = 1928396
originStations = [ORIGIN_WBAN: string]
flightsHasOrigin = [ORIGIN_WBAN: string, DEST_WBAN: string ... 118 more fields]
destStations = [DEST_WBAN: string]
flightDF_filtered = [DEST_WBAN: string, ORIGIN_WBAN: string ... 118 more fields]
countAfter = 1928396
outPath = /home/jovyan/work/output/common/data/flight_filtered_has_weather.parquet


/home/jovyan/work/output/common/data/flight_filtered_has_weather.parquet

### On garde la météo qui est associé à un aéroport

In [6]:
// Spark 3.5.3 / Scala 2.12
import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrame

/** 1) WBAN référencés par au moins un vol (origine ou destination) */
val flightWBANs: DataFrame =
  flightDF
    .select(trim(col("ORIGIN_WBAN")).as("WBAN"))
    .where(col("WBAN").isNotNull && length(col("WBAN")) > 0)
    .unionByName(
      flightDF
        .select(trim(col("DEST_WBAN")).as("WBAN"))
        .where(col("WBAN").isNotNull && length(col("WBAN")) > 0)
    )
    .distinct()
    .repartition(200) // optionnel selon cluster/volume
    .cache()

/** 2) Comptage avant (optionnel) */
val weatherBefore = weatherDF.count()

/** 3) Filtrer la météo: garder uniquement les WBAN utilisés par les vols */
val weatherDF_pruned =
  weatherDF
    .withColumn("WBAN", trim(col("WBAN")))
    .where(col("WBAN").isNotNull && length(col("WBAN")) > 0)
    .join(flightWBANs, Seq("WBAN"), "left_semi")
    .cache()

/** 4) Comptage après et bilan */
val weatherAfter = weatherDF_pruned.count()
println(s"[Weather prune] Weather rows before: $weatherBefore, after: $weatherAfter, removed: ${weatherBefore - weatherAfter}")

/** 5) (Optionnel) Sauvegarde */
val outPathWeather = s"${configuration.common.output.basePath}/common/data/weather_pruned_by_flights.parquet"
weatherDF_pruned
  .repartition(400) // optionnel
  .write.mode("overwrite")
  .parquet(outPathWeather)
println(s"Saved pruned weather to: $outPathWeather")

[Weather prune] Weather rows before: 755247, after: 755247, removed: 0
Saved pruned weather to: /home/jovyan/work/output/common/data/weather_pruned_by_flights.parquet


flightWBANs = [WBAN: string]
weatherBefore = 755247
weatherDF_pruned = [WBAN: string, Date: date ... 92 more fields]
weatherAfter = 755247
outPathWeather = /home/jovyan/work/output/common/data/weather_pruned_by_flights.parquet


/home/jovyan/work/output/common/data/weather_pruned_by_flights.parquet

## Restreindre les vols aux mois couverts par la météo

In [7]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

val flightsCoveredMonths =
  flightDF_filtered
    .withColumn("month_utc", date_format(col("UTC_FL_DATE"), "yyyy-MM"))

val weatherMonths =
  weatherDF_pruned
    .withColumn("month_utc", date_format(col("Date"), "yyyy-MM"))
    .select("month_utc").distinct()

val flightDF_mCovered =
  flightsCoveredMonths.join(weatherMonths, Seq("month_utc"), "left_semi")

val kept = flightDF_mCovered.count()
val total = flightDF_filtered.count()
println(f"[Step1] Flights in covered months: $kept / $total (${kept.toDouble * 100.0 / total}%.2f%%)")

[Step1] Flights in covered months: 1928396 / 1928396 (100.00%)


flightsCoveredMonths = [DEST_WBAN: string, ORIGIN_WBAN: string ... 119 more fields]
weatherMonths = [month_utc: string]
flightDF_mCovered = [month_utc: string, DEST_WBAN: string ... 119 more fields]
kept = 1928396
total = 1928396


1928396

## Jointure

In [8]:
val flightData = flightDF_mCovered
val weatherData = weatherDF_pruned

println("Flight for join count ->", flightData.count())
println("Weather for join count ->", weatherData.count())

(Flight for join count ->,1928396)


flightData = [month_utc: string, DEST_WBAN: string ... 119 more fields]
weatherData = [WBAN: string, Date: date ... 92 more fields]


(Weather for join count ->,755247)


[WBAN: string, Date: date ... 92 more fields]

In [9]:
// =====================================================================
// Flight × Weather en DataFrame (Map → Hash partition → Reduce)
// - Fenêtre 12h avant départ (Wo_*) et 12h avant arrivée (Wd_*)
// - Gestion veille via relHour (duplication J et J+1)
// - Un seul job global avec Metrics.withJob
// =====================================================================
import org.apache.spark.sql.{DataFrame, Row, Column}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// ------------------------
// 0) Objet Metrics (inchangé)
// ------------------------
object Metrics {
  def withJob[T](id: String, desc: String)(body: => T): T = {
    val sc = spark.sparkContext
    sc.setJobGroup(id, desc, interruptOnCancel = false)
    val t0 = System.nanoTime()
    val res = body
    val dtMs = (System.nanoTime() - t0) / 1e6
    println(f"[METRIC][$id] $desc took ${dtMs}%.2f ms")
    sc.clearJobGroup()
    res
  }
  import org.apache.spark.storage.StorageLevel
  def persistCount(id: String, desc: String, rdd: org.apache.spark.rdd.RDD[_], lvl: StorageLevel = StorageLevel.MEMORY_ONLY): Long = {
    rdd.persist(lvl); val n = withJob(id, desc) { rdd.count() }
    println(s"[METRIC][$id] records=$n partitions=${rdd.getNumPartitions}"); n
  }
  def persistCountDF(id: String, desc: String, df: DataFrame, lvl: StorageLevel = StorageLevel.MEMORY_ONLY): Long = {
    df.persist(lvl); val n = withJob(id, desc) { df.count() }
    println(s"[METRIC][$id] records=$n partitions=${df.rdd.getNumPartitions}"); n
  }
}

// =====================================================================
// 1) Jointure complète exécutée sous un SEUL job
// =====================================================================


// ------------------------
// Paramètres de partitions "reducers"
// ------------------------

Metrics.withJob("flight-joins-df", "Build flights joins DataFrame") {
    val cores = spark.sparkContext.defaultParallelism
    def pickParts(mult: Double, minAbs: Int, maxAbs: Int): Int =
    math.min(maxAbs, math.max(minAbs, math.round(cores * mult).toInt))
    
    val numPartsOrigin = pickParts(3.3, 32, 128) // ≈ 40
    val numPartsDest   = pickParts(5.2, 48, 192) // ≈ 64
    spark.conf.set("spark.sql.shuffle.partitions", numPartsDest) // borne haute DF
    
    // ------------------------
    // Sélection des colonnes utiles (Map)
    // ------------------------
    //flightDF_filtered
    val flights = flightDF_mCovered.select(
    col("feature_flight_unique_id").as("flightId"),
    col("ORIGIN_WBAN"), col("DEST_WBAN"),
    col("UTC_FL_DATE").cast(DateType).as("DEP_DATE"),
    col("UTC_ARR_DATE").cast(DateType).as("ARR_DATE"),
    col("UTC_CRS_DEP_TIME").as("DEP_TIME_HHMM"),
    col("UTC_ARR_TIME").as("ARR_TIME_HHMM"),
    col("*")
    )
    
    val weather = weatherDF_pruned.select(
    trim(col("WBAN")).as("WBAN"),
    col("Date").cast(DateType).as("WDATE"),
    col("Time").as("WTIME_HHMM"),
    col("Visibility").as("vis"),
    col("WindSpeed").as("ws"),
    col("WindDirection").as("wd"),
    col("DryBulbCelsius").as("tempC"),
    col("SeaLevelPressure").as("slp"),
    col("HourlyPrecip").as("precip"),
    col("feature_weather_severity_index"),
    col("feature_flight_category_ordinal")
    ).where(col("WBAN").isNotNull && length(col("WBAN")) > 0 && col("WDATE").isNotNull)
    
    // ------------------------
    // Utilitaire HHMM -> hour [0..23] (sans UDF) (Map)
    // ------------------------
    def hhmmHourCol(c: Column): Column = {
    val s  = regexp_replace(c.cast("string"), ":", "")
    val p4 = lpad(s, 4, "0")
    (substring(p4, 1, 2).cast("int") % 24)
    }
    
    // ------------------------
    // Préparation météo avec relHour + duplication J/J+1 (Map)
    // ------------------------
    val weatherWithHour = weather
    .withColumn("hour", hhmmHourCol(col("WTIME_HHMM")))
    .na.fill(Map("hour" -> -1))
    
    val meteoSameDay = weatherWithHour
    .withColumn("relHour", col("hour"))
    .withColumn("DATE", col("WDATE"))
    
    val meteoNextDay = weatherWithHour
    .withColumn("relHour", col("hour") - lit(24))
    .withColumn("DATE", date_add(col("WDATE"), 1))
    
    val weatherRel = meteoSameDay.unionByName(meteoNextDay)
    .filter(col("relHour").between(-24, 23))
    
    // ------------------------
    // Reduce météo par clé → Map relHour -> struct (Reduce)
    // ------------------------
    val weatherStruct =
    struct(
      col("relHour").as("hour"),
      col("WBAN"), col("WDATE"), col("WTIME_HHMM"),
      col("vis"), col("ws"), col("wd"),
      col("tempC"), col("slp"), col("precip"),
      col("feature_weather_severity_index"),
      col("feature_flight_category_ordinal")
    )
    
    val weatherByKey: DataFrame =
    weatherRel
      .groupBy(col("WBAN"), col("DATE"))
      .agg(
        map_from_entries(
          collect_list(struct(col("relHour"), weatherStruct))
        ).as("wmap")
      )
    // (WBAN, DATE, wmap: map<int, struct{hour,WBAN,WDATE,WTIME_HHMM,...}>)
    
    // ------------------------
    // JOIN #1 — ORIGIN (Partition = hash(ORIGIN_WBAN, DEP_DATE))
    // ------------------------
    val flightsDep = flights
    .withColumn("depHour", coalesce(hhmmHourCol(col("DEP_TIME_HHMM")), lit(0)))
    
    val originPre = flightsDep
    .repartition(numPartsOrigin, col("ORIGIN_WBAN"), col("DEP_DATE")) // <-- Hash partition explicite
    .join(
      weatherByKey.hint("shuffle_hash"),
      col("ORIGIN_WBAN") === weatherByKey("WBAN") &&
      col("DEP_DATE")    === weatherByKey("DATE"),
      "left"
    )
    .drop(weatherByKey("WBAN")).drop(weatherByKey("DATE"))
    
    val originWithWoArr = originPre
    .withColumn("Wo", expr("transform(sequence(1, 12), i -> element_at(wmap, depHour - i))"))
    .drop("wmap")
    
    val woCols = (0 until 12).map(i => col("Wo").getItem(i).as(s"Wo_h${i+1}"))
    val originDF = originWithWoArr
    .select(col("*") +: woCols: _*)
    .drop("Wo")
    .persist()
    
    // ------------------------
    // JOIN #2 — DEST (Partition = hash(DEST_WBAN, ARR_DATE))
    // ------------------------
    val flightsArr = originDF
    .withColumn("arrHour", coalesce(hhmmHourCol(col("ARR_TIME_HHMM")), lit(0)))
    
    val destPre = flightsArr
    .repartition(numPartsDest, col("DEST_WBAN"), col("ARR_DATE"))     // <-- Hash partition explicite
    .join(
      weatherByKey.hint("shuffle_hash"),
      col("DEST_WBAN") === weatherByKey("WBAN") &&
      col("ARR_DATE")  === weatherByKey("DATE"),
      "left"
    )
    .drop(weatherByKey("WBAN")).drop(weatherByKey("DATE"))
    
    val destWithWdArr = destPre
    .withColumn("Wd", expr("transform(sequence(1, 12), i -> element_at(wmap, arrHour - i))"))
    .drop("wmap")
    
    val wdCols = (0 until 12).map(i => col("Wd").getItem(i).as(s"Wd_h${i+1}"))
    val joinedDF = destWithWdArr
    .select(col("*") +: wdCols: _*)
    .drop("Wd")
    .persist()
    
    // ------------------------
    // Action finale (déclenche l'exécution)
    // ------------------------
    println(s"Rows after ORIGIN join: ${originDF.count()}, rows after DEST join: ${joinedDF.count()}")
    
}

defined object Metrics


Rows after ORIGIN join: 3908461, rows after DEST join: 3908461
[METRIC][flight-joins-df] Build flights joins DataFrame took 67511.60 ms


In [10]:
joinedDF.show(1, 1000, true)

Unknown Error: <console>:66: error: not found: value joinedDF
       joinedDF.show(1, 1000, true)
       ^
