# Data Jointure V2

## Configuration

In [2]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Starting download from file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar
Finished download of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [3]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader, ExperimentConfig}
import com.flightdelay.data.loaders.FlightDataLoader

//Env Configuration
val args: Array[String] = Array("jupyter")

val spark = SparkSession.builder()
  .config(sc.getConf)
  .getOrCreate()

// Rendre la session Spark implicite
implicit val session = spark
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)
implicit val experiment: ExperimentConfig = configuration.experiments(0)

args = Array(jupyter)
spark = org.apache.spark.sql.SparkSession@9ae9e27
session = org.apache.spark.sql.SparkSession@9ae9e27
configuration = AppConfiguration(local,CommonConfig(42,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/20101*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/d...


AppConfiguration(local,CommonConfig(42,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/201201*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/20101*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/d...

## Chargement des données

In [4]:
val flightDFPath = s"${configuration.common.output.basePath}/common/data/processed_flights.parquet"
val flightDF = spark.read.parquet(flightDFPath)

println("Flight DF Count: ", flightDF.count())

(Flight DF Count: ,461369)


flightDFPath = /home/jovyan/work/output/common/data/processed_flights.parquet
flightDF = [FL_DATE: date, OP_CARRIER_AIRLINE_ID: int ... 135 more fields]


[FL_DATE: date, OP_CARRIER_AIRLINE_ID: int ... 135 more fields]

In [5]:
flightDF.printSchema

root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- ARR_DELAY_NEW: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- WEATHER_DELAY: double (nullable = true)
 |-- NAS_DELAY: double (nullable = true)
 |-- D4: integer (nullable = true)
 |-- D3: integer (nullable = true)
 |-- D1: integer (nullable = true)
 |-- D2_15: integer (nullable = true)
 |-- D2_30: integer (nullable = true)
 |-- D2_45: integer (nullable = true)
 |-- D2_60: integer (nullable = true)
 |-- D2_90: integer (nullable = true)
 |-- ORIGIN_WBAN: string (nullable = true)
 |-- ORIGIN_TIMEZONE: integer (nullable = true)
 |-- DEST_WBAN: string (nullable = true)
 |-- DEST_TIMEZONE: integer (nullable = true)
 |-- UTC_CRS_DEP_TIME: string (nullable = true)
 |-- UTC_AR

In [6]:
val weatherDFPath = s"${configuration.common.output.basePath}/common/data/processed_weather.parquet"
val weatherDF = spark.read.parquet(weatherDFPath)

println("Weather DF Count: ", weatherDF.count())

(Weather DF Count: ,1414384)


weatherDFPath = /home/jovyan/work/output/common/data/processed_weather.parquet
weatherDF = [WBAN: string, Date: date ... 92 more fields]


[WBAN: string, Date: date ... 92 more fields]

In [9]:
weatherDF.printSchema

root
 |-- WBAN: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Time: string (nullable = true)
 |-- StationType: integer (nullable = true)
 |-- SkyCondition: string (nullable = true)
 |-- SkyConditionFlag: string (nullable = true)
 |-- Visibility: double (nullable = true)
 |-- VisibilityFlag: string (nullable = true)
 |-- WeatherType: string (nullable = true)
 |-- WeatherTypeFlag: string (nullable = true)
 |-- DryBulbFarenheit: double (nullable = true)
 |-- DryBulbFarenheitFlag: string (nullable = true)
 |-- DryBulbCelsius: double (nullable = true)
 |-- DryBulbCelsiusFlag: string (nullable = true)
 |-- WetBulbFarenheit: double (nullable = true)
 |-- WetBulbFarenheitFlag: string (nullable = true)
 |-- WetBulbCelsius: double (nullable = true)
 |-- WetBulbCelsiusFlag: string (nullable = true)
 |-- DewPointFarenheit: double (nullable = true)
 |-- DewPointFarenheitFlag: string (nullable = true)
 |-- DewPointCelsius: double (nullable = true)
 |-- DewPointCelsiusFlag: string (

## Claude - RDD Implemntation

In [16]:
import org.apache.spark.sql.{DataFrame, SparkSession, Row}
import org.apache.spark.sql.types._
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner}
import java.sql.Date
import scala.collection.mutable

// ============================================
// PARTITIONNEUR (inchangé)
// ============================================

class AirportDatePartitioner(val numPartitions: Int) extends Partitioner with Serializable {
  
  override def getPartition(key: Any): Int = key match {
    case (wban: String, date: String, tag: String) =>
      val hash = wban.hashCode + date.hashCode
      Math.abs(hash) % numPartitions
    case _ =>
      throw new IllegalArgumentException(s"Unexpected key type: ${key.getClass}")
  }
  
  override def equals(other: Any): Boolean = other match {
    case p: AirportDatePartitioner => p.numPartitions == numPartitions
    case _ => false
  }
  
  override def hashCode: Int = numPartitions
}

// ============================================
// UTILS (inchangé)
// ============================================

object JoinUtils extends Serializable {
  
  def needsPreviousDayData(timeStr: String): Boolean = {
    try {
      if (timeStr == null) return false
      
      val hour = if (timeStr.contains(":")) {
        timeStr.split(":")(0).toInt
      } else if (timeStr.length >= 2) {
        timeStr.take(2).toInt
      } else {
        return false
      }
      hour < 12
    } catch {
      case _: Exception => false
    }
  }
  
  def addDays(date: Date, days: Int): Date = {
    val cal = java.util.Calendar.getInstance()
    cal.setTime(date)
    cal.add(java.util.Calendar.DAY_OF_MONTH, days)
    new Date(cal.getTimeInMillis)
  }
  
  def parseWeatherTimestamp(weatherRow: Row): Long = {
    try {
      val date = weatherRow.getAs[Date]("Date")
      val time = weatherRow.getAs[String]("Time")
      
      if (date == null || time == null) return 0L
      
      val hourMinute = if (time.contains(":")) {
        time.split(":")
      } else if (time.length == 4) {
        Array(time.take(2), time.drop(2))
      } else {
        return 0L
      }
      
      val cal = java.util.Calendar.getInstance()
      cal.setTime(date)
      cal.set(java.util.Calendar.HOUR_OF_DAY, hourMinute(0).toInt)
      cal.set(java.util.Calendar.MINUTE, hourMinute(1).toInt)
      cal.set(java.util.Calendar.SECOND, 0)
      cal.set(java.util.Calendar.MILLISECOND, 0)
      
      cal.getTimeInMillis
    } catch {
      case _: Exception => 0L
    }
  }
  
  def parseFlightTimestamp(date: Date, time: String): Long = {
    try {
      if (date == null || time == null) return 0L
      
      val hourMinute = if (time.contains(":")) {
        time.split(":")
      } else if (time.length == 4) {
        Array(time.take(2), time.drop(2))
      } else {
        return 0L
      }
      
      val cal = java.util.Calendar.getInstance()
      cal.setTime(date)
      cal.set(java.util.Calendar.HOUR_OF_DAY, hourMinute(0).toInt)
      cal.set(java.util.Calendar.MINUTE, hourMinute(1).toInt)
      cal.set(java.util.Calendar.SECOND, 0)
      cal.set(java.util.Calendar.MILLISECOND, 0)
      
      cal.getTimeInMillis
    } catch {
      case _: Exception => 0L
    }
  }
  
  def extract12HourlyObservations(
    weatherBuffer: mutable.TreeMap[Long, Row],
    scheduledTimestamp: Long
  ): Array[Row] = {
    
    val observations = new Array[Row](12)
    
    for (i <- 0 until 12) {
      val targetTimestamp = scheduledTimestamp - (i * 3600000L)
      val closestEntry = weatherBuffer.to(targetTimestamp).lastOption
      
      closestEntry match {
        case Some((_, weatherRow)) => observations(i) = weatherRow
        case None => observations(i) = null
      }
    }
    
    observations
  }
}

// ============================================
// CRÉATION DE SCHÉMAS POUR LES ROWS JOINTES
// ============================================

object SchemaUtils {
  
  def createOriginJoinedSchema(flightSchema: StructType): StructType = {
    // Ajouter un champ pour les observations météo d'origine
    flightSchema.add("origin_weather_observations", 
      ArrayType(StructType(Seq(
        StructField("weather_data", StringType, nullable = true)
      )))
    )
  }
  
  def createFinalJoinedSchema(originJoinedSchema: StructType): StructType = {
    // Ajouter un champ pour les observations météo de destination
    originJoinedSchema.add("dest_weather_observations",
      ArrayType(StructType(Seq(
        StructField("weather_data", StringType, nullable = true)
      )))
    )
  }
}

// ============================================
// PHASE MAP - Origin (avec identifiant unique)
// ============================================

def mapPhaseOrigin(flightDF: DataFrame, weatherDF: DataFrame): RDD[((String, String, String), (Row, String))] = {
  
  val weatherMapped = weatherDF.rdd.map { row =>
    val wban = row.getAs[String]("WBAN")
    val date = row.getAs[Date]("Date").toString
    ((wban, date, "OT"), (row, "weather"))
  }
  
  // IMPORTANT: Ajouter un identifiant unique à chaque vol
  val flightMapped = flightDF.rdd.flatMap { row =>
    val originWban = row.getAs[String]("ORIGIN_WBAN")
    val flightDate = row.getAs[Date]("UTC_FL_DATE")
    val depTime = row.getAs[String]("UTC_CRS_DEP_TIME")
    
    // Créer un ID unique basé sur les colonnes du vol
    val flightId = s"${row.getAs[String]("feature_flight_unique_id")}"
    
    val results = mutable.ArrayBuffer[((String, String, String), (Row, String))]()
    
    results += (((originWban, flightDate.toString, "FT"), (row, flightId)))
    
    if (JoinUtils.needsPreviousDayData(depTime)) {
      val prevDate = JoinUtils.addDays(flightDate, -1)
      results += (((originWban, prevDate.toString, "FT"), (row, flightId)))
    }
    
    results
  }
  
  weatherMapped.union(flightMapped)
}

// ============================================
// PHASE REDUCE - Origin (avec déduplication)
// ============================================

def reducePhaseOrigin(sortedRDD: RDD[((String, String, String), (Row, String))]): RDD[(Row, StructType)] = {
  
  sortedRDD.mapPartitions { partition =>
    
    val results = mutable.ArrayBuffer[(Row, StructType)]()
    
    val grouped = partition.toSeq.groupBy { case ((wban, date, _), _) =>
      (wban, date)
    }
    
    grouped.foreach { case ((wban, date), records) =>
      
      val weatherBuffer = mutable.TreeMap[Long, Row]()
      val flightMap = mutable.Map[String, Row]()  // Map pour déduplication
      
      records.foreach { case ((_, _, tag), (row, idOrType)) =>
        if (tag == "OT") {
          val timestamp = JoinUtils.parseWeatherTimestamp(row)
          if (timestamp > 0) {
            weatherBuffer.put(timestamp, row)
          }
        } else {
          // Stocker le vol avec son ID unique
          flightMap.put(idOrType, row)
        }
      }
      
      // Traiter chaque vol unique
      flightMap.values.foreach { flightRow =>
        val flightDate = flightRow.getAs[Date]("UTC_FL_DATE")
        val depTime = flightRow.getAs[String]("UTC_CRS_DEP_TIME")
        val depTimestamp = JoinUtils.parseFlightTimestamp(flightDate, depTime)
        
        val weatherObservations = JoinUtils.extract12HourlyObservations(
          weatherBuffer, 
          depTimestamp
        )
        
        // Créer une Row avec schéma
        val flightSeq = flightRow.toSeq
        val weatherSeq = weatherObservations.toSeq
        
        // Conserver le schéma original
        val originalSchema = flightRow.schema
        
        // Créer la nouvelle Row avec les données
        val joinedSeq = flightSeq :+ weatherSeq
        val joinedRow = Row.fromSeq(joinedSeq)
        
        results += ((joinedRow, originalSchema))
      }
    }
    
    results.iterator
  }
}

// ============================================
// PHASE MAP - Destination (avec accès par index)
// ============================================

def mapPhaseDestination(
  firstJoinResultRDD: RDD[(Row, StructType)], 
  weatherDF: DataFrame
): RDD[((String, String, String), (Row, StructType, String))] = {
  
  val weatherMapped = weatherDF.rdd.map { row =>
    val wban = row.getAs[String]("WBAN")
    val date = row.getAs[Date]("Date").toString
    ((wban, date, "OT"), (row, weatherDF.schema, "weather"))
  }
  
  val joinedMapped = firstJoinResultRDD.flatMap { case (row, schema) =>
    // Utiliser l'index pour accéder aux champs
    val destWbanIdx = schema.fieldIndex("DEST_WBAN")
    val arrDateIdx = schema.fieldIndex("UTC_ARR_DATE")
    val arrTimeIdx = schema.fieldIndex("UTC_ARR_TIME")
    val flightIdIdx = schema.fieldIndex("feature_flight_unique_id")
    
    val destWban = row.getString(destWbanIdx)
    val arrDate = row.getDate(arrDateIdx)
    val arrTime = row.getString(arrTimeIdx)
    val flightId = row.getString(flightIdIdx)
    
    val results = mutable.ArrayBuffer[((String, String, String), (Row, StructType, String))]()
    
    results += (((destWban, arrDate.toString, "FT"), (row, schema, flightId)))
    
    if (JoinUtils.needsPreviousDayData(arrTime)) {
      val prevDate = JoinUtils.addDays(arrDate, -1)
      results += (((destWban, prevDate.toString, "FT"), (row, schema, flightId)))
    }
    
    results
  }
  
  weatherMapped.union(joinedMapped)
}

// ============================================
// PHASE REDUCE - Destination (avec déduplication)
// ============================================

def reducePhaseDestination(
  sortedRDD: RDD[((String, String, String), (Row, StructType, String))]
): RDD[(Row, StructType)] = {
  
  sortedRDD.mapPartitions { partition =>
    
    val results = mutable.ArrayBuffer[(Row, StructType)]()
    
    val grouped = partition.toSeq.groupBy { case ((wban, date, _), _) =>
      (wban, date)
    }
    
    grouped.foreach { case ((wban, date), records) =>
      
      val weatherBuffer = mutable.TreeMap[Long, Row]()
      val flightMap = mutable.Map[String, (Row, StructType)]()
      
      records.foreach { case ((_, _, tag), (row, schema, idOrType)) =>
        if (tag == "OT") {
          val timestamp = JoinUtils.parseWeatherTimestamp(row)
          if (timestamp > 0) {
            weatherBuffer.put(timestamp, row)
          }
        } else {
          flightMap.put(idOrType, (row, schema))
        }
      }
      
      flightMap.values.foreach { case (joinedRow, schema) =>
        val arrDateIdx = schema.fieldIndex("UTC_ARR_DATE")
        val arrTimeIdx = schema.fieldIndex("UTC_ARR_TIME")
        
        val arrDate = joinedRow.getDate(arrDateIdx)
        val arrTime = joinedRow.getString(arrTimeIdx)
        val arrTimestamp = JoinUtils.parseFlightTimestamp(arrDate, arrTime)
        
        val weatherObservations = JoinUtils.extract12HourlyObservations(
          weatherBuffer, 
          arrTimestamp
        )
        
        val currentSeq = joinedRow.toSeq
        val destWeatherSeq = weatherObservations.toSeq
        
        val finalSeq = currentSeq :+ destWeatherSeq
        val finalRow = Row.fromSeq(finalSeq)
        
        results += ((finalRow, schema))
      }
    }
    
    results.iterator
  }
}

// ============================================
// PARTITION ET SORT (inchangé)
// ============================================

def partitionAndSort[T](
  mappedRDD: RDD[((String, String, String), T)],
  numPartitions: Int
): RDD[((String, String, String), T)] = {
  
  val partitioner = new AirportDatePartitioner(numPartitions)
  
  implicit val keyOrdering: Ordering[(String, String, String)] = 
    new Ordering[(String, String, String)] {
      def compare(x: (String, String, String), y: (String, String, String)): Int = {
        val cmpWban = x._1.compareTo(y._1)
        if (cmpWban != 0) return cmpWban
        
        val cmpDate = x._2.compareTo(y._2)
        if (cmpDate != 0) return cmpDate
        
        x._3.compareTo(y._3)
      }
    }
  
  mappedRDD.repartitionAndSortWithinPartitions(partitioner)
}

// ============================================
// FONCTION PRINCIPALE
// ============================================

def performImprovedJoin(
  flightDF: DataFrame,
  weatherDF: DataFrame,
  numPartitions: Int = 200
): RDD[(Row, StructType)] = {
  
  println("=== PREMIÈRE JOINTURE (ORIGIN AIRPORT) ===")
  println("Phase MAP...")
  val mappedOrigin = mapPhaseOrigin(flightDF, weatherDF)
  
  println("Phase PARTITION + SORT...")
  val sortedOrigin = partitionAndSort(mappedOrigin, numPartitions)
  
  println("Phase REDUCE...")
  val firstJoinResult = reducePhaseOrigin(sortedOrigin)
  firstJoinResult.cache()
  val count1 = firstJoinResult.count()
  println(s"Nombre de vols après première jointure: $count1")
  
  println("\n=== DEUXIÈME JOINTURE (DESTINATION AIRPORT) ===")
  println("Phase MAP...")
  val mappedDest = mapPhaseDestination(firstJoinResult, weatherDF)
  
  println("Phase PARTITION + SORT...")
  val sortedDest = partitionAndSort(mappedDest, numPartitions)
  
  println("Phase REDUCE...")
  val finalResult = reducePhaseDestination(sortedDest)
  val count2 = finalResult.count()
  println(s"Nombre de vols dans résultat final: $count2")
  
  finalResult
}

lastException = null


Unknown Error: <console>:453: error: value repartitionAndSortWithinPartitions is not a member of org.apache.spark.rdd.RDD[((String, String, String), T)]
         mappedRDD.repartitionAndSortWithinPartitions(partitioner)
                   ^


In [15]:
// Mesurer le temps
val startTime = System.currentTimeMillis()

// Effectuer la jointure
val result = performImprovedJoin(
  flightDF, 
  weatherDF, 
  numPartitions = 200
)

// Forcer l'exécution
val count = result.count()

val endTime = System.currentTimeMillis()
val duration = (endTime - startTime) / 1000.0

println(s"\n=== RÉSULTATS ===")
println(s"Nombre total de vols traités: $count")
println(s"Temps d'exécution: $duration secondes")


=== PREMIÈRE JOINTURE (ORIGIN AIRPORT) ===
Phase MAP...
Phase PARTITION + SORT...
Phase REDUCE...
Nombre de vols après première jointure: 574553

=== DEUXIÈME JOINTURE (DESTINATION AIRPORT) ===
Phase MAP...
Phase PARTITION + SORT...
Phase REDUCE...


org.apache.spark.SparkException: Job aborted due to stage failure: Task 183 in stage 11.0 failed 1 times, most recent failure: Lost task 183.0 in stage 11.0 (TID 265) (jupyter-spark executor driver): org.apache.spark.SparkUnsupportedOperationException: fieldIndex on a Row without schema is undefined.
	at org.apache.spark.sql.errors.DataTypeErrors$.fieldIndexOnRowWithoutSchemaError(DataTypeErrors.scala:269)
	at org.apache.spark.sql.Row.fieldIndex(Row.scala:381)
	at org.apache.spark.sql.Row.fieldIndex$(Row.scala:380)
	at org.apache.spark.sql.catalyst.expressions.GenericRow.fieldIndex(rows.scala:27)
	at org.apache.spark.sql.Row.getAs(Row.scala:372)
	at org.apache.spark.sql.Row.getAs$(Row.scala:372)
	at org.apache.spark.sql.catalyst.expressions.GenericRow.getAs(rows.scala:27)
	at $line58.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.$anonfun$mapPhaseDestination$2(<console>:237)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace: