# Flights DataLoader

In [1]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Starting download from file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar
Finished download of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [2]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader}
import com.flightdelay.data.loaders.FlightDataLoader

// Environment configuration: arguments handed to the configuration loader below.
val args: Array[String] = Array("juniper")

// Build (or reuse) a SparkSession from the configuration of the existing SparkContext `sc`.
val spark = SparkSession.builder()
  .config(sc.getConf)
  .getOrCreate()

// Make the Spark session available implicitly.
// Implicit definitions carry an explicit type so that implicit resolution does not
// depend on type inference (and so the cell stays valid under Scala 3 rules).
implicit val session: SparkSession = spark
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)

// Test: load the flight data.
// NOTE(review): the loader presumably picks up the implicit session/configuration
// declared above, and the meaning of the `false` flag is defined by
// FlightDataLoader.loadFromConfiguration -- confirm against its signature.
val flightData = FlightDataLoader.loadFromConfiguration(false)



----------------------------------------------------------------------------------------------------------
--> [FlightDataLoader] Flight Data Loading - Start ...
--> 486133 loaded ...
root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- ARR_DELAY_NEW: double (nullable = true)
 |-- CANCELLED: double (nullable = true)
 |-- DIVERTED: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- WEATHER_DELAY: double (nullable = true)
 |-- NAS_DELAY: double (nullable = true)

+----------+---------------------+-----------------+-----------------+---------------+------------+-------------+---------+--------+----------------+-------------+---------+
|   FL_DATE|OP_CARRIER_AIRLINE_ID|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_ID|DEST_AIRPORT_ID|CRS_

args = Array(local)
spark = org.apache.spark.sql.SparkSession@46d0cef9
session = org.apache.spark.sql.SparkSession@46d0cef9
configuration = AppConfiguration(local,DataConfig(/data,FileConfig(/data/FLIGHT-3Y/Flights/201201.csv),FileConfig(/data/FLIGHT-3Y/Weather/201201hourly.txt),FileConfig(/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/output,FileConfig(/output/data),FileConfig(/output/model)))
flightFilePath = ../data/FLIGHT-3Y/Flights/201201.csv


flightData: org.apache.spark.sql.Dat...


../data/FLIGHT-3Y/Flights/201201.csv

In [3]:
import com.flightdelay.data.utils.DataQualityMetrics

// Compute the per-column quality metrics of the loaded flight data.
val flightDataMetrics = DataQualityMetrics.metrics(flightData)
// Disable truncation: with the default show(), cell values longer than 20 characters
// (e.g. the column name OP_CARRIER_AIRLINE_ID) are cut to "OP_CARRIER_AIRLIN...".
flightDataMetrics.show(false)

+--------------------+-----------+-------+----------+----------------+
|                name|   origType|colType| compRatio|nbDistinctValues|
+--------------------+-----------+-------+----------+----------------+
|             FL_DATE|   DateType|   date|       1.0|              31|
|OP_CARRIER_AIRLIN...|IntegerType|numeric|       1.0|              15|
|   OP_CARRIER_FL_NUM|IntegerType|numeric|       1.0|            6237|
|   ORIGIN_AIRPORT_ID|IntegerType|numeric|       1.0|             287|
|     DEST_AIRPORT_ID|IntegerType|numeric|       1.0|             287|
|        CRS_DEP_TIME|IntegerType|numeric|       1.0|            1153|
|       ARR_DELAY_NEW| DoubleType|numeric| 0.9833214|             549|
|           CANCELLED| DoubleType|numeric|       1.0|               2|
|            DIVERTED| DoubleType|numeric|       1.0|               2|
|    CRS_ELAPSED_TIME| DoubleType|numeric|       1.0|             419|
|       WEATHER_DELAY| DoubleType|numeric|0.14586131|             288|
|     

flightDataMetrics = [name: string, origType: string ... 3 more fields]


[name: string, origType: string ... 3 more fields]

In [7]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

/**
 * Prints diagnostics explaining where the NULL values of `ARR_DELAY_NEW` come from.
 *
 * Output, in order:
 *  - the total row count;
 *  - the number of NULL / non-NULL `ARR_DELAY_NEW` values and the NULL rate;
 *  - how many NULLs are "expected" (flight cancelled or diverted), how many are
 *    unexpected, and how many cancelled/diverted flights still carry a value;
 *  - the NULL rate per (`CANCELLED`, `DIVERTED`) combination, highest rate first.
 *
 * The "expected" flag uses null-safe equality, so a row whose `CANCELLED` or
 * `DIVERTED` is itself NULL is classified as "not expected" instead of silently
 * dropping out of every counter; `null_expected + null_unexpected` therefore
 * always equals the total number of NULLs.
 *
 * @param df DataFrame that must contain the columns `ARR_DELAY_NEW`, `CANCELLED`
 *           and `DIVERTED`
 */
def explainArrDelayNewNulls(df: DataFrame): Unit = {
  // Counts the rows satisfying `condition` by summing a 0/1 indicator.
  def countWhere(condition: org.apache.spark.sql.Column): org.apache.spark.sql.Column =
    sum(when(condition, 1).otherwise(0))

  val arrDelay = col("ARR_DELAY_NEW")

  // 0) Size and global NULL counts
  val total = df.count()
  println(s"Total rows: $total")

  df.select(
      countWhere(arrDelay.isNull).as("nulls"),
      countWhere(arrDelay.isNotNull).as("non_nulls")
    )
    .withColumn("null_rate", col("nulls") / lit(total))
    .show(false)

  // 1) Working hypothesis (US BTS dataset):
  //    ARR_DELAY_NEW is NULL when the flight is cancelled (CANCELLED=1) or diverted (DIVERTED=1).
  //    `<=>` is null-safe: a NULL flag compares as false rather than NULL, so the
  //    negation below is always a definite true/false.
  val expectedNull = (col("CANCELLED") <=> lit(1.0)) || (col("DIVERTED") <=> lit(1.0))

  df.select(
      countWhere(expectedNull && arrDelay.isNull).as("null_expected"),
      countWhere(!expectedNull && arrDelay.isNull).as("null_unexpected"),
      countWhere(expectedNull && arrDelay.isNotNull).as("notnull_unexpected")
    )
    .withColumn("null_expected_rate", col("null_expected") / lit(total))
    .withColumn("null_unexpected_rate", col("null_unexpected") / lit(total))
    .withColumn("notnull_unexpected_rate", col("notnull_unexpected") / lit(total))
    .show(false)

  // 2) Distribution of the NULLs per cancelled/diverted status
  df.groupBy(col("CANCELLED"), col("DIVERTED"))
    .agg(
      count(lit(1)).as("rows"),
      countWhere(arrDelay.isNull).as("nulls")
    )
    .withColumn("null_rate", col("nulls") / col("rows"))
    .orderBy(desc("null_rate"))
    .show(false)
}

// Run the ARR_DELAY_NEW NULL diagnostic on the flight data loaded earlier in the notebook.
explainArrDelayNewNulls(flightData)

Total rows: 486133
+-----+---------+--------------------+
|nulls|non_nulls|null_rate           |
+-----+---------+--------------------+
|8108 |478025   |0.016678563273836582|
+-----+---------+--------------------+

+-------------+---------------+------------------+--------------------+--------------------+-----------------------+
|null_expected|null_unexpected|notnull_unexpected|null_expected_rate  |null_unexpected_rate|notnull_unexpected_rate|
+-------------+---------------+------------------+--------------------+--------------------+-----------------------+
|8108         |0              |0                 |0.016678563273836582|0.0                 |0.0                    |
+-------------+---------------+------------------+--------------------+--------------------+-----------------------+

+---------+--------+------+-----+---------+
|CANCELLED|DIVERTED|rows  |nulls|null_rate|
+---------+--------+------+-----+---------+
|0.0      |1.0     |1004  |1004 |1.0      |
|1.0      |0.0     |710

lastException = null


explainArrDelayNewNulls: (df: org.apache.spark.sql.DataFrame)Unit


null