* Author: Malik Chettih
* Affiliation: EMIASD - Executive Master Intelligence artificielle & science des données
* Email: malik.chettih@dauphine.eu
* Formation Continue Univ. Paris Dauphine, January 2025.

# Flights - Analyse des données

## Part 1 - Prérequis

### 1.1 Global Variables

In [1]:
// Directory (relative path) that contains the flight CSV files.
val path: String = "../data/FLIGHT-3Y/Flights/"

// File name, or glob pattern, appended to `path` when the data is loaded.
// Swap the active definition for one of the commented patterns to load more files.
val fileName: String = "201201.csv"
// val fileName: String = "2012*.csv"
// val fileName: String = "*.csv"

path = ../data/FLIGHT-3Y/Flights/
fileName = 201201.csv


201201.csv

### 1.2 Global parameters

In [2]:
// ---- Column names used by the ML pipeline ----
// Label column name (not referenced by the pipeline builder visible in this notebook section).
val _label: String = "label"
// Prefix prepended to a column name to build the StringIndexer output column name.
val _prefix: String = "indexed_"
// Output column of the VectorAssembler stage.
val _featuresVec: String = "featuresVec"
// Output column of the VectorIndexer stage.
val _featuresVecIndex: String = "features"

// ---- Type tags used by the metadata extraction (returned by whichType) ----
val _text: String = "textType"
val _numeric: String = "numericType"
val _date: String = "dateType"
val _other: String = "otherType"

_label = label
_prefix = indexed_
_featuresVec = featuresVec
_featuresVecIndex = features
_text = textType
_numeric = numericType
_date = dateType
_other = otherType


otherType

### 1.3 Global imports

In [3]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}
import org.apache.spark.ml.evaluation._
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, VectorIndexer}
import  org.apache.spark.ml.Pipeline 

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
//import spark.implicits._

### 1.4 Data Transformation Pipeline

In [4]:
/**
 * Builds (without fitting) a three-stage feature pipeline:
 * StringIndexer -> VectorAssembler -> VectorIndexer.
 *
 * @param textCols      columns to pass through the StringIndexer and use as features
 * @param numericCols   columns fed to the VectorAssembler unchanged
 * @param target        target column; it is indexed to `_prefix + target` but never
 *                      added to the feature vector
 * @param maxCat        value given to `VectorIndexer.setMaxCategories`
 * @param handleInvalid value given to `setHandleInvalid` on all three stages, so it
 *                      must be a policy that each of them accepts
 * @return the unfitted Pipeline; features end up in column `_featuresVecIndex`
 */
def AutoPipeline(textCols: Array[String], numericCols: Array[String], target: String, maxCat: Int, handleInvalid: String): Pipeline = {
  // StringIndexer: index every textual column plus the target.
  val inAttsNames = textCols :+ target
  val outAttsNames = inAttsNames.map(_prefix + _)

  val stringIndexer = new StringIndexer()
    .setInputCols(inAttsNames)
    .setOutputCols(outAttsNames)
    .setHandleInvalid(handleInvalid)

  // Feature columns: indexed text columns (target excluded) followed by the numeric ones.
  // The target is excluded by exact name. A substring test would also discard any
  // feature whose name merely contains the target string.
  val features = textCols.filterNot(_ == target).map(_prefix + _) ++ numericCols

  // VectorAssembler: gather all feature columns into a single vector column.
  val vectorAssembler = new VectorAssembler()
    .setInputCols(features)
    .setOutputCol(_featuresVec)
    .setHandleInvalid(handleInvalid)

  // VectorIndexer: re-index the assembled vector, bounded by maxCat categories.
  val vectorIndexer = new VectorIndexer()
    .setInputCol(_featuresVec)
    .setOutputCol(_featuresVecIndex)
    .setMaxCategories(maxCat)
    .setHandleInvalid(handleInvalid)

  new Pipeline().setStages(Array(stringIndexer, vectorAssembler, vectorIndexer))
}


AutoPipeline: (textCols: Array[String], numericCols: Array[String], target: String, maxCat: Int, handleInvalid: String)org.apache.spark.ml.Pipeline


### 1.5 Data quality metrics collection

In [5]:
/**
 * Quality metrics describing one column of a DataFrame.
 *
 * @param name             column name
 * @param origType         data type string of the column, as listed by `DataFrame.dtypes`
 * @param colType          coarse type tag derived from `origType`
 * @param compRatio        completeness ratio: non-null rows divided by total rows
 * @param nbDistinctValues number of distinct values found in the column
 */
case class MetaData(
  name: String,
  origType: String,
  colType: String,
  compRatio: Float,
  nbDistinctValues: Long
)

/**
 * Maps a Spark data type string (as listed by `DataFrame.dtypes`) to one of the
 * four coarse tags `_text`, `_numeric`, `_date` or `_other`.
 *
 * Every integral, floating-point and decimal type counts as numeric, not just
 * IntegerType and DoubleType; otherwise a column inferred as LongType or
 * DecimalType would be tagged `_other`.
 *
 * @param origType data type string, e.g. "IntegerType" or "DecimalType(10,2)"
 * @return the matching type tag; `_other` for anything not recognised
 */
def whichType(origType: String): String = origType match {
  case "StringType" => _text
  case "ByteType" | "ShortType" | "IntegerType" | "LongType" | "FloatType" | "DoubleType" => _numeric
  // Decimal types carry their precision and scale in the name, e.g. "DecimalType(10,2)".
  case decimal if decimal.startsWith("DecimalType") => _numeric
  case "DateType" => _date
  case _ => _other
}

/**
 * Computes one MetaData row per column of `data`: original type, coarse type tag,
 * completeness ratio and number of distinct values.
 *
 * Two Spark actions run per column (a non-null count and a distinct count), plus one
 * initial row count. The result is persisted and materialised before it is returned.
 * `toDS()` needs the Spark session implicits to be in scope where this is defined.
 *
 * @param data DataFrame to profile
 * @return DataFrame with columns name, origType, colType, compRatio, nbDistinctValues
 */
def MDCompletenessDV(data: DataFrame): DataFrame = {
  val total_count = data.count()
  val res = data.dtypes.map { case (colName, colType) =>
    val nonNullCount = data.filter(col(colName).isNotNull).count()
    // With no rows the division would be 0f / 0 = NaN; report 0 completeness instead.
    val completeness = if (total_count == 0L) 0f else nonNullCount.toFloat / total_count
    // distinct() works on whole rows, so a null value counts as one distinct value.
    val distinctCount = data.select(colName).distinct().count()
    MetaData(colName, colType, whichType(colType), completeness, distinctCount)
  }.toList
  val metadata = res.toDS().toDF()
  // Cache the small result and force its evaluation now.
  metadata.persist()
  metadata.count()
  metadata
}

/**
 * Returns a copy of `metaData` in which the "colType" value is replaced by `colType`
 * on every row whose "name" equals `name`; all other rows keep their current value.
 *
 * @param metaData metadata DataFrame holding "name" and "colType" columns
 * @param name     column name whose type tag is overridden
 * @param colType  new type tag
 * @return the updated DataFrame (the input is not modified)
 */
def SetMDColType(metaData: DataFrame, name: String, colType: String): DataFrame = {
  val isTargetRow = col("name") === name
  val overriddenType = when(isTargetRow, colType).otherwise(col("colType"))
  metaData.withColumn("colType", overriddenType)
}

defined class MetaData


whichType: (origType: String)String
MDCompletenessDV: (data: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
SetMDColType: (metaData: org.apache.spark.sql.DataFrame, name: String, colType: String)org.apache.spark.sql.DataFrame


## Part 2 - Data Loading

### 2.1 Chargement des données

In [6]:
import org.apache.spark.sql.SparkSession

// Get (or create) the session, configured from the kernel's existing SparkContext `sc`.
val spark = SparkSession.builder().config(sc.getConf).getOrCreate()

// Read the CSV file(s): the first line is the header and column types are inferred.
// The DataFrame is marked for caching because several later cells reuse it.
val flights_original_data = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv(path + fileName)
  .persist()

// Action that materialises the cached data; its value is the cell result.
flights_original_data.count()

spark = org.apache.spark.sql.SparkSession@16040fff
flights_original_data = [FL_DATE: date, OP_CARRIER_AIRLINE_ID: int ... 11 more fields]


486133

In [7]:
// Print the inferred schema (column names, types, nullability) to the cell output.
flights_original_data.printSchema()

root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- ARR_DELAY_NEW: double (nullable = true)
 |-- CANCELLED: double (nullable = true)
 |-- DIVERTED: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- WEATHER_DELAY: double (nullable = true)
 |-- NAS_DELAY: double (nullable = true)
 |-- _c12: string (nullable = true)



In [19]:
// Display the first 10 rows of the loaded data.
flights_original_data.show(numRows = 10)

+----------+---------------------+-----------------+-----------------+---------------+------------+-------------+---------+--------+----------------+-------------+---------+----+
|   FL_DATE|OP_CARRIER_AIRLINE_ID|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_ID|DEST_AIRPORT_ID|CRS_DEP_TIME|ARR_DELAY_NEW|CANCELLED|DIVERTED|CRS_ELAPSED_TIME|WEATHER_DELAY|NAS_DELAY|_c12|
+----------+---------------------+-----------------+-----------------+---------------+------------+-------------+---------+--------+----------------+-------------+---------+----+
|2012-01-01|                20366|             4426|            15370|          12266|         845|          0.0|      0.0|     0.0|            99.0|         NULL|     NULL|NULL|
|2012-01-01|                20366|             4427|            12266|          15370|         858|          0.0|      0.0|     0.0|            88.0|         NULL|     NULL|NULL|
|2012-01-01|                20366|             4427|            15370|          12266|        1051|      

### 2.2 Collecting data quality metrics

In [22]:
// Profile every column of the loaded data (type, completeness, distinct values).
// Kept as a `var`, as in the original cell; any reassignment is outside this section.
var flights_original_metadata = MDCompletenessDV(flights_original_data)
// List the metrics alphabetically by column name.
flights_original_metadata.sort(col("name").asc).show()

+--------------------+-----------+-----------+----------+----------------+
|                name|   origType|    colType| compRatio|nbDistinctValues|
+--------------------+-----------+-----------+----------+----------------+
|       ARR_DELAY_NEW| DoubleType|numericType| 0.9833214|             549|
|           CANCELLED| DoubleType|numericType|       1.0|               2|
|        CRS_DEP_TIME|IntegerType|numericType|       1.0|            1153|
|    CRS_ELAPSED_TIME| DoubleType|numericType|       1.0|             419|
|     DEST_AIRPORT_ID|IntegerType|numericType|       1.0|             287|
|            DIVERTED| DoubleType|numericType|       1.0|               2|
|             FL_DATE|   DateType|   dateType|       1.0|              31|
|           NAS_DELAY| DoubleType|numericType|0.14586131|             283|
|OP_CARRIER_AIRLIN...|IntegerType|numericType|       1.0|              15|
|   OP_CARRIER_FL_NUM|IntegerType|numericType|       1.0|            6237|
|   ORIGIN_AIRPORT_ID|Int

flights_original_metadata = [name: string, origType: string ... 3 more fields]


[name: string, origType: string ... 3 more fields]

In [20]:
// Same metrics, ordered by completeness ratio (descending).
// Reuses the persisted flights_original_metadata computed in the previous cell.
// Calling MDCompletenessDV again would rerun every per-column Spark job and persist
// a second copy of the result just to change the sort order.
flights_original_metadata.orderBy($"compRatio".desc).show()

flights_original_metadata = [name: string, origType: string ... 3 more fields]


+--------------------+-----------+-----------+----------+----------------+
|                name|   origType|    colType| compRatio|nbDistinctValues|
+--------------------+-----------+-----------+----------+----------------+
|                _c12| StringType|   textType|       0.0|               1|
|       WEATHER_DELAY| DoubleType|numericType|0.14586131|             288|
|   ORIGIN_AIRPORT_ID|IntegerType|numericType|       1.0|             287|
|   OP_CARRIER_FL_NUM|IntegerType|numericType|       1.0|            6237|
|OP_CARRIER_AIRLIN...|IntegerType|numericType|       1.0|              15|
|           NAS_DELAY| DoubleType|numericType|0.14586131|             283|
|             FL_DATE|   DateType|   dateType|       1.0|              31|
|            DIVERTED| DoubleType|numericType|       1.0|               2|
|     DEST_AIRPORT_ID|IntegerType|numericType|       1.0|             287|
|    CRS_ELAPSED_TIME| DoubleType|numericType|       1.0|             419|
|        CRS_DEP_TIME|Int

[name: string, origType: string ... 3 more fields]

### 2.3 Data Description


| Colonne                   | Description                                                                 | Type           | Utilité en ML                  |
|---------------------------|-----------------------------------------------------------------------------|----------------|--------------------------------|
| `FL_DATE`                | Date du vol (format `YYYY-MM-DD`)                                           | `Date`         | ✅ Pour extraire jour/semaine  |
| `OP_CARRIER_AIRLINE_ID` | ID numérique de la compagnie aérienne                                        | `Int`          | ✅ Variable catégorielle       |
| `OP_CARRIER_FL_NUM`     | Numéro de vol dans la compagnie                                              | `Int`          | 🔶 Optionnel (peut être bruit) |
| `ORIGIN_AIRPORT_ID`     | ID de l’aéroport d’origine                                                   | `Int`          | ✅ Variable catégorielle       |
| `DEST_AIRPORT_ID`       | ID de l’aéroport de destination                                              | `Int`          | ✅ Variable catégorielle       |
| `CRS_DEP_TIME`          | Heure de départ prévue (`hhmm`, ex: "0845")                                  | `String`/`Int` | ✅ Extraire tranche horaire    |
| `ARR_DELAY_NEW`         | Retard à l’arrivée (≥ 0)                                                     | `Float`        | 🎯 **Label cible**             |
| `CANCELLED`             | Indique si le vol est annulé (1 = oui)                                       | `Float` (0/1)  | ❌ À filtrer (inutile en ML)   |
| `DIVERTED`              | Indique si le vol est détourné (1 = oui)                                     | `Float` (0/1)  | ❌ À filtrer (inutile en ML)   |
| `CRS_ELAPSED_TIME`      | Durée de vol planifiée en minutes                                            | `Float`        | ✅ Variable continue           |
| `WEATHER_DELAY`         | Retard dû à la météo (en minutes)                                            | `Float`        | 🔶 Optionnel pour post-analyse |
| `NAS_DELAY`             | Retard dû au système de navigation aérienne (ATC, météo modérée, congestion) | `Float`        | 🔶 Optionnel pour analyse cause|


In [28]:
// Display the summary statistics that describe() computes when called without
// arguments (the output starts with count and mean rows).
flights_original_data.describe().show()

+-------+---------------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+-----------------+------------------+------------------+----+
|summary|OP_CARRIER_AIRLINE_ID| OP_CARRIER_FL_NUM| ORIGIN_AIRPORT_ID|   DEST_AIRPORT_ID|      CRS_DEP_TIME|     ARR_DELAY_NEW|          CANCELLED|            DIVERTED| CRS_ELAPSED_TIME|     WEATHER_DELAY|         NAS_DELAY|_c12|
+-------+---------------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+-----------------+------------------+------------------+----+
|  count|              4142522|           4142522|           4142522|           4142522|           4142521|           4080958|            4142522|             4142522|          4142518|            758808|            758808|   0|
|   mean|   20028.678230073372|2309.6108691758304|12658.057500479177|12657.986524392

## Part 3 - Feature Engineering

### 3.1 Data Cleaning

#### 3.1.1 Drop Duplicates

In [23]:
// Report the row count before and after removing fully identical rows.
// String interpolation is used because println(label, value) packs both arguments
// into a tuple and prints "(label,value)".
println(s"Total records count without dropping duplicates: ${flights_original_data.count()}")
val flights_data = flights_original_data.dropDuplicates()
println(s"Total records count after dropping duplicates: ${flights_data.count()}")

(Total records count without dropping duplicates:,486133)(Total records count after dropping duplicates:,486133)

flights_data = [FL_DATE: date, OP_CARRIER_AIRLINE_ID: int ... 11 more fields]


[FL_DATE: date, OP_CARRIER_AIRLINE_ID: int ... 11 more fields]