# Flight Preprocessing - Cleaning

In [1]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Starting download from file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar
Finished download of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [2]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader}
import com.flightdelay.data.loaders.FlightDataLoader

// Environment configuration: the single argument selects the "local" profile
// that ConfigurationLoader resolves into an AppConfiguration.
val args: Array[String] = Array("local")

// Build (or reuse) a SparkSession from the kernel-provided SparkContext `sc`.
val spark: SparkSession = SparkSession.builder()
  .config(sc.getConf)
  .getOrCreate()

// Expose the Spark session and the application configuration as implicits.
// Implicit definitions carry an explicit type so that implicit resolution does
// not depend on type inference (and so the cell stays valid on Scala 3).
// NOTE(review): presumably the project loaders/cleaners take these as implicit
// parameters — confirm against their signatures.
implicit val session: SparkSession = spark
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)

// Load the 201201 flights CSV through the project loader.
// NOTE(review): this relative path differs from the flight file path held in
// `configuration` (/data/FLIGHT-3Y/Flights/201201.csv) — confirm which one is intended.
val flightFilePath = "../../data/FLIGHT-3Y/Flights/201201.csv"
val flightData = FlightDataLoader.loadFromFilePath(flightFilePath)



----------------------------------------------------------------------------------------------------------
--> [FlightDataLoader] Flight Data Loading - Start ...
--> 486133 loaded ...
root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- ARR_DELAY_NEW: double (nullable = true)
 |-- CANCELLED: double (nullable = true)
 |-- DIVERTED: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- WEATHER_DELAY: double (nullable = true)
 |-- NAS_DELAY: double (nullable = true)

+----------+---------------------+-----------------+-----------------+---------------+------------+-------------+---------+--------+----------------+-------------+---------+
|   FL_DATE|OP_CARRIER_AIRLINE_ID|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_ID|DEST_AIRPORT_ID|CRS_

args = Array(local)
spark = org.apache.spark.sql.SparkSession@345dd9ce
session = org.apache.spark.sql.SparkSession@345dd9ce
configuration = AppConfiguration(local,DataConfig(/data,FileConfig(/data/FLIGHT-3Y/Flights/201201.csv),FileConfig(/data/FLIGHT-3Y/Weather/201201hourly.txt),FileConfig(/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/output,FileConfig(/output/data),FileConfig(/output/model)))
flightFilePath = ../../data/FLIGHT-3Y/Flights/201201.csv


flightData: org.apache.spark.sql....


../../data/FLIGHT-3Y/Flights/201201.csv

In [3]:
import com.flightdelay.data.utils.DataQualityMetrics

// Compute data-quality metrics for the raw flight data and print them.
// The printed table has one row per input column, with the fields
// name, origType, colType, compRatio and nbDistinctValues.
val flightDataMetrics = DataQualityMetrics.metrics(flightData)
flightDataMetrics.show()

+--------------------+-----------+-------+----------+----------------+
|                name|   origType|colType| compRatio|nbDistinctValues|
+--------------------+-----------+-------+----------+----------------+
|             FL_DATE|   DateType|   date|       1.0|              31|
|OP_CARRIER_AIRLIN...|IntegerType|numeric|       1.0|              15|
|   OP_CARRIER_FL_NUM|IntegerType|numeric|       1.0|            6237|
|   ORIGIN_AIRPORT_ID|IntegerType|numeric|       1.0|             287|
|     DEST_AIRPORT_ID|IntegerType|numeric|       1.0|             287|
|        CRS_DEP_TIME|IntegerType|numeric|       1.0|            1153|
|       ARR_DELAY_NEW| DoubleType|numeric| 0.9833214|             549|
|           CANCELLED| DoubleType|numeric|       1.0|               2|
|            DIVERTED| DoubleType|numeric|       1.0|               2|
|    CRS_ELAPSED_TIME| DoubleType|numeric|       1.0|             419|
|       WEATHER_DELAY| DoubleType|numeric|0.14586131|             288|
|     

flightDataMetrics = [name: string, origType: string ... 3 more fields]


[name: string, origType: string ... 3 more fields]

In [4]:
import com.flightdelay.data.preprocessing.FlightDataCleaner

// Run the project's flight-cleaning step on the raw flight data; the cleaner
// logs its phases (duplicate removal, flight filtering, type conversion, ...)
// and row counts as it goes.
// NOTE(review): presumably relies on the implicit SparkSession / AppConfiguration
// defined in an earlier cell — confirm against FlightDataCleaner.preprocess.
val flightCleanedData = FlightDataCleaner.preprocess(flightData)



----------------------------------------------------------------------------------------------------------
--> [FlightDataCleaner] Flight Data Cleaner - Start ...
----------------------------------------------------------------------------------------------------------
Data Original Count: 486133

Phase 1: Basic Cleaning - Remove Duplicates
Current Count : 486133

Phase 2: Filter Flights
- Filter Cancelled and Diverted Flights
Suppression des valeurs spécifiques: Map(CANCELLED -> List(1.0), DIVERTED -> List(1.0))
Nombre de lignes avant: 486133
Nombre de lignes après suppression des valeurs spécifiques: 478025
- Filter Invalid departure time
- Filter Invalid airports
Current Count : 478025

Phase 3: Types Conversion
Conversion des types de données: NAS_DELAY, OP_CARRIER_AIRLINE_ID, OP_CARRIER_FL_NUM, WEATHER_DELAY, DEST_AIRPORT_ID, ORIGIN_AIRPORT_ID, CRS_ELAPSED_TIME, FL_DATE, CRS_DEP_TIME, ARR_DELAY_NEW
- Filter Invalid flight date formats
Current Count : 478025

Phase 4: Filter Outl

flightCleanedData = [FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 8 more fields]


[FL_DATE: string, OP_CARRIER_AIRLINE_ID: int ... 8 more fields]

In [5]:
import com.flightdelay.data.utils.DataQualityMetrics

// Recompute the same data-quality metrics on the cleaned data and print them,
// so the result can be compared with the metrics of the raw data.
val flightCleanedDataMetrics = DataQualityMetrics.metrics(flightCleanedData)
flightCleanedDataMetrics.show()

+--------------------+-----------+-------+----------+----------------+
|                name|   origType|colType| compRatio|nbDistinctValues|
+--------------------+-----------+-------+----------+----------------+
|             FL_DATE| StringType|textual|       1.0|              31|
|OP_CARRIER_AIRLIN...|IntegerType|numeric|       1.0|              15|
|   OP_CARRIER_FL_NUM| StringType|textual|       1.0|            6237|
|   ORIGIN_AIRPORT_ID|IntegerType|numeric|       1.0|             287|
|     DEST_AIRPORT_ID|IntegerType|numeric|       1.0|             287|
|        CRS_DEP_TIME|IntegerType|numeric|       1.0|            1153|
|       ARR_DELAY_NEW| DoubleType|numeric|       1.0|             486|
|    CRS_ELAPSED_TIME| DoubleType|numeric|       1.0|             418|
|       WEATHER_DELAY| DoubleType|numeric|0.14821951|             278|
|           NAS_DELAY| DoubleType|numeric|0.14821951|             282|
+--------------------+-----------+-------+----------+----------------+



flightCleanedDataMetrics = [name: string, origType: string ... 3 more fields]


[name: string, origType: string ... 3 more fields]