# Weather Dataset Preprocessing and Feature Engineering

In [4]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Starting download from file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar
Finished download of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [5]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader}
import com.flightdelay.data.loaders.FlightDataLoader

// Environment configuration: the "jupyter" argument is what ConfigurationLoader receives below.
val args: Array[String] = Array("jupyter")

// Reuse the SparkContext configuration already provided by the notebook kernel (`sc`).
val spark: SparkSession = SparkSession.builder()
  .config(sc.getConf)
  .getOrCreate()

// Make the Spark session available implicitly.
// The explicit type annotation keeps implicit resolution predictable (inferred implicit
// types are a lint warning in Scala 2 and an error in Scala 3).
implicit val session: SparkSession = spark
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)

// Location of the processed weather dataset under the configured output base path.
val processedWeatherPath = s"${configuration.common.output.basePath}/common/data/processed_weather.parquet"
val weatherDF = spark.read.parquet(processedWeatherPath)

args = Array(jupyter)
spark = org.apache.spark.sql.SparkSession@40a477c3
session = org.apache.spark.sql.SparkSession@40a477c3
configuration = AppConfiguration(local,CommonConfig(42,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileConfig(/home/jovya...


AppConfiguration(local,CommonConfig(42,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileConfig(/home/jovya...

In [6]:
// Total number of rows in the processed weather dataset.
weatherDF.count()

11236398

In [7]:
// Print the column names and types of the processed weather dataset.
// printSchema is side-effecting, so it is called with explicit parentheses.
weatherDF.printSchema()

root
 |-- WBAN: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- StationType: integer (nullable = true)
 |-- SkyCondition: string (nullable = true)
 |-- SkyConditionFlag: string (nullable = true)
 |-- Visibility: double (nullable = true)
 |-- VisibilityFlag: string (nullable = true)
 |-- WeatherType: string (nullable = true)
 |-- WeatherTypeFlag: string (nullable = true)
 |-- DryBulbFarenheit: double (nullable = true)
 |-- DryBulbFarenheitFlag: string (nullable = true)
 |-- DryBulbCelsius: double (nullable = true)
 |-- DryBulbCelsiusFlag: string (nullable = true)
 |-- WetBulbFarenheit: double (nullable = true)
 |-- WetBulbFarenheitFlag: string (nullable = true)
 |-- WetBulbCelsius: double (nullable = true)
 |-- WetBulbCelsiusFlag: string (nullable = true)
 |-- DewPointFarenheit: double (nullable = true)
 |-- DewPointFarenheitFlag: string (nullable = true)
 |-- DewPointCelsius: double (nullable = true)
 |-- DewPointCelsiusFlag: string (nullable = true)
 |-- RelativeHumi

## Station Type

In [5]:
// Distinct StationType codes present in the dataset, printed without truncation.
weatherDF.select("StationType").distinct().show(truncate = false)

+-----------+
|StationType|
+-----------+
|12         |
|6          |
|5          |
|15         |
|9          |
|4          |
|11         |
|0          |
+-----------+



## SkyConditionFeatures

In [12]:
// Sky-condition feature columns to inspect.
// Kept as a `var` because later cells reassign `addedFeatureColumns`.
var addedFeatureColumns = Seq(
  "feature_most_critical_sky",
  "feature_num_cloud_layers",
  "feature_cloud_risk_score",
  "feature_has_overcast",
  "feature_has_broken",
  "feature_has_obscured",
  "feature_is_clear",
  "feature_lowest_cloud_height",
  "feature_ceiling",
  "feature_has_low_ceiling"
)

// `df` is never reassigned (other cells redeclare it), so it is an immutable `val`.
val df = weatherDF
  .select(addedFeatureColumns.head, addedFeatureColumns.tail: _*)

// Show the first record vertically, truncating values longer than 1000 characters.
df.show(numRows = 1, truncate = 1000, vertical = true)

addedFeatureColumns = List(feature_most_critical_sky, feature_num_cloud_layers, feature_cloud_risk_score, feature_has_overcast, feature_has_broken, feature_has_obscured, feature_is_clear, feature_lowest_cloud_height, feature_ceiling, feature_has_low_ceiling)
df = [feature_most_critical_sky: string, feature_num_cloud_layers: int ... 8 more fields]


-RECORD 0----------------------------
 feature_most_critical_sky   | OVC   
 feature_num_cloud_layers    | 1     
 feature_cloud_risk_score    | 4.0   
 feature_has_overcast        | true  
 feature_has_broken          | false 
 feature_has_obscured        | false 
 feature_is_clear            | false 
 feature_lowest_cloud_height | 1300  
 feature_ceiling             | 1300  
 feature_has_low_ceiling     | 0     
only showing top 1 row



[feature_most_critical_sky: string, feature_num_cloud_layers: int ... 8 more fields]

## VisibilityFeatures

In [13]:
// Visibility feature columns to inspect.
addedFeatureColumns = Seq(
  "feature_visibility_miles",
  "feature_visibility_km",
  "feature_visibility_category",
  "feature_visibility_risk_score",
  "feature_is_low_visibility",
  "feature_is_very_low_visibility",
  "feature_visibility_normalized",
  "feature_visibility_inverse"
)

// Project the dataset onto those columns and print the first record, one field per line.
val df = {
  val first +: others = addedFeatureColumns
  weatherDF.select(first, others: _*)
}
df.show(numRows = 1, truncate = 1000, vertical = true)

addedFeatureColumns = List(feature_visibility_miles, feature_visibility_km, feature_visibility_category, feature_visibility_risk_score, feature_is_low_visibility, feature_is_very_low_visibility, feature_visibility_normalized, feature_visibility_inverse)
df = [feature_visibility_miles: double, feature_visibility_km: double ... 6 more fields]


-RECORD 0-------------------------------
 feature_visibility_miles       | 1.0   
 feature_visibility_km          | 1.609 
 feature_visibility_category    | IFR   
 feature_visibility_risk_score  | 3.0   
 feature_is_low_visibility      | 1     
 feature_is_very_low_visibility | 0     
 feature_visibility_normalized  | 0.1   
 feature_visibility_inverse     | 1.0   
only showing top 1 row



[feature_visibility_miles: double, feature_visibility_km: double ... 6 more fields]

In [14]:
// Collect the distinct visibility categories, sorted ascending.
// Reads directly from weatherDF so the cell does not depend on which cell last defined `df`.
val distinctValues = weatherDF
  .select("feature_visibility_category")
  .distinct()
  .orderBy("feature_visibility_category")

// Print the result.
distinctValues.show()

distinctValues = [feature_visibility_category: string]


+---------------------------+
|feature_visibility_category|
+---------------------------+
|                        IFR|
|                    IFR_LOW|
|                       LIFR|
|                       MVFR|
|                        VFR|
|                   VFR_HIGH|
+---------------------------+



[feature_visibility_category: string]

## WeatherInteractionFeatures

In [8]:
// Weather-interaction feature columns to inspect.
addedFeatureColumns = Seq(
  "feature_weather_severity_index",
  "feature_is_vfr_conditions",
  "feature_is_ifr_conditions",
  "feature_requires_cat_ii",
  "feature_operations_risk_level"
)

// Keep only those columns, then print the first record vertically.
val df = weatherDF.select(
  addedFeatureColumns.head,
  addedFeatureColumns.tail: _*
)
df.show(numRows = 1, truncate = 1000, vertical = true)

addedFeatureColumns = List(feature_weather_severity_index, feature_is_vfr_conditions, feature_is_ifr_conditions, feature_requires_cat_ii, feature_operations_risk_level)
df = [feature_weather_severity_index: double, feature_is_vfr_conditions: int ... 3 more fields]


-RECORD 0-----------------------------
 feature_weather_severity_index | 3.4 
 feature_is_vfr_conditions      | 0   
 feature_is_ifr_conditions      | 1   
 feature_requires_cat_ii        | 0   
 feature_operations_risk_level  | 2   
only showing top 1 row



[feature_weather_severity_index: double, feature_is_vfr_conditions: int ... 3 more fields]

## WeatherTypeFeatureGenerator

In [15]:
// Weather-type feature columns to inspect.
addedFeatureColumns = Seq(
  "intensity_heavy",
  "intensity_light",
  "weather_intensity",
  "has_thunderstorm",
  "has_freezing_precip",
  "has_freezing",
  "has_precipitation",
  "has_obscuration",
  "has_visible_moisture",
  "has_hazardous",
  "has_rain",
  "has_snow",
  "has_hail",
  "extracted_codes",
  "weather_hazard_level",
  "Icing_Risk_Flag",
  "Icing_Risk_Level"
)

// Project onto the listed columns and print the first record, one field per line.
val df = {
  val first +: others = addedFeatureColumns
  weatherDF.select(first, others: _*)
}
df.show(numRows = 1, truncate = 1000, vertical = true)

addedFeatureColumns = List(intensity_heavy, intensity_light, weather_intensity, has_thunderstorm, has_freezing_precip, has_freezing, has_precipitation, has_obscuration, has_visible_moisture, has_hazardous, has_rain, has_snow, has_hail, extracted_codes, weather_hazard_level, Icing_Risk_Flag, Icing_Risk_Level)
df = [intensity_heavy: int, intensity_light: int ... 15 more fields]


-RECORD 0------------------------
 intensity_heavy      | 0        
 intensity_light      | 0        
 weather_intensity    | moderate 
 has_thunderstorm     | 0        
 has_freezing_precip  | 0        
 has_freezing         | 0        
 has_precipitation    | 0        
 has_obscuration      | 0        
 has_visible_moisture | 0        
 has_hazardous        | 0        
 has_rain             | 0        
 has_snow             | 0        
 has_hail             | 0        
 extracted_codes      |          
 weather_hazard_level | 0        
 Icing_Risk_Flag      | 0        
 Icing_Risk_Level     | 0        
only showing top 1 row



[intensity_heavy: int, intensity_light: int ... 15 more fields]

In [10]:
// Collect the distinct weather_intensity values, sorted ascending.
// Reads directly from weatherDF: the cells of this notebook are run out of order, so `df`
// may not contain weather_intensity when this cell executes.
val distinctValues = weatherDF
  .select("weather_intensity")
  .distinct()
  .orderBy("weather_intensity")

// Print the result.
distinctValues.show()

distinctValues = [weather_intensity: string]


+-----------------+
|weather_intensity|
+-----------------+
|            heavy|
|            light|
|         moderate|
+-----------------+



[weather_intensity: string]

In [11]:
import org.apache.spark.sql.functions.col

// Keep only the rows whose weather_intensity equals "heavy".
// NOTE(review): relies on `df` still being the projection that contains weather_intensity —
// confirm the cell execution order before running.
val heavyWeatherDF = df.where(col("weather_intensity") === "heavy")

// Print the first matching record, one field per line.
heavyWeatherDF.show(numRows = 1, truncate = 10000, vertical = true)

heavyWeatherDF = [intensity_heavy: int, intensity_light: int ... 15 more fields]


-RECORD 0---------------------
 intensity_heavy      | 1     
 intensity_light      | 0     
 weather_intensity    | heavy 
 has_thunderstorm     | 0     
 has_freezing_precip  | 0     
 has_freezing         | 0     
 has_precipitation    | 1     
 has_obscuration      | 1     
 has_visible_moisture | 1     
 has_hazardous        | 0     
 has_rain             | 1     
 has_snow             | 0     
 has_hail             | 0     
 extracted_codes      | RA BR 
 weather_hazard_level | 2     
 Icing_Risk_Flag      | 0     
 Icing_Risk_Level     | 0     
only showing top 1 row



[intensity_heavy: int, intensity_light: int ... 15 more fields]

## Original DF

In [29]:
// Dry-bulb temperature columns and their flag columns.
// Declared as a `var` because the following cells reassign `featureColumns`.
var featureColumns = Seq(
  "DryBulbCelsius",
  "DryBulbCelsiusFlag",
  "DryBulbFarenheit",
  "DryBulbFarenheitFlag"
)

// Project onto those columns and print the first record vertically.
val df = {
  val first +: others = featureColumns
  weatherDF.select(first, others: _*)
}
df.show(numRows = 1, truncate = 1000, vertical = true)

-RECORD 0--------------------
 DryBulbCelsius       | -1.0 
 DryBulbCelsiusFlag   |      
 DryBulbFarenheit     | 30.0 
 DryBulbFarenheitFlag |      
only showing top 1 row



featureColumns = List(DryBulbCelsius, DryBulbCelsiusFlag, DryBulbFarenheit, DryBulbFarenheitFlag)
df = [DryBulbCelsius: double, DryBulbCelsiusFlag: string ... 2 more fields]


[DryBulbCelsius: double, DryBulbCelsiusFlag: string ... 2 more fields]

In [30]:
// Dew-point temperature columns and their flag columns.
featureColumns = Seq(
  "DewPointCelsius",
  "DewPointCelsiusFlag",
  "DewPointFarenheit",
  "DewPointFarenheitFlag"
)

// Keep only those columns, then print the first record, one field per line.
val df = weatherDF.select(
  featureColumns.head,
  featureColumns.tail: _*
)
df.show(numRows = 1, truncate = 1000, vertical = true)

featureColumns = List(DewPointCelsius, DewPointCelsiusFlag, DewPointFarenheit, DewPointFarenheitFlag)
df = [DewPointCelsius: double, DewPointCelsiusFlag: string ... 2 more fields]


-RECORD 0----------------------
 DewPointCelsius       | -12.0 
 DewPointCelsiusFlag   |       
 DewPointFarenheit     | 10.0  
 DewPointFarenheitFlag |       
only showing top 1 row



[DewPointCelsius: double, DewPointCelsiusFlag: string ... 2 more fields]

In [31]:
// Wet-bulb temperature columns and their flag columns.
featureColumns = Seq(
  "WetBulbCelsius",
  "WetBulbCelsiusFlag",
  "WetBulbFarenheit",
  "WetBulbFarenheitFlag"
)

// Project onto those columns and print the first record vertically.
val df = {
  val first +: others = featureColumns
  weatherDF.select(first, others: _*)
}
df.show(numRows = 1, truncate = 1000, vertical = true)

-RECORD 0--------------------
 WetBulbCelsius       | -4.9 
 WetBulbCelsiusFlag   |      
 WetBulbFarenheit     | 23.0 
 WetBulbFarenheitFlag |      
only showing top 1 row



featureColumns = List(WetBulbCelsius, WetBulbCelsiusFlag, WetBulbFarenheit, WetBulbFarenheitFlag)
df = [WetBulbCelsius: double, WetBulbCelsiusFlag: string ... 2 more fields]


[WetBulbCelsius: double, WetBulbCelsiusFlag: string ... 2 more fields]

In [32]:
// Relative-humidity column and its flag column.
featureColumns = Seq("RelativeHumidity", "RelativeHumidityFlag")

// Keep only those columns, then print the first record, one field per line.
val df = weatherDF.select(
  featureColumns.head,
  featureColumns.tail: _*
)
df.show(numRows = 1, truncate = 1000, vertical = true)

-RECORD 0--------------------
 RelativeHumidity     | 43.0 
 RelativeHumidityFlag |      
only showing top 1 row



featureColumns = List(RelativeHumidity, RelativeHumidityFlag)
df = [RelativeHumidity: double, RelativeHumidityFlag: string]


[RelativeHumidity: double, RelativeHumidityFlag: string]

In [33]:
// Hourly-precipitation column and its flag column.
featureColumns = Seq("HourlyPrecip", "HourlyPrecipFlag")

// Project onto those columns and print the first record vertically.
val df = {
  val first +: others = featureColumns
  weatherDF.select(first, others: _*)
}
df.show(numRows = 1, truncate = 1000, vertical = true)

featureColumns = List(HourlyPrecip, HourlyPrecipFlag)
df = [HourlyPrecip: string, HourlyPrecipFlag: string]


-RECORD 0---------------
 HourlyPrecip     |     
 HourlyPrecipFlag |     
only showing top 1 row



[HourlyPrecip: string, HourlyPrecipFlag: string]

In [34]:
// Wind columns and their flag columns.
featureColumns = Seq(
  "WindSpeed",
  "WindSpeedFlag",
  "WindDirection",
  "WindDirectionFlag",
  "ValueForWindCharacter",
  "ValueForWindCharacterFlag"
)

// Keep only those columns, then print the first record, one field per line.
val df = weatherDF.select(
  featureColumns.head,
  featureColumns.tail: _*
)
df.show(numRows = 1, truncate = 1000, vertical = true)

-RECORD 0--------------------------
 WindSpeed                 | 5.0   
 WindSpeedFlag             |       
 WindDirection             | 120.0 
 WindDirectionFlag         |       
 ValueForWindCharacter     |       
 ValueForWindCharacterFlag |       
only showing top 1 row



featureColumns = List(WindSpeed, WindSpeedFlag, WindDirection, WindDirectionFlag, ValueForWindCharacter, ValueForWindCharacterFlag)
df = [WindSpeed: double, WindSpeedFlag: string ... 4 more fields]


[WindSpeed: double, WindSpeedFlag: string ... 4 more fields]

In [9]:
// Raw columns from which the sky, visibility and weather-type sections read.
val featureColumns = Seq("SkyCondition", "Visibility", "WeatherType")

// Project onto those columns and print the first 100 rows as a table,
// truncating values longer than 1000 characters.
val df = {
  val first +: others = featureColumns
  weatherDF.select(first, others: _*)
}
df.show(numRows = 100, truncate = 1000, vertical = false)

featureColumns = List(SkyCondition, Visibility, WeatherType)
df = [SkyCondition: string, Visibility: double ... 1 more field]


+--------------------+----------+-----------+
|        SkyCondition|Visibility|WeatherType|
+--------------------+----------+-----------+
|              OVC013|      10.0|           |
|                 CLR|      10.0|           |
|                   M|      NULL|           |
|FEW047 BKN080 OVC110|      10.0|           |
|FEW028 SCT034 BKN042|      10.0|           |
|                   M|      NULL|           |
|              OVC120|      10.0|           |
|              OVC016|      10.0|           |
|                 CLR|      10.0|           |
|               VV002|      0.75|     -RA BR|
|                   M|      NULL|           |
|       FEW025 OVC050|      10.0|           |
|       BKN016 OVC060|       6.0|         BR|
|                   M|      NULL|           |
|       BKN020 OVC027|      10.0|           |
|              OVC018|      10.0|           |
|              OVC042|       5.0|         BR|
|                   M|      NULL|           |
|SCT017 SCT023 OVC049|       3.0| 

[SkyCondition: string, Visibility: double ... 1 more field]

In [11]:
// Wind columns and their flag columns.
val featureColumns = Seq(
  "WindSpeed",
  "WindSpeedFlag",
  "WindDirection",
  "WindDirectionFlag",
  "ValueForWindCharacter",
  "ValueForWindCharacterFlag"
)

// Keep only those columns and print the first 100 rows.
val df = weatherDF.select(
  featureColumns.head,
  featureColumns.tail: _*
)
df.show(numRows = 100)

featureColumns = List(WindSpeed, WindSpeedFlag, WindDirection, WindDirectionFlag, ValueForWindCharacter, ValueForWindCharacterFlag)
df = [WindSpeed: double, WindSpeedFlag: string ... 4 more fields]


+---------+-------------+-------------+-----------------+---------------------+-------------------------+
|WindSpeed|WindSpeedFlag|WindDirection|WindDirectionFlag|ValueForWindCharacter|ValueForWindCharacterFlag|
+---------+-------------+-------------+-----------------+---------------------+-------------------------+
|     21.0|             |        360.0|                 |                   30|                         |
|      5.0|             |        200.0|                 |                     |                         |
|     NULL|             |         NULL|                 |                     |                         |
|      3.0|             |        260.0|                 |                     |                         |
|      3.0|             |        230.0|                 |                     |                         |
|      2.0|             |         NULL|                 |                     |                         |
|      0.0|             |          0.0|       

[WindSpeed: double, WindSpeedFlag: string ... 4 more fields]

In [14]:
// Humidity, pressure, record-type, precipitation and altimeter columns.
val featureColumns = Seq(
  "RelativeHumidity",
  "StationPressure",
  "PressureTendency",
  "PressureChange",
  "SeaLevelPressure",
  "RecordType",
  "HourlyPrecip",
  "Altimeter"
)

// Project onto those columns and print the first 100 rows.
val df = {
  val first +: others = featureColumns
  weatherDF.select(first, others: _*)
}
df.show(numRows = 100)

featureColumns = List(RelativeHumidity, StationPressure, PressureTendency, PressureChange, SeaLevelPressure, RecordType, HourlyPrecip, Altimeter)
df = [RelativeHumidity: double, StationPressure: double ... 6 more fields]


+----------------+---------------+----------------+--------------+----------------+----------+------------+---------+
|RelativeHumidity|StationPressure|PressureTendency|PressureChange|SeaLevelPressure|RecordType|HourlyPrecip|Altimeter|
+----------------+---------------+----------------+--------------+----------------+----------+------------+---------+
|            77.0|          25.86|                |          NULL|           30.25|        AA|            |    30.18|
|            63.0|          26.91|                |          NULL|           29.93|        AA|            |    30.03|
|            NULL|           NULL|                |          NULL|               M|     CRN05|            |     NULL|
|            80.0|          25.58|                |          NULL|           29.90|        AA|           T|    29.97|
|            93.0|          29.32|                |          NULL|               M|        SP|            |    29.88|
|            NULL|           NULL|                |     

[RelativeHumidity: double, StationPressure: double ... 6 more fields]

In [16]:
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

/**
 * Builds a per-column summary of how often the literal marker "M" and other
 * non-numeric strings occur in `df`.
 *
 * Every column is cast to string and three counts are computed in a single aggregation:
 *  - `m_count`: rows whose value is exactly "M";
 *  - `non_numeric_other_count`: non-null rows that are neither "M" nor matched by the numeric regex;
 *  - `nonnull_count`: non-null rows.
 *
 * `ok_pattern` is true when the column contains at least one "M" and every other
 * value is null or numeric.
 *
 * An empty DataFrame yields zero counts (instead of a NullPointerException on the null
 * result of `sum`), and a DataFrame without columns yields an empty summary.
 *
 * @param df    DataFrame to inspect
 * @param spark session used to build the summary DataFrame
 * @return one row per column of `df` with the schema
 *         (column, m_count, non_numeric_other_count, nonnull_count, ok_pattern)
 */
def summarizeMAndNumeric(df: DataFrame)(implicit spark: SparkSession): DataFrame = {
  val numericRegex = "^[+-]?(\\d+\\.?\\d*|\\.\\d+)([eE][+-]?\\d+)?$"

  val schema = StructType(Seq(
    StructField("column", StringType, false),
    StructField("m_count", LongType, false),
    StructField("non_numeric_other_count", LongType, false),
    StructField("nonnull_count", LongType, false),
    StructField("ok_pattern", BooleanType, false) // true => contains "M" and the rest is NULL/numeric
  ))

  val columnNames: Seq[String] = df.columns.toSeq

  val rows: Seq[Row] =
    if (columnNames.isEmpty) {
      // Nothing to aggregate; `agg` needs at least one expression.
      Seq.empty[Row]
    } else {
      // 1) All counts in a single scan. `count(when(cond, 1))` counts the rows where `cond`
      //    is true and returns 0 (never null) when there are no rows.
      val aggExprs = columnNames.flatMap { c =>
        // Backtick-quote the name so dots or other special characters are not parsed
        // as nested-field access.
        val quoted = "`" + c.replace("`", "``") + "`"
        val s = col(quoted).cast("string")
        Seq(
          count(when(s === "M", 1)).alias(s"${c}__m_count"),
          // non-null values, different from "M", non-numeric
          count(when(s.isNotNull && s =!= "M" && !s.rlike(numericRegex), 1))
            .alias(s"${c}__non_numeric_other_count"),
          count(when(s.isNotNull, 1)).alias(s"${c}__nonnull_count")
        )
      }

      val aggRow = df.agg(aggExprs.head, aggExprs.tail: _*).first()

      // 2) Build the (very small) summary on the driver. The three counts of the i-th
      //    column sit at positions 3*i, 3*i + 1 and 3*i + 2 of the aggregated row.
      columnNames.zipWithIndex.map { case (c, i) =>
        val base     = 3 * i
        val mCount   = aggRow.getLong(base)
        val badCount = aggRow.getLong(base + 1)
        val nonNull  = aggRow.getLong(base + 2)
        val ok       = (mCount > 0) && (badCount == 0L)
        Row(c, mCount, badCount, nonNull, ok)
      }
    }

  spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
}

// Summarize every column of the weather dataset, then list the columns matching the
// "M + numeric" pattern first, with the highest "M" counts on top.
val summaryDF = summarizeMAndNumeric(weatherDF)
summaryDF
  .sort(col("ok_pattern").desc, col("m_count").desc)
  .show(truncate = false)

+---------------------------+-------+-----------------------+-------------+----------+
|column                     |m_count|non_numeric_other_count|nonnull_count|ok_pattern|
+---------------------------+-------+-----------------------+-------------+----------+
|SeaLevelPressure           |5888816|0                      |11236398     |true      |
|SkyCondition               |1481170|9755228                |11236398     |false     |
|RecordType                 |0      |11236398               |11236398     |false     |
|feature_most_critical_sky  |0      |11236398               |11236398     |false     |
|RecordTypeFlag             |0      |11236398               |11236398     |false     |
|feature_is_clear           |0      |11236398               |11236398     |false     |
|HourlyPrecip               |0      |10907635               |11236398     |false     |
|feature_num_cloud_layers   |0      |0                      |11236398     |false     |
|HourlyPrecipFlag           |0      |112363

summaryDF = [column: string, m_count: bigint ... 3 more fields]


summarizeMAndNumeric: (df: org.apache.spark.sql.DataFrame)(implicit spark: org.apache.spark.sql.SparkSession)org.apache.spark.sql.DataFrame


[column: string, m_count: bigint ... 3 more fields]