# Weather Feature Engineering

In [1]:
%AddJar file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar

Starting download from file:///home/jovyan/work/apps/Emiasd-Flight-Data-Analysis.jar
Finished download of Emiasd-Flight-Data-Analysis.jar
Using cached version of Emiasd-Flight-Data-Analysis.jar


In [2]:
import org.apache.spark.sql.SparkSession
import com.flightdelay.config.{AppConfiguration, ConfigurationLoader}
import com.flightdelay.data.loaders.FlightDataLoader

// Environment configuration: "jupyter" is the only argument passed to the configuration loader.
val args: Array[String] = Array("jupyter")

// Build (or reuse) the session from the SparkContext configuration exposed by the kernel as `sc`.
val spark = SparkSession.builder()
  .config(sc.getConf)
  .getOrCreate()

// Make the Spark session implicit. The type is written out explicitly: implicit
// definitions without a declared type are fragile for implicit resolution and are
// rejected by Scala 3.
implicit val session: SparkSession = spark
implicit val configuration: AppConfiguration = ConfigurationLoader.loadConfiguration(args)

args = Array(jupyter)
spark = org.apache.spark.sql.SparkSession@4856b919
session = org.apache.spark.sql.SparkSession@4856b919
configuration = AppConfiguration(local,CommonConfig(42,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileConfig(/home/jovya...


AppConfiguration(local,CommonConfig(42,DataConfig(/home/jovyan/work/data,FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Flights/*.csv),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/Weather/*.txt),FileConfig(/home/jovyan/work/data/FLIGHT-3Y/wban_airport_timezone.csv)),OutputConfig(/home/jovyan/work/output,FileConfig(/home/jovyan/work/output/data),FileConfig(/home/jovya...

# Load Data

In [5]:
import com.flightdelay.data.loaders.WeatherDataLoader
// Run the project's weather loader. The log recorded for this cell shows it reading the
// raw weather files and saving them as raw_weather.parquet; its return value is unused here.
WeatherDataLoader.loadFromConfiguration()

// Path of the Parquet dataset, built from the configured output base path.
val rawWeatherPath = s"${configuration.common.output.basePath}/common/data/raw_weather.parquet"

// Read the weather records back from Parquet.
val weatherDF = spark.read.format("parquet").load(rawWeatherPath)


[STEP 1][DataLoader] Weather Data Loading - Start

Loading from CSV file:
  - Path: /home/jovyan/work/data/FLIGHT-3Y/Weather/*.txt
  - Loaded 32631312 records from CSV

Saving to Parquet format:
  - Path: /home/jovyan/work/output/common/data/raw_weather.parquet
  - Saved 32631312 records to Parquet

Schema:
root
 |-- WBAN: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- StationType: integer (nullable = true)
 |-- SkyCondition: string (nullable = true)
 |-- SkyConditionFlag: string (nullable = true)
 |-- Visibility: double (nullable = true)
 |-- VisibilityFlag: string (nullable = true)
 |-- WeatherType: string (nullable = true)
 |-- WeatherTypeFlag: string (nullable = true)
 |-- DryBulbFarenheit: double (nullable = true)
 |-- DryBulbFarenheitFlag: string (nullable = true)
 |-- DryBulbCelsius: double (nullable = true)
 |-- DryBulbCelsiusFlag: string (nullable = true)
 |-- WetBulbFarenheit: double (nullable = true)
 |-- WetBulbFarenhe

rawWeatherPath = /home/jovyan/work/output/common/data/raw_weather.parquet
weatherDF = [WBAN: string, Date: string ... 42 more fields]


[WBAN: string, Date: string ... 42 more fields]

In [10]:
import org.apache.spark.sql.functions._

// Preview 20 untruncated rows that have a non-blank WeatherType code, next to the
// SkyCondition string of the same observation.
weatherDF
  .where(col("WeatherType").isNotNull)
  .where(length(trim(col("WeatherType"))) > 0)
  .select("WeatherType", "SkyCondition")
  .show(numRows = 20, truncate = false)

+-----------+--------------------+
|WeatherType|SkyCondition        |
+-----------+--------------------+
|-RA        |SCT050 BKN100 OVC120|
|-RA        |SCT050 SCT070 BKN090|
|VCTS       |BKN100 BKN120       |
|TS         |SCT070 OVC090       |
|-RA        |SCT042 SCT060 BKN095|
|VCTS       |BKN110              |
|VCTS       |SCT070 BKN100 BKN110|
|VCTSRA     |SCT048 BKN060 OVC090|
|TS         |SCT048 SCT065 BKN110|
|RA         |SCT075 BKN095 OVC110|
|VCTS       |SCT060 SCT075 OVC090|
|TS         |SCT090              |
|TS         |SCT110              |
|TS         |CLR                 |
|VCTS       |SCT075 BKN110       |
|VCTS       |SCT080 BKN100       |
|VCTS       |SCT049 SCT065 BKN110|
|TS         |SCT080 SCT120       |
|-TSRA      |SCT055 SCT065 BKN080|
|TS         |SCT055 SCT065 BKN085|
+-----------+--------------------+
only showing top 20 rows



In [42]:
import com.flightdelay.data.preprocessing.weather.SkyConditionFeatures

// Derive the sky-condition features (the feature_* columns selected below) from the raw weather data.
val weatherDFWithSkyConditions = SkyConditionFeatures.createSkyConditionFeatures(weatherDF)

// Print a single record vertically and without truncation, restricted to observations
// that have a non-blank WeatherType.
// Tabular alternative: .show(20, truncate=false)
weatherDFWithSkyConditions
  .where(col("WeatherType").isNotNull)
  .where(length(trim(col("WeatherType"))) > 0)
  .select(
    "WeatherType", "SkyCondition",
    "feature_most_critical_sky", "feature_num_cloud_layers", "feature_cloud_risk_score",
    "feature_has_overcast", "feature_has_broken", "feature_has_obscured", "feature_is_clear",
    "feature_lowest_cloud_height", "feature_ceiling", "feature_has_low_ceiling"
  )
  .show(numRows = 1, truncate = 0, vertical = true)

-RECORD 0-------------------------------------------
 WeatherType                 | -RA                  
 SkyCondition                | SCT050 BKN100 OVC120 
 feature_most_critical_sky   | OVC                  
 feature_num_cloud_layers    | 3                    
 feature_cloud_risk_score    | 4.0                  
 feature_has_overcast        | true                 
 feature_has_broken          | true                 
 feature_has_obscured        | false                
 feature_is_clear            | false                
 feature_lowest_cloud_height | 5000                 
 feature_ceiling             | 10000                
 feature_has_low_ceiling     | 0                    
only showing top 1 row



weatherDFWithSkyConditions = [WBAN: string, Date: string ... 52 more fields]


[WBAN: string, Date: string ... 52 more fields]

In [41]:
import com.flightdelay.data.preprocessing.weather.VisibilityFeatures

// Derive the visibility features (the feature_visibility_* columns selected below)
// on top of the sky-condition features.
val weatherDFWithSkyAndVisibilityConditions = VisibilityFeatures.createVisibilityFeatures(weatherDFWithSkyConditions)

// Keep observations with a non-blank WeatherType and a known Visibility.
// Visibility is a double column in the raw weather schema, so a null check is enough:
// the former length(trim(...)) > 0 test only worked through an implicit cast to string
// and could never reject a non-null value.
// NOTE(review): the output recorded for this cell shows Visibility = 10.0 next to
// feature_visibility_miles = 1.0, feature_visibility_normalized = 0.1 and
// feature_is_low_visibility = 1. That looks inconsistent - verify the parsing in
// VisibilityFeatures.
// Tabular alternative: .show(20, truncate=false)
weatherDFWithSkyAndVisibilityConditions
  .filter(
    col("WeatherType").isNotNull && length(trim(col("WeatherType"))) > 0 &&
      col("Visibility").isNotNull
  )
  .select(
    "WeatherType",
    "SkyCondition",
    "Visibility",
    "feature_visibility_miles",
    "feature_visibility_km",
    "feature_visibility_risk_score",
    "feature_is_low_visibility",
    "feature_is_very_low_visibility",
    "feature_visibility_normalized",
    "feature_visibility_inverse"
  )
  .show(numRows = 1, truncate = 0, vertical = true)

weatherDFWithSkyAndVisibilityConditions = [WBAN: string, Date: string ... 60 more fields]


-RECORD 0----------------------------------------------
 WeatherType                    | -RA                  
 SkyCondition                   | SCT050 BKN100 OVC120 
 Visibility                     | 10.0                 
 feature_visibility_miles       | 1.0                  
 feature_visibility_km          | 1.609                
 feature_visibility_risk_score  | 3.0                  
 feature_is_low_visibility      | 1                    
 feature_is_very_low_visibility | 0                    
 feature_visibility_normalized  | 0.1                  
 feature_visibility_inverse     | 1.0                  
only showing top 1 row



[WBAN: string, Date: string ... 60 more fields]

In [40]:
import com.flightdelay.data.preprocessing.weather.WeatherInteractionFeatures

// Derive the interaction features (severity index, flight category and related columns
// selected below) from the sky-condition and visibility features.
val weatherDFWithWeatherInteraction = WeatherInteractionFeatures.createInteractionFeatures(weatherDFWithSkyAndVisibilityConditions)

// Keep observations with a non-blank WeatherType and a known Visibility.
// Visibility is a double column in the raw weather schema, so a null check is enough:
// the former length(trim(...)) > 0 test only worked through an implicit cast to string
// and could never reject a non-null value.
// NOTE(review): the output recorded for this cell reports feature_flight_category = IFR
// for a row with Visibility = 10.0 and SkyCondition "SCT050 BKN100 OVC120". That looks
// suspicious - confirm the upstream visibility features before relying on it.
// Tabular alternative: .show(20, truncate=false)
weatherDFWithWeatherInteraction
  .filter(
    col("WeatherType").isNotNull && length(trim(col("WeatherType"))) > 0 &&
      col("Visibility").isNotNull
  )
  .select(
    "WeatherType",
    "SkyCondition",
    "Visibility",
    "feature_weather_severity_index",
    "feature_is_vfr_conditions",
    "feature_is_ifr_conditions",
    "feature_requires_cat_ii",
    "feature_operations_risk_level",
    "feature_flight_category",
    "feature_flight_category_ordinal"
  )
  .show(numRows = 1, truncate = 0, vertical = true)

-RECORD 0-----------------------------------------------
 WeatherType                     | -RA                  
 SkyCondition                    | SCT050 BKN100 OVC120 
 Visibility                      | 10.0                 
 feature_weather_severity_index  | 3.4                  
 feature_is_vfr_conditions       | 0                    
 feature_is_ifr_conditions       | 1                    
 feature_requires_cat_ii         | 0                    
 feature_operations_risk_level   | 2                    
 feature_flight_category         | IFR                  
 feature_flight_category_ordinal | 2                    
only showing top 1 row



weatherDFWithWeatherInteraction = [WBAN: string, Date: string ... 67 more fields]


[WBAN: string, Date: string ... 67 more fields]

In [57]:
import com.flightdelay.data.preprocessing.weather.WeatherTypeFeatureGenerator

// Derive the WeatherType-based features (intensity, has_* flags, hazard and icing columns
// selected below) from the interaction-feature DataFrame.
val weatherDFWithWeatherType = WeatherTypeFeatureGenerator.createFeatures(weatherDFWithWeatherInteraction)

// Count observations per has_freezing value. The select(...) placed before the aggregation
// does not change the counts, but it makes the cell fail if any listed column is missing.
// Disabled row filter: col("WeatherType").isNotNull && length(trim(col("WeatherType"))) > 0
//   && col("Visibility").isNotNull && length(trim(col("Visibility"))) > 0 (optionally && col("has_snow") === 1)
weatherDFWithWeatherType
  .select(
    "WeatherType", "SkyCondition", "Visibility",
    "intensity_heavy", "intensity_light", "weather_intensity",
    "feature_precipitation_intensity",
    "has_thunderstorm", "has_freezing_precip", "has_freezing",
    "has_precipitation", "has_obscuration", "has_visible_moisture",
    "has_hazardous", "has_rain", "has_snow", "has_hail",
    "extracted_codes", "weather_hazard_level",
    "Icing_Risk_Flag", "Icing_Risk_Level"
  )
  .groupBy("has_freezing")
  .count()
  .show()
// Row-level alternatives to the aggregation above:
//   .show(20, truncate=false)
//   .show(10, 0, true)

+------------+--------+
|has_freezing|   count|
+------------+--------+
|           1|   33342|
|           0|32597970|
+------------+--------+



weatherDFWithWeatherType = [WBAN: string, Date: string ... 85 more fields]


[WBAN: string, Date: string ... 85 more fields]

# Wind Features

In [1]:
// Wind-related columns of the raw weather schema:
//   WindSpeed: double (nullable = true)
//   WindSpeedFlag: string (nullable = true)
//   WindDirection: double (nullable = true)
//   WindDirectionFlag: string (nullable = true)
//   ValueForWindCharacter: string (nullable = true)
//   ValueForWindCharacterFlag: string (nullable = true)

// Preview the first 20 rows of the three wind value columns.
weatherDF
  .select("WindSpeed", "WindDirection", "ValueForWindCharacter")
  .show(numRows = 20)

+---------+-------------+---------------------+
|WindSpeed|WindDirection|ValueForWindCharacter|
+---------+-------------+---------------------+
|      0.0|          0.0|                     |
|      0.0|          0.0|                     |
|      5.0|        120.0|                     |
|      5.0|        130.0|                     |
|      5.0|        130.0|                     |
|      5.0|        110.0|                     |
|      6.0|        130.0|                     |
|      0.0|          0.0|                     |
|      3.0|         80.0|                     |
|      0.0|          0.0|                     |
|      0.0|          0.0|                     |
|      3.0|         80.0|                     |
|      0.0|          0.0|                     |
|      0.0|          0.0|                     |
|      0.0|          0.0|                     |
|      3.0|        110.0|                     |
|      0.0|          0.0|                     |
|      3.0|         80.0|               

In [3]:
import org.apache.spark.sql.functions._

// Frequency table of ValueForWindCharacter, most frequent value first; prints up to 200 rows.
weatherDF
  .groupBy("ValueForWindCharacter")
  .agg(count("*").as("frequency"))
  .sort(col("frequency").desc)
  .show(numRows = 200)

+---------------------+---------+
|ValueForWindCharacter|frequency|
+---------------------+---------+
|                     | 29710817|
|                   18|   263563|
|                   20|   255900|
|                   17|   252372|
|                   21|   250030|
|                   22|   233021|
|                   23|   221298|
|                   16|   207170|
|                   24|   196087|
|                   25|   175156|
|                   26|   150348|
|                   28|   126183|
|                   29|   106303|
|                   30|    87572|
|                   31|    72264|
|                   32|    59547|
|                   33|    48866|
|                   34|    40003|
|                   36|    31415|
|                   37|    25123|
|                   38|    19791|
|                   39|    15746|
|                   40|    13226|
|                   41|    10365|
|                   43|     8293|
|                   44|     6784|
|             