# Weather Dataset Preprocessing and feature Engineering

In [64]:
import org.apache.spark.sql.SparkSession

//Env Configuration
val args: Array[String] = Array("local")

val spark = SparkSession.builder()
  .config(sc.getConf)
  .getOrCreate()

// Rendre la session Spark implicite
implicit val session = spark

// Cellule 4: Test
val weatherFilePath = "../../../../data/FLIGHT-3Y/Weather/201201hourly.txt"

val weatherDF = spark.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load(weatherFilePath)
            .persist()


args = Array(local)
spark = org.apache.spark.sql.SparkSession@1f045e06
session = org.apache.spark.sql.SparkSession@1f045e06
weatherFilePath = ../../../../data/FLIGHT-3Y/Weather/201201hourly.txt
weatherDF = [WBAN: int, Date: int ... 42 more fields]


[WBAN: int, Date: int ... 42 more fields]

In [65]:
weatherDF.count()

4192912

In [43]:
weatherDF.printSchema

root
 |-- WBAN: integer (nullable = true)
 |-- Date: integer (nullable = true)
 |-- Time: integer (nullable = true)
 |-- StationType: integer (nullable = true)
 |-- SkyCondition: string (nullable = true)
 |-- SkyConditionFlag: string (nullable = true)
 |-- Visibility: string (nullable = true)
 |-- VisibilityFlag: string (nullable = true)
 |-- WeatherType: string (nullable = true)
 |-- WeatherTypeFlag: string (nullable = true)
 |-- DryBulbFarenheit: string (nullable = true)
 |-- DryBulbFarenheitFlag: string (nullable = true)
 |-- DryBulbCelsius: string (nullable = true)
 |-- DryBulbCelsiusFlag: string (nullable = true)
 |-- WetBulbFarenheit: string (nullable = true)
 |-- WetBulbFarenheitFlag: string (nullable = true)
 |-- WetBulbCelsius: string (nullable = true)
 |-- WetBulbCelsiusFlag: string (nullable = true)
 |-- DewPointFarenheit: string (nullable = true)
 |-- DewPointFarenheitFlag: string (nullable = true)
 |-- DewPointCelsius: string (nullable = true)
 |-- DewPointCelsiusFlag: str

In [44]:
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.ml.feature.{StringIndexer, OneHotEncoder, VectorAssembler}
import scala.util.Try

## 1. WANDB, Date, Time, StationType

In [45]:
weatherDF.select("WBAN", "Date", "Time", "StationType").show(5)

+----+--------+----+-----------+
|WBAN|    Date|Time|StationType|
+----+--------+----+-----------+
|3011|20120101|  15|          0|
|3011|20120101|  35|          0|
|3011|20120101|  55|          0|
|3011|20120101| 115|          0|
|3011|20120101| 135|          0|
+----+--------+----+-----------+
only showing top 5 rows



## 2.SkyConditionFeatures

In [46]:
weatherDF.select("SkyCondition", "SkyConditionFLag").show(5)


+------------+----------------+
|SkyCondition|SkyConditionFLag|
+------------+----------------+
|         CLR|                |
|         CLR|                |
|         CLR|                |
|         CLR|                |
|         CLR|                |
+------------+----------------+
only showing top 5 rows



In [47]:
weatherDF.select("SkyConditionFLag").distinct().show()

+----------------+
|SkyConditionFLag|
+----------------+
|                |
|               s|
+----------------+



In [48]:
weatherDF.select("SkyCondition").distinct().show(20)

+--------------------+
|        SkyCondition|
+--------------------+
|              BKN120|
|              BKN050|
|BKN013 BKN023 OVC030|
|       FEW090 SCT120|
|BKN025 BKN031 OVC037|
|FEW014 BKN023 OVC110|
|SCT028 BKN047 OVC070|
|       FEW040 BKN120|
|       FEW022 OVC030|
|BKN001 BKN006 OVC015|
|       SCT010 OVC020|
|SCT005 BKN013 BKN019|
|FEW005 SCT008 BKN038|
|       SCT008 OVC023|
|SCT060 BKN070 OVC080|
|SCT026 BKN046 OVC055|
|SCT036 BKN060 OVC080|
|BKN024 BKN031 OVC039|
|       BKN008 OVC013|
|       FEW075 BKN085|
+--------------------+
only showing top 20 rows



In [49]:
    // Créer une vue temporaire
    weatherDF.createOrReplaceTempView("weather")
    
    // Requête SQL pour extraire les trigrammes
    val trigrammesSQL = spark.sql("""
      SELECT DISTINCT 
        SUBSTRING(code, 1, 3) as trigramme
      FROM (
        SELECT 
          EXPLODE(SPLIT(SkyCondition, ' ')) as code
        FROM weather
        WHERE SkyCondition IS NOT NULL
      )
      WHERE LENGTH(code) >= 3
      ORDER BY trigramme
    """)
    
    println("TRIGRAMMES DISTINCTS:")
    trigrammesSQL.show(100, false)

TRIGRAMMES DISTINCTS:
+---------+
|trigramme|
+---------+
|BKN      |
|CLR      |
|FEW      |
|OVC      |
|SCT      |
|VV0      |
+---------+



trigrammesSQL = [trigramme: string]


[trigramme: string]

In [57]:
object SkyConditionFeatures {
  
  /**
   * Extrait le trigramme le plus critique d'une observation SkyCondition
   * Ordre de priorité : VV > OVC > BKN > SCT > FEW > CLR
   */
  val getMostCriticalSky = udf((skyCondition: String) => {
    if (skyCondition == null || skyCondition.trim.isEmpty) {
      "UNKNOWN"
    } else {
      val codes = skyCondition.split(" ").map(_.trim).filter(_.nonEmpty)
      
      // Priorité du pire au meilleur
      if (codes.exists(_.startsWith("VV"))) "VV"
      else if (codes.exists(_.startsWith("OVC"))) "OVC"
      else if (codes.exists(_.startsWith("BKN"))) "BKN"
      else if (codes.exists(_.startsWith("SCT"))) "SCT"
      else if (codes.exists(_.startsWith("FEW"))) "FEW"
      else if (codes.exists(_.startsWith("CLR")) || codes.exists(_.startsWith("SKC"))) "CLR"
      else "UNKNOWN"
    }
  })
  
  /**
   * Extrait l'altitude de la couche nuageuse la plus basse (en pieds)
   */
  val getLowestCloudHeight = udf((skyCondition: String) => {
    if (skyCondition == null || skyCondition.trim.isEmpty) {
      99999 // Valeur sentinelle pour pas de nuages
    } else {
      val codes = skyCondition.split(" ").filter(_.length > 3)
      
      val heights = codes.flatMap { code =>
        Try(code.substring(3).toInt * 100).toOption
      }
      
      if (heights.nonEmpty) heights.min else 99999
    }
  })
  
  /**
   * Calcule l'altitude du plafond (couche BKN ou OVC la plus basse)
   */
  val getCeiling = udf((skyCondition: String) => {
    if (skyCondition == null || skyCondition.trim.isEmpty) {
      99999
    } else {
      val codes = skyCondition.split(" ").filter(_.nonEmpty)
      
      val ceilingCodes = codes.filter(c => 
        c.startsWith("BKN") || c.startsWith("OVC") || c.startsWith("VV")
      )
      
      val ceilings = ceilingCodes.flatMap { code =>
        if (code.startsWith("VV")) Some(0) // Visibilité verticale = plafond 0
        else if (code.length > 3) Try(code.substring(3).toInt * 100).toOption
        else None
      }
      
      if (ceilings.nonEmpty) ceilings.min else 99999
    }
  })
  
  /**
   * Compte le nombre de couches nuageuses
   */
  val countCloudLayers = udf((skyCondition: String) => {
    if (skyCondition == null || skyCondition.trim.isEmpty) {
      0
    } else {
      skyCondition.split(" ").filter(_.nonEmpty).length
    }
  })
  
  /**
   * Calcule un score de risque basé sur la couverture nuageuse (0-5)
   */
  val calculateCloudRiskScore = udf((skyCondition: String) => {
    if (skyCondition == null || skyCondition.trim.isEmpty) {
      1.0 // Risque faible par défaut
    } else {
      val codes = skyCondition.split(" ")
      
      if (codes.exists(_.startsWith("VV"))) 5.0      // Obscuration totale
      else if (codes.exists(_.startsWith("OVC"))) 4.0 // Ciel couvert
      else if (codes.exists(_.startsWith("BKN"))) 3.0 // Fragmenté
      else if (codes.exists(_.startsWith("SCT"))) 2.0 // Épars
      else if (codes.exists(_.startsWith("FEW"))) 1.0 // Quelques nuages
      else if (codes.exists(_.startsWith("CLR"))) 0.0 // Clair
      else 2.0 // Inconnu = risque moyen
    }
  })
  
  /**
   * Détecte si le plafond est bas (< 1000 pieds)
   */
  val hasLowCeiling = udf((ceiling: Int) => {
    ceiling < 1000
  })
  
  /**
   * Applique toutes les transformations pour SkyCondition
   */
  def createSkyConditionFeatures(df: DataFrame): DataFrame = {
    df.withColumn("most_critical_sky", getMostCriticalSky(col("SkyCondition")))
      .withColumn("lowest_cloud_height", getLowestCloudHeight(col("SkyCondition")))
      .withColumn("ceiling", getCeiling(col("SkyCondition")))
      .withColumn("num_cloud_layers", countCloudLayers(col("SkyCondition")))
      .withColumn("cloud_risk_score", calculateCloudRiskScore(col("SkyCondition")))
      .withColumn("has_overcast", col("SkyCondition").contains("OVC"))
      .withColumn("has_broken", col("SkyCondition").contains("BKN"))
      .withColumn("has_obscured", col("SkyCondition").contains("VV"))
      .withColumn("is_clear", col("SkyCondition").contains("CLR") || col("SkyCondition").contains("SKC"))
      .withColumn("has_low_ceiling", hasLowCeiling(col("ceiling")))
  }
}

object WeatherFeaturePipeline {
  
  import org.apache.spark.sql.functions._
  import org.apache.spark.sql.types._
  
  /**
   * Applique toutes les transformations de features météo
   */
  def applyWeatherFeatures(df: DataFrame): DataFrame = {
    df.transform(SkyConditionFeatures.createSkyConditionFeatures)
  }
}

var weatherWithFeatureDF = WeatherFeaturePipeline.applyWeatherFeatures(weatherDF)

weatherWithFeatureDF.select(
    "SkyCondition", 
    "most_critical_sky",
    "lowest_cloud_height",
    "ceiling",
    "num_cloud_layers",
    "cloud_risk_score",
    "has_overcast",
    "has_broken",
    "has_obscured",
    "is_clear",
    "has_low_ceiling").show(20)

defined object SkyConditionFeatures
defined object WeatherFeaturePipeline
weatherWithFeatureDF = [WBAN: int, Date: int ... 52 more fields]


+------------+-----------------+-------------------+-------+----------------+----------------+------------+----------+------------+--------+---------------+
|SkyCondition|most_critical_sky|lowest_cloud_height|ceiling|num_cloud_layers|cloud_risk_score|has_overcast|has_broken|has_obscured|is_clear|has_low_ceiling|
+------------+-----------------+-------------------+-------+----------------+----------------+------------+----------+------------+--------+---------------+
|         CLR|              CLR|              99999|  99999|               1|             0.0|       false|     false|       false|    true|          false|
|         CLR|              CLR|              99999|  99999|               1|             0.0|       false|     false|       false|    true|          false|
|         CLR|              CLR|              99999|  99999|               1|             0.0|       false|     false|       false|    true|          false|
|         CLR|              CLR|              99999|  9999

[WBAN: int, Date: int ... 52 more fields]

## 3.VisibilityFeatures

In [27]:
weatherDF.select("Visibility", "VisibilityFLag").show(5)

+----------+--------------+
|Visibility|VisibilityFLag|
+----------+--------------+
|     10.00|              |
|     10.00|              |
|     10.00|              |
|     10.00|              |
|     10.00|              |
+----------+--------------+
only showing top 5 rows



In [28]:
weatherDF.select("VisibilityFLag").distinct().show()

+--------------+
|VisibilityFLag|
+--------------+
|              |
|             s|
+--------------+



In [32]:
import org.apache.spark.sql.functions.col

weatherDF.select("Visibility")
  .distinct()
  .orderBy(col("Visibility").desc)
  .show()

weatherDF.select("Visibility")
  .distinct()
  .orderBy(col("Visibility").asc)
  .show()

+----------+
|Visibility|
+----------+
|         M|
|     90.00|
|     85.00|
|     80.00|
|     70.00|
|     67.00|
|     60.00|
|     50.00|
|     45.00|
|     40.00|
|     35.00|
|     32.00|
|     30.00|
|     25.00|
|     20.00|
|     19.00|
|     18.00|
|     17.00|
|     15.00|
|     14.00|
+----------+
only showing top 20 rows

+----------+
|Visibility|
+----------+
|      0.00|
|      0.12|
|      0.25|
|      0.38|
|      0.50|
|      0.63|
|      0.75|
|      0.87|
|      1.00|
|      1.25|
|      1.50|
|      1.75|
|      2.00|
|      2.50|
|      3.00|
|      4.00|
|      5.00|
|      6.00|
|      7.00|
|      8.00|
+----------+
only showing top 20 rows



In [59]:
object VisibilityFeatures {
  
  import org.apache.spark.sql.functions._
  import scala.util.Try
  
  /**
   * Nettoie et convertit la visibilité en miles
   * Hypothèse : les valeurs sont en dixièmes de miles
   */
  val cleanVisibility = udf((visibility: String) => {
    if (visibility == null || visibility.trim.isEmpty || visibility == "M") {
      10.0 // Valeur par défaut pour données manquantes (bonne visibilité)
    } else {
      Try(visibility.toDouble).toOption match {
        case Some(v) => {
          val miles = v / 10.0 // Conversion depuis dixièmes de miles
          math.min(miles, 10.0) // Plafonner à 10 miles max
        }
        case None => 10.0
      }
    }
  })
  
  /**
   * Catégorise la visibilité selon les standards aviation
   */
  val categorizeVisibility = udf((visibilityMiles: Double) => {
    visibilityMiles match {
      case v if v < 0.5  => "LIFR"      // Low IFR
      case v if v < 1.0  => "IFR_LOW"   // IFR bas
      case v if v < 3.0  => "IFR"       // Instrument Flight Rules
      case v if v < 5.0  => "MVFR"      // Marginal VFR
      case v if v < 10.0 => "VFR"       // Visual Flight Rules
      case _             => "VFR_HIGH"  // > 10 miles, excellente
    }
  })
  
  /**
   * Calcule un score de risque basé sur la visibilité (0-5)
   */
  val calculateVisibilityRiskScore = udf((visibilityMiles: Double) => {
    visibilityMiles match {
      case v if v < 0.5  => 5.0  // Très dangereux
      case v if v < 1.0  => 4.0  // Dangereux
      case v if v < 3.0  => 3.0  // Modéré (IFR)
      case v if v < 5.0  => 2.0  // Léger (MVFR)
      case v if v < 10.0 => 1.0  // Faible (VFR)
      case _             => 0.0  // Aucun risque
    }
  })
  
  /**
   * Détecte les conditions de faible visibilité (< 3 miles)
   */
  val isLowVisibility = udf((visibilityMiles: Double) => {
    visibilityMiles < 3.0
  })
  
  /**
   * Détecte les conditions de très faible visibilité (< 1 mile)
   */
  val isVeryLowVisibility = udf((visibilityMiles: Double) => {
    visibilityMiles < 1.0
  })
  
  /**
   * Applique toutes les transformations pour Visibility
   */
  def createVisibilityFeatures(df: DataFrame): DataFrame = {
    import org.apache.spark.sql.functions._
    
    df.withColumn("visibility_miles", cleanVisibility(col("Visibility")))
      .withColumn("visibility_km", col("visibility_miles") * 1.609)
      .withColumn("visibility_category", categorizeVisibility(col("visibility_miles")))
      .withColumn("visibility_risk_score", calculateVisibilityRiskScore(col("visibility_miles")))
      .withColumn("is_low_visibility", isLowVisibility(col("visibility_miles")))
      .withColumn("is_very_low_visibility", isVeryLowVisibility(col("visibility_miles")))
      .withColumn("visibility_normalized", 
        when(col("visibility_miles") > 0, col("visibility_miles") / 10.0)
          .otherwise(0.0))
      .withColumn("visibility_inverse", 
        when(col("visibility_miles") > 0, lit(1.0) / col("visibility_miles"))
          .otherwise(10.0))
  }
}

weatherWithFeatureDF = VisibilityFeatures.createVisibilityFeatures(weatherWithFeatureDF)

weatherWithFeatureDF.select(
    "Visibility",
    "visibility_miles", 
    "visibility_km",
    "visibility_category",
    "visibility_risk_score",
    "is_very_low_visibility",
    "visibility_normalized",
    "visibility_inverse").show(20)

defined object VisibilityFeatures
weatherWithFeatureDF = [WBAN: int, Date: int ... 60 more fields]


+----------+----------------+-------------+-------------------+---------------------+----------------------+---------------------+------------------+
|Visibility|visibility_miles|visibility_km|visibility_category|visibility_risk_score|is_very_low_visibility|visibility_normalized|visibility_inverse|
+----------+----------------+-------------+-------------------+---------------------+----------------------+---------------------+------------------+
|     10.00|             1.0|        1.609|                IFR|                  3.0|                 false|                  0.1|               1.0|
|     10.00|             1.0|        1.609|                IFR|                  3.0|                 false|                  0.1|               1.0|
|     10.00|             1.0|        1.609|                IFR|                  3.0|                 false|                  0.1|               1.0|
|     10.00|             1.0|        1.609|                IFR|                  3.0|               

[WBAN: int, Date: int ... 60 more fields]

## 4. WeatherInteractionFeatures

In [60]:
object WeatherInteractionFeatures {
  
  import org.apache.spark.sql.functions._
  
  /**
   * Calcule un indice de sévérité météorologique combiné
   * Prend en compte à la fois la couverture nuageuse et la visibilité
   */
  val calculateWeatherSeverityIndex = udf(
    (cloudRisk: Double, visibilityRisk: Double, ceiling: Int, visibility: Double) => {
      // Combinaison pondérée
      val baseScore = (cloudRisk * 0.4) + (visibilityRisk * 0.6)
      
      // Pénalité pour plafond très bas
      val ceilingPenalty = if (ceiling < 500) 2.0 
                          else if (ceiling < 1000) 1.0 
                          else 0.0
      
      // Pénalité pour visibilité très basse
      val visibilityPenalty = if (visibility < 0.5) 2.0 
                             else if (visibility < 1.0) 1.0 
                             else 0.0
      
      math.min(baseScore + ceilingPenalty + visibilityPenalty, 10.0)
    }
  )
  
  /**
   * Détermine si les conditions sont VFR (Visual Flight Rules)
   */
  val isVFRConditions = udf((visibility: Double, ceiling: Int) => {
    visibility >= 5.0 && ceiling >= 3000
  })
  
  /**
   * Détermine si les conditions sont IFR (Instrument Flight Rules)
   */
  val isIFRConditions = udf((visibility: Double, ceiling: Int) => {
    visibility < 3.0 || ceiling < 1000
  })
  
  /**
   * Détermine si CAT II/III est requis
   */
  val requiresCATII = udf((visibility: Double, ceiling: Int) => {
    visibility < 1.0 || ceiling < 200
  })
  
  /**
   * Calcule le niveau de risque opérationnel (0-4)
   * 0=None, 1=Low, 2=Moderate, 3=High, 4=Critical
   */
  val calculateOperationsRiskLevel = udf(
    (visibility: Double, ceiling: Int, hasObscured: Boolean) => {
      if (hasObscured || visibility < 0.25 || ceiling < 100) 4 // Critical
      else if (visibility < 0.5 || ceiling < 200) 3 // High
      else if (visibility < 1.0 || ceiling < 500) 3 // High
      else if (visibility < 3.0 || ceiling < 1000) 2 // Moderate
      else if (visibility < 5.0 || ceiling < 3000) 1 // Low
      else 0 // None
    }
  )
  
  /**
   * Applique toutes les features d'interaction
   */
  def createInteractionFeatures(df: DataFrame): DataFrame = {
    import org.apache.spark.sql.functions._
    
    df.withColumn("weather_severity_index", 
        calculateWeatherSeverityIndex(
          col("cloud_risk_score"),
          col("visibility_risk_score"),
          col("ceiling"),
          col("visibility_miles")
        ))
      .withColumn("is_vfr_conditions", 
        isVFRConditions(col("visibility_miles"), col("ceiling")))
      .withColumn("is_ifr_conditions", 
        isIFRConditions(col("visibility_miles"), col("ceiling")))
      .withColumn("requires_cat_ii", 
        requiresCATII(col("visibility_miles"), col("ceiling")))
      .withColumn("operations_risk_level", 
        calculateOperationsRiskLevel(
          col("visibility_miles"),
          col("ceiling"),
          col("has_obscured")
        ))
  }
}

weatherWithFeatureDF = WeatherInteractionFeatures.createInteractionFeatures(weatherWithFeatureDF)

weatherWithFeatureDF.select(
    "SkyCondition",
    "Visibility",
    "weather_severity_index", 
    "is_vfr_conditions",
    "is_ifr_conditions",
    "requires_cat_ii",
    "visibility_miles",
    "operations_risk_level").show(20)

defined object WeatherInteractionFeatures
weatherWithFeatureDF = [WBAN: int, Date: int ... 65 more fields]


+------------+----------+----------------------+-----------------+-----------------+---------------+----------------+---------------------+
|SkyCondition|Visibility|weather_severity_index|is_vfr_conditions|is_ifr_conditions|requires_cat_ii|visibility_miles|operations_risk_level|
+------------+----------+----------------------+-----------------+-----------------+---------------+----------------+---------------------+
|         CLR|     10.00|    1.7999999999999998|            false|             true|          false|             1.0|                    2|
|         CLR|     10.00|    1.7999999999999998|            false|             true|          false|             1.0|                    2|
|         CLR|     10.00|    1.7999999999999998|            false|             true|          false|             1.0|                    2|
|         CLR|     10.00|    1.7999999999999998|            false|             true|          false|             1.0|                    2|
|         CLR|     1

[WBAN: int, Date: int ... 65 more fields]

# FlightCategoryFeature - Proposé par Naveed

In [62]:
object FlightCategoryFeatures {
  
  import org.apache.spark.sql.functions._
  
  /**
   * Calcule la catégorie de vol basée sur la visibilité et le plafond
   * Selon les règles FAA pour les opérations aériennes
   * 
   * @return String: "L-IFR", "IFR", "MVFR", "VFR"
   */
  val calculateFlightCategory = udf((visibilityMiles: Double, ceiling: Int) => {
    // L-IFR (Low Instrument Flight Rules) - Highest Severity
    // Visibility < 1 mile OR ceiling < 500 feet
    if (visibilityMiles < 1.0 || ceiling < 500) {
      "L-IFR"
    }
    // IFR (Instrument Flight Rules) - High Severity
    // 1 <= Visibility < 3 miles OR 500 <= ceiling < 1,000 feet
    else if ((visibilityMiles >= 1.0 && visibilityMiles < 3.0) || 
             (ceiling >= 500 && ceiling < 1000)) {
      "IFR"
    }
    // MVFR (Marginal Visual Flight Rules) - Moderate Severity
    // 3 <= Visibility <= 5 miles OR 1,000 <= ceiling <= 3,000 feet
    else if ((visibilityMiles >= 3.0 && visibilityMiles <= 5.0) || 
             (ceiling >= 1000 && ceiling <= 3000)) {
      "MVFR"
    }
    // VFR (Visual Flight Rules) - Low Severity
    // Visibility > 5 miles AND ceiling > 3,000 feet
    else {
      "VFR"
    }
  })
  
  /**
   * Calcule l'index ordinal de la catégorie de vol
   * Plus le score est élevé, plus les conditions sont sévères
   * 
   * @return Integer: 3 (L-IFR), 2 (IFR), 1 (MVFR), 0 (VFR)
   */
  val calculateFlightCategoryIndex = udf((visibilityMiles: Double, ceiling: Int) => {
    if (visibilityMiles < 1.0 || ceiling < 500) {
      3  // L-IFR - Highest Severity
    }
    else if ((visibilityMiles >= 1.0 && visibilityMiles < 3.0) || 
             (ceiling >= 500 && ceiling < 1000)) {
      2  // IFR - High Severity
    }
    else if ((visibilityMiles >= 3.0 && visibilityMiles <= 5.0) || 
             (ceiling >= 1000 && ceiling <= 3000)) {
      1  // MVFR - Moderate Severity
    }
    else {
      0  // VFR - Low Severity
    }
  })
  
  /**
   * Version alternative qui prend la catégorie en String et retourne l'index
   */
  val flightCategoryToIndex = udf((category: String) => {
    category match {
      case "L-IFR" => 3
      case "IFR"   => 2
      case "MVFR"  => 1
      case "VFR"   => 0
      case _       => 1  // Default: Moderate severity
    }
  })
  
  /**
   * Crée des features booléennes pour chaque catégorie
   */
  val isLIFR = udf((category: String) => category == "L-IFR")
  val isIFR = udf((category: String) => category == "IFR")
  val isMVFR = udf((category: String) => category == "MVFR")
  val isVFR = udf((category: String) => category == "VFR")
  
  /**
   * Détermine si les conditions nécessitent des procédures instrumentales
   * (L-IFR ou IFR)
   */
  val requiresInstrumentProcedures = udf((categoryIndex: Int) => {
    categoryIndex >= 2  // L-IFR (3) ou IFR (2)
  })
  
  /**
   * Détermine la sévérité en texte
   */
  val getSeverityLevel = udf((categoryIndex: Int) => {
    categoryIndex match {
      case 3 => "HIGHEST"   // L-IFR
      case 2 => "HIGH"      // IFR
      case 1 => "MODERATE"  // MVFR
      case 0 => "LOW"       // VFR
      case _ => "UNKNOWN"
    }
  })
  
  /**
   * Applique toutes les transformations pour Flight Category
   * Nécessite que visibility_miles et ceiling soient déjà calculés
   */
  def createFlightCategoryFeatures(df: DataFrame): DataFrame = {
    import org.apache.spark.sql.functions._
    
    df.withColumn("flight_category", 
        calculateFlightCategory(col("visibility_miles"), col("ceiling")))
      .withColumn("flight_category_index", 
        calculateFlightCategoryIndex(col("visibility_miles"), col("ceiling")))
      .withColumn("severity_level", 
        getSeverityLevel(col("flight_category_index")))
      .withColumn("is_lifr", isLIFR(col("flight_category")))
      .withColumn("is_ifr", isIFR(col("flight_category")))
      .withColumn("is_mvfr", isMVFR(col("flight_category")))
      .withColumn("is_vfr", isVFR(col("flight_category")))
      .withColumn("requires_instrument_procedures", 
        requiresInstrumentProcedures(col("flight_category_index")))
  }
}

weatherWithFeatureDF = FlightCategoryFeatures.createFlightCategoryFeatures(weatherWithFeatureDF)

weatherWithFeatureDF.select(
    "SkyCondition",
    "Visibility",
    "flight_category", 
    "flight_category_index",
    "is_lifr",
    "is_ifr",
    "is_mvfr",
    "is_vfr",
    "requires_instrument_procedures").show(1000)


+--------------------+----------+---------------+---------------------+-------+------+-------+------+------------------------------+
|        SkyCondition|Visibility|flight_category|flight_category_index|is_lifr|is_ifr|is_mvfr|is_vfr|requires_instrument_procedures|
+--------------------+----------+---------------+---------------------+-------+------+-------+------+------------------------------+
|                 CLR|     10.00|            IFR|                    2|  false|  true|  false| false|                          true|
|                 CLR|     10.00|            IFR|                    2|  false|  true|  false| false|                          true|
|                 CLR|     10.00|            IFR|                    2|  false|  true|  false| false|                          true|
|                 CLR|     10.00|            IFR|                    2|  false|  true|  false| false|                          true|
|                 CLR|     10.00|            IFR|                    

defined object FlightCategoryFeatures
weatherWithFeatureDF = [WBAN: int, Date: int ... 73 more fields]


[WBAN: int, Date: int ... 73 more fields]