# Extracting Airport Data

Libraries Necessary to Extract Airport information

In [120]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql import Row
from datetime import datetime
from pyspark.sql.functions import coalesce
from pyspark.sql.types import (
    StructType, StructField, StringType, LongType, DoubleType, BooleanType, ArrayType
)
import os
from dotenv import load_dotenv
load_dotenv()
PYTHONDIRECTORY = os.getenv("PYTHONDIRECTORY")
os.environ["PYSPARK_PYTHON"] = PYTHONDIRECTORY

## Building Session to create dataframe

In [121]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("AirportData") \
    .config("spark.pyspark.python", PYTHONDIRECTORY) \
    .getOrCreate()

In [122]:
df = spark.read.csv("../files/airports.csv", header=True, inferSchema=True)
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


## Sample of Dataframe

In [123]:
df.show()

+------+-----+-------------+--------------------+------------------+-------------------+------------+---------+-----------+----------+------------+-----------------+---------+---------+--------+----------+--------------------+--------------------+-----------+
|    id|ident|         type|                name|      latitude_deg|      longitude_deg|elevation_ft|continent|iso_country|iso_region|municipality|scheduled_service|icao_code|iata_code|gps_code|local_code|           home_link|      wikipedia_link|   keywords|
+------+-----+-------------+--------------------+------------------+-------------------+------------+---------+-----------+----------+------------+-----------------+---------+---------+--------+----------+--------------------+--------------------+-----------+
|  6523|  00A|     heliport|   Total RF Heliport|         40.070985|         -74.933689|          11|       NA|         US|     US-PA|    Bensalem|               no|     NULL|     NULL|    K00A|       00A|https://www.pen

# Extracting and Parsing Airport Weather Data

This is ultimately the process that will be used to extract updated weather data within databricks for joining relevant weather data with airports.

## Libraries Necessary to extract weather data

In [124]:
import ftplib
import io
import re

## Function to download weather data.

In [125]:
def download_metar_file(hour_utc="00"):
    filename = f"{hour_utc}Z.TXT"
    host = "tgftp.nws.noaa.gov"
    dir_path = "/data/observations/metar/cycles/"

    # Connect to FTP
    ftp = ftplib.FTP(host)
    ftp.login()
    ftp.cwd(dir_path)

    # Download file content
    r = io.BytesIO()
    ftp.retrbinary(f"RETR {filename}", r.write)
    ftp.quit()
    
    # Convert bytes to string
    content = r.getvalue().decode("utf-8", errors="ignore")
    return content

## Download Weather Data

In [126]:
# Example: pull METARs from 00Z
metar_data = download_metar_file("00")
print(metar_data[:1000])  # preview first 1000 chars

2025/03/22 23:45
KGUL 222345Z AUTO 14009KT 6SM HZ FEW030 23/18 A3000 RMK A01

2025/03/22 23:50
KIKT 222350Z AUTO 14007KT 10SM CLR 19/00 A3013 RMK A01

2025/03/22 23:50
KANR 222350Z AUTO 15020KT 10SM FEW028 BKN038 OVC047 23/16 A3016 RMK A01

2025/03/22 23:50
KXER 222350Z AUTO 13015KT 9SM 22/17 A3008 RMK A01

2025/03/22 23:45
KGUL 222345Z AUTO 14009KT 6SM HZ FEW030 23/18 A3000 RMK A01

2025/03/22 23:50
KANR 222350Z AUTO 15020KT 10SM FEW028 BKN038 OVC047 23/16 A3016 RMK A01

2025/03/22 23:50
KIKT 222350Z AUTO 14007KT 10SM CLR 19/00 A3013 RMK A01

2025/03/22 23:50
KXER 222350Z AUTO 13015KT 9SM 22/17 A3008 RMK A01

2025/03/22 23:45
KGUL 222345Z AUTO 14009KT 6SM HZ FEW030 23/18 A3000 RMK A01

2025/03/22 23:50
KIKT 222350Z AUTO 14007KT 10SM CLR 19/00 A3013 RMK A01

2025/03/22 23:55
KBFR 222355Z AUTO 00000KT 10SM CLR 10/M10 A3007 RMK AO2

2025/03/22 23:47
KEVB 222347Z 16005KT 10SM CLR 18/09 A3011 

2025/03/22 23:55
KBFR 222355Z AUTO 00000KT 10SM CLR 10/M10 A3007 RMK AO2

2025/03/22 23:47
KEVB 

## Parse Weather Data

In [127]:
def parse_metar_entries(raw_data):
    entries = raw_data.strip().split("\n\n")  # Split by METAR blocks
    parsed = []

    for entry in entries:
        lines = entry.strip().split("\n")
        if len(lines) < 2:
            continue  # Skip bad entries
        try:
            timestamp_str = lines[0]
            metar_line = lines[1]
            icao = metar_line.split()[0]
            timestamp = datetime.strptime(timestamp_str, "%Y/%m/%d %H:%M")
            parsed.append(Row(icao_code=icao, metar=metar_line, timestamp=timestamp))
        except Exception as e:
            print(f"Skipping entry due to error: {e}")
            continue
    
    return parsed

In [128]:
# Parse the entries into Spark-friendly rows
parsed_rows = parse_metar_entries(metar_data)

In [129]:
# Create DataFrame
weather_df = spark.createDataFrame(parsed_rows)
weather_df.show(truncate=False)

+---------+-----------------------------------------------------------------------------+-------------------+
|icao_code|metar                                                                        |timestamp          |
+---------+-----------------------------------------------------------------------------+-------------------+
|KGUL     |KGUL 222345Z AUTO 14009KT 6SM HZ FEW030 23/18 A3000 RMK A01                  |2025-03-22 23:45:00|
|KIKT     |KIKT 222350Z AUTO 14007KT 10SM CLR 19/00 A3013 RMK A01                       |2025-03-22 23:50:00|
|KANR     |KANR 222350Z AUTO 15020KT 10SM FEW028 BKN038 OVC047 23/16 A3016 RMK A01      |2025-03-22 23:50:00|
|KXER     |KXER 222350Z AUTO 13015KT 9SM 22/17 A3008 RMK A01                            |2025-03-22 23:50:00|
|KGUL     |KGUL 222345Z AUTO 14009KT 6SM HZ FEW030 23/18 A3000 RMK A01                  |2025-03-22 23:45:00|
|KANR     |KANR 222350Z AUTO 15020KT 10SM FEW028 BKN038 OVC047 23/16 A3016 RMK A01      |2025-03-22 23:50:00|
|KIKT     

## Parse Metar Data

In [130]:
def safe_parse_metar(metar_row):
    try:
        metar = metar_row.metar
        icao = metar_row.icao_code
        timestamp = metar_row.timestamp

        # REGEX patterns
        wind_pattern = r"(\d{3}|VRB)(\d{2})KT"
        visibility_pattern = r"(\d{1,2})SM"
        temp_dew_pattern = r"(M?\d{2})/(M?\d{2})"
        altimeter_pattern = r"A(\d{4})"

        # Match and parse
        wind_match = re.search(wind_pattern, metar)
        vis_match = re.search(visibility_pattern, metar)
        temp_match = re.search(temp_dew_pattern, metar)
        alt_match = re.search(altimeter_pattern, metar)

        def parse_temp(val):
            return -int(val[1:]) if val.startswith("M") else int(val)

        wind_dir = int(wind_match.group(1).replace("VRB", "0")) if wind_match else None
        wind_speed = int(wind_match.group(2)) if wind_match else None
        wind_speed_mph = round(wind_speed * 1.15078, 2) if wind_speed is not None else None

        visibility = int(vis_match.group(1)) if vis_match else None
        temperature = parse_temp(temp_match.group(1)) if temp_match else None
        dewpoint = parse_temp(temp_match.group(2)) if temp_match else None
        altimeter = float(alt_match.group(1)) / 100 if alt_match else None

        return Row(
            icao_code=icao,
            timestamp=timestamp,
            metar=metar,
            wind_dir=wind_dir,
            wind_speed=wind_speed,
            wind_speed_mph=wind_speed_mph,
            visibility=visibility,
            temperature=temperature,
            dewpoint=dewpoint,
            altimeter=altimeter
        )

    except Exception as e:
        print(f"[Parse Error] ICAO={metar_row.icao_code}, Error={str(e)}")
        return None

## Sample of Dataframe

In [131]:
# Apply with safety
parsed_weather_rdd = weather_df.rdd.map(safe_parse_metar).filter(lambda row: row is not None)

# Create DataFrame
parsed_weather_df = spark.createDataFrame(parsed_weather_rdd)

# Drop METAR column
parsed_weather_df = parsed_weather_df.drop("metar")

# Drop duplicates
parsed_weather_df = parsed_weather_df.dropDuplicates(["icao_code", "timestamp"])

#Drop null values
parsed_weather_df = parsed_weather_df.dropna()

# Test it (should NOT crash now)
parsed_weather_df.show(truncate=False)

+---------+-------------------+--------+----------+--------------+----------+-----------+--------+---------+
|icao_code|timestamp          |wind_dir|wind_speed|wind_speed_mph|visibility|temperature|dewpoint|altimeter|
+---------+-------------------+--------+----------+--------------+----------+-----------+--------+---------+
|PAHO     |2025-03-22 23:53:00|0       |3         |3.45          |10        |6          |1       |29.48    |
|KVNW     |2025-03-22 23:55:00|340     |8         |9.21          |10        |4          |-8      |30.07    |
|KIOW     |2025-03-22 23:52:00|140     |6         |6.9           |10        |10         |-7      |29.97    |
|KJNX     |2025-03-22 23:50:00|230     |5         |5.75          |10        |19         |0       |29.89    |
|KSIF     |2025-03-22 23:45:00|270     |5         |5.75          |10        |18         |-6      |29.91    |
|KNZY     |2025-03-22 23:52:00|200     |7         |8.06          |8         |16         |14      |29.95    |
|KORF     |2025-03-

# Joining Weather Data With Airport Data

**Overview**

This is ultimately the process that will be utilized within databricks to automatically update and create the processed flight data. (processed_airports.csv)

Renaming gps_code to icao_code for airports data

In [132]:
airports_df = df.withColumnRenamed("gps_code", "icao_code_1")
# Combine the two icao_code columns into one
airports_df = airports_df.withColumn("icao_code_combined", coalesce(airports_df["icao_code"], airports_df["icao_code_1"]))
airports_df = airports_df.drop("icao_code", "icao_code_1")
airports_df = airports_df.withColumnRenamed("icao_code_combined", "icao_code")

In [133]:
airports_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- ident: string (nullable = true)
 |-- type: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude_deg: double (nullable = true)
 |-- longitude_deg: double (nullable = true)
 |-- elevation_ft: integer (nullable = true)
 |-- continent: string (nullable = true)
 |-- iso_country: string (nullable = true)
 |-- iso_region: string (nullable = true)
 |-- municipality: string (nullable = true)
 |-- scheduled_service: string (nullable = true)
 |-- iata_code: string (nullable = true)
 |-- local_code: string (nullable = true)
 |-- home_link: string (nullable = true)
 |-- wikipedia_link: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- icao_code: string (nullable = true)



Separating airports with icao_code, from airports without icao_code.

In [134]:
airports_with_icao_df = airports_df.filter(airports_df['icao_code'].isNotNull())

In [135]:
airports_with_weather_df = airports_with_icao_df.join(parsed_weather_df, on="icao_code", how="inner")

In [136]:
#drop airport duplicates
airports_with_weather_df = airports_with_weather_df.dropDuplicates(["icao_code"])

## Sample of Dataframe

In [137]:
airports_with_weather_df.show(truncate=False)

+---------+----+-----+--------------+-------------------------------------+------------------+-------------------+------------+---------+-----------+----------+------------------+-----------------+---------+----------+--------------------------+-------------------------------------------------------------------+----------------------+-------------------+--------+----------+--------------+----------+-----------+--------+---------+
|icao_code|id  |ident|type          |name                                 |latitude_deg      |longitude_deg      |elevation_ft|continent|iso_country|iso_region|municipality      |scheduled_service|iata_code|local_code|home_link                 |wikipedia_link                                                     |keywords              |timestamp          |wind_dir|wind_speed|wind_speed_mph|visibility|temperature|dewpoint|altimeter|
+---------+----+-----+--------------+-------------------------------------+------------------+-------------------+------------+-----

# Dataframe Analysis

Comparing Data Completeness

In [138]:
print("Data Completeness")
print(f"Total airports: {airports_df.count()}")
print(f"Total airports with icao: {airports_with_icao_df.count()}")
print(f"Total weather entries: {parsed_weather_df.count()}")
print(f"Total airports with weather: {airports_with_weather_df.count()}")

Data Completeness
Total airports: 82775
Total airports with icao: 43166
Total weather entries: 4391
Total airports with weather: 1947


Notes: Regarding the above information here are some important factors in play.

__ICAO Coverage__: 52% of airports in the dataset have ICAO codes, enabling global weather integration using aviation-standard codes.
- Many airports don’t report weather hourly
- METAR feed is ~last hour only
- Not all ICAO codes are active stations



# Separating and saving Data

## Saving Airports with ICAO for weather updates

In [143]:
import pandas as pd

In [144]:
# Saving Airports with icao codes for future weather updates to csv
airports_with_icao_df.toPandas().to_csv("../files/airports_with_icao.csv", index=False)