## Enrich Flight Delay Data with Weather
This only needs to be run once.
Currently the weather features I want to enrich our dataset with are:
- Origin_Windspeed: wind (mph) at time of departure (from departing Airport)
- Origin_Precip: precipitation (inches) at time of departure (from departing Airport)
- Dest_Windspeed: wind (mph) at time of arrival (from arriving Airport)
- Dest_Precip: precipitation (inches) at time of arrival (from arriving Airport)

This is code is currently unoptimized. But for POC, I extracted 1k rows.

In [40]:
import pandas as pd
import pytz
from datetime import datetime
import requests

filename = "flight_data.csv"
filepath = "../data/"

In [3]:
flight_df = pd.read_csv(filepath + filename)

In [147]:
new_columns = ["Origin_Windspeed", "Origin_Precip", "Dest_Windspeed", "Dest_Precip"]
if set(new_columns).issubset(flight_df.columns):
    print("Weather data already added")
else:
    # New Columns
    flight_df["Origin_Windspeed"] = None
    flight_df["Origin_Precip"] = None
    flight_df["Dest_Windspeed"] = None
    flight_df["Dest_Precip"] = None

Weather data already added


In [159]:

airport_url = "https://api.weather.com/v3/location/search"

def retrieve_timezone(airport_code):
    """
    Retrieves the timezone and ICAO code for a given airport code.

    :param airport_code: The IATA or ICAO airport code.
    :return: A tuple containing the IANA timezone and ICAO code of the airport, or (None, None) if not found.
    """
    params = {
        "apiKey": "e1f10a1e78da46f5b10a1e78da96f525",
        "language": "en-US",
        "locationType": "airport",
        "format": "json",
        "countryCode": "US",
        "query": airport_code
    }
    try:
        response = requests.get(airport_url, params=params)
        response.raise_for_status()  # Raises HTTPError for bad responses
        response_data = response.json()

        if 'errors' in response_data:
            print("Error: ", response_data['errors'][0]['error']['message'])
            return None, None

        for i, loc_type in enumerate(response_data['location']['type']):
            if (loc_type == 'airport' and 
                response_data['location']['icaoCode'][i] is not None and
                airport_code in response_data['location']['icaoCode'][i]):
                local_timezone = response_data['location']['ianaTimeZone'][i]
                icao_code = response_data['location']['icaoCode'][i]
                return local_timezone, icao_code
        print("No Valid Airport Found")
        return None, None
    except requests.RequestException as e:
        print(f"Request error: {e}")
        return None, None

In [150]:
def convert_to_gmt(flight_date, local_time, local_timezone):
    """
    Convert local time to GMT.

    :param flight_date: Date of the flight in 'YYYY-MM-DD' format.
    :param local_time: Time of the flight in 'HHMM' format.
    :param local_timezone: Timezone of the local time.
    :return: GMT Unix timestamp of the flight.
    """
    # Format local time into 'HH:MM' format
    str_local_time = str(int(local_time))
    if(len(str_local_time) < 4):
        str_local_time = "0" * (4 - len(str_local_time)) + str_local_time

    formatted_local_time = str_local_time[0:2] + ":" + str_local_time[2:4]

    # Combine flight date and time
    combined_datetime_str = f"{flight_date} {formatted_local_time}"

    # Convert to local datetime object
    local_datetime = datetime.strptime(combined_datetime_str, "%Y-%m-%d %H:%M")

    # Assign local timezone
    local_timezone = pytz.timezone(local_timezone)
    local_datetime_with_tz = local_timezone.localize(local_datetime)

    # Convert to GMT
    gmt_datetime = local_datetime_with_tz.astimezone(pytz.utc)

    return int(gmt_datetime.timestamp())

In [151]:
def retrieve_weather_info(airport_code, desired_timestamp):
    api_url = f"https://api.weather.com/v1/location/{airport_code}:9:US/observations/historical.json"

    # Convert Unix timestamp to datetime
    dt = datetime.utcfromtimestamp(desired_timestamp)
    # Format datetime to "YYYYMMDD"
    search_date = dt.strftime("%Y%m%d")
    params = {
        "apiKey": "e1f10a1e78da46f5b10a1e78da96f525",
        "units": "e",
        "startDate": search_date
    }
    response = requests.get(api_url, params=params)
    response_data = response.json()
    if response_data['metadata']['status_code'] != 200:
        print("error response", response_data)
        # Catch case where there is no historical data for the airport
        if 'errors' in response_data and response_data['errors'][0]['error']['code'] == "NDF-0001":
            print("No historical data for airport")
            return -1, -1 
    # Find the observation closest to the desired time
    closest_observation = min(response_data["observations"], key=lambda obs: abs(obs["valid_time_gmt"] - desired_timestamp))

    # Extract wspd and precip_hrly from the closest observation
    wspd_closest = closest_observation.get("wspd")
    precip_hrly_closest = closest_observation.get("precip_hrly")
    return wspd_closest, precip_hrly_closest

In [130]:
def extract_weather_info(row_index):
    """
    Extracts and updates weather information for a specific flight.

    For a given row in the flight DataFrame, this function retrieves the departure 
    and arrival timezone for the flight's origin and destination airports. It then 
    converts the departure and arrival times to GMT and retrieves the corresponding 
    weather information (windspeed and precipitation). This information is updated 
    directly in the provided DataFrame.

    :param row_index: Index of the row in the flight DataFrame.
    :param flight_df: DataFrame containing flight information.
    """
    row = flight_df.iloc[row_index]
    flight_date, origin, dep_time, dest, arr_time = row["FlightDate"], row["Origin"], row["DepTime"], row["Dest"], row["ArrTime"]


    departure_timezone, origin_code = retrieve_timezone(origin)
    if departure_timezone is None:
        origin_windspeed = -2
        origin_precip = -2
    else:    
        dep_time_gmt = convert_to_gmt(flight_date, dep_time, departure_timezone)
        origin_windspeed, origin_precip = retrieve_weather_info(origin_code, dep_time_gmt)
    
    arrival_timezone, dest_code = retrieve_timezone(dest)
    if arrival_timezone is None:
        dest_windspeed = -2
        dest_precip = -2
    else:    
        arr_time_gmt = convert_to_gmt(flight_date, arr_time, arrival_timezone)
        dest_windspeed, dest_precip = retrieve_weather_info(dest_code, arr_time_gmt)

    flight_df.at[row_index, 'Origin_Windspeed'] = origin_windspeed
    flight_df.at[row_index, 'Origin_Precip'] = origin_precip
    flight_df.at[row_index, 'Dest_Windspeed'] = dest_windspeed
    flight_df.at[row_index, 'Dest_Precip'] = dest_precip

In [160]:
# We find the id of the first row that has no weather data, and start from there
first_none_index = flight_df['Dest_Precip'].isna().idxmax()
for row_index in range(first_none_index, first_none_index + 1000):
    print(f"updating row {row_index}")
    extract_weather_info(row_index)


updating row 353
2020-01-01 AZA 1340.0 OAK 1450.0
No Valid Airport Found
updating row 354
2020-01-12 MSN 1429.0 DTW 1649.0
updating row 355
2020-01-18 GSO 1625.0 CLT 1753.0
updating row 356
2020-01-06 PHL 838.0 DTW 1041.0
updating row 357
2020-01-21 AUS 1631.0 DFW 1755.0
updating row 358
2020-01-29 IND 631.0 CLT 838.0
updating row 359
2020-01-06 CAE 554.0 DCA 740.0
updating row 360
2020-01-18 DTW 1828.0 ITH 1940.0
updating row 361
2020-01-13 JFK 806.0 CLT 1039.0
updating row 362
2020-01-05 ATL 2105.0 MIA 2301.0
error response {'metadata': {'transaction_id': '1700006242754:1ce60f28321606eed5842d934d21125f', 'status_code': 400}, 'success': False, 'errors': [{'error': {'code': 'NDF-0001', 'message': 'There was no data found for your historical observations query.'}}]}
No historical data for airport
updating row 363
2020-01-30 SMF 1949.0 SLC 2230.0
updating row 364
2020-01-04 EWR 2256.0 MSP 47.0
updating row 365
2020-01-27 PIT 1414.0 MDW 1444.0
updating row 366
2020-01-11 MCI 1148.0 MSP 13

ValueError: time data '2020-01-13 24:00' does not match format '%Y-%m-%d %H:%M'

In [161]:
flight_df.head(10)

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,Dest,...,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,Origin_Windspeed,Origin_Precip,Dest_Windspeed,Dest_Precip
0,2020,1,1,23,4,2020-01-23,OO,N868CA,MOB,ATL,...,302.0,30.0,0.0,0.0,0.0,0.0,0,0.0,12,0.12
1,2020,1,1,4,6,2020-01-04,OO,N804SK,DTW,ORF,...,529.0,22.0,0.0,0.0,0.0,0.0,8,0.0,6,0.05
2,2020,1,1,7,2,2020-01-07,OH,N218PS,DCA,GSO,...,248.0,9.0,0.0,6.0,0.0,0.0,0,0.0,5,0.02
3,2020,1,1,18,6,2020-01-18,UA,N57864,DEN,ORD,...,888.0,0.0,0.0,1.0,0.0,16.0,8,0.0,23,0.0
4,2020,1,1,10,5,2020-01-10,OO,N268SY,RNO,SLC,...,422.0,0.0,26.0,0.0,0.0,0.0,16,0.0,17,0.0
5,2020,1,1,13,1,2020-01-13,MQ,N281NN,BTR,DFW,...,383.0,0.0,0.0,12.0,0.0,67.0,6,0.0,13,0.0
6,2020,1,1,10,5,2020-01-10,WN,N482WN,ELP,SAT,...,496.0,0.0,0.0,35.0,0.0,0.0,5,0.0,10,0.0
7,2020,1,1,16,4,2020-01-16,AA,N815AA,DFW,HNL,...,3784.0,49.0,0.0,0.0,0.0,0.0,14,0.22,3,0.0
8,2020,1,1,2,4,2020-01-02,AA,N170US,PHL,LAX,...,2402.0,9.0,0.0,4.0,0.0,50.0,17,0.0,5,0.0
9,2020,1,1,24,5,2020-01-24,OO,N877AS,SGF,IAH,...,513.0,0.0,0.0,40.0,0.0,0.0,13,0.0,0,0.0


In [162]:
output_file = "flight_data_weather.csv"
dest_filepath = "../data/"
print(f"writing to {output_file}")
flight_df.to_csv(dest_filepath + output_file, index=False)

writing to flight_data_weather.csv
