## Enrich Flight Delay Data with Weather
This only needs to be run once.
Currently the weather features I want to enrich our dataset with are:
- Origin_Windspeed: wind (mph) at time of departure (from departing Airport)
- Origin_Precip: precipitation (inches) at time of departure (from departing Airport)
- Dest_Windspeed: wind (mph) at time of arrival (from arriving Airport)
- Dest_Precip: precipitation (inches) at time of arrival (from arriving Airport)

This code is currently unoptimized, but for the POC I extracted 1k rows.

In [24]:
import os
from datetime import datetime, timedelta, timezone

import pandas as pd
import pytz
import requests

# Name and directory of the flight CSV. The load cell reads filepath + filename,
# and save_output() writes back to the same "../data/flight_data_weather.csv" path,
# so the file is both the input and the progressively enriched output.
filename = "flight_data_weather.csv"
filepath = "../data/"

In [25]:
# Load the flight dataset into the module-level frame that the cells below
# read from and update in place.
flight_df = pd.read_csv(filepath + filename)

In [26]:
# Summary statistics; the weather columns' `count` row shows how many flights
# have been enriched so far.
flight_df.describe()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,Flight_Number_Reporting_Airline,DepTime,DepDelay,TaxiOut,WheelsOff,...,AirTime,Flights,Distance,Full-time,Part-time,Grand Total,Origin_Windspeed,Origin_Precip,Dest_Windspeed,Dest_Precip
count,346694.0,346694.0,346694.0,346694.0,346694.0,346694.0,346694.0,346694.0,346694.0,346694.0,...,346694.0,346694.0,346694.0,345448.0,345448.0,345448.0,8743.0,8760.0,8743.0,8760.0
mean,2020.999224,2.500935,6.502293,15.733405,4.036551,2590.339063,1385.94438,28.717512,17.974845,1409.635736,...,112.788655,1.0,802.699842,41374.780517,3931.688518,45306.469034,8.858515,0.003453,8.775134,0.002687
std,0.817125,1.117914,3.45174,8.740893,2.009212,1773.885633,492.125047,74.486968,11.906633,494.570132,...,67.26825,0.0,558.080057,32795.894987,4395.508628,36010.24228,5.931952,0.051514,5.928036,0.048114
min,2020.0,1.0,1.0,1.0,1.0,1.0,1.0,-62.0,1.0,1.0,...,11.0,1.0,29.0,2357.0,0.0,2374.0,-2.0,-2.0,-2.0,-2.0
25%,2020.0,2.0,4.0,8.0,2.0,1087.0,1003.0,-4.0,11.0,1020.0,...,63.0,1.0,394.0,11030.0,740.0,13191.0,5.0,0.0,5.0,0.0
50%,2021.0,3.0,7.0,16.0,4.0,2221.0,1411.0,2.0,14.0,1425.0,...,98.0,1.0,674.0,53269.0,1818.0,54748.0,8.0,0.0,8.0,0.0
75%,2022.0,4.0,10.0,23.0,6.0,3971.0,1805.0,37.0,20.0,1822.0,...,143.0,1.0,1036.0,65881.0,5655.0,72639.0,13.0,0.0,13.0,0.0
max,2022.0,4.0,12.0,31.0,7.0,8819.0,2400.0,2175.0,218.0,2400.0,...,677.0,1.0,5095.0,97373.0,16424.0,109108.0,43.0,0.83,38.0,0.89


In [27]:
# new_columns = ["Origin_Windspeed", "Origin_Precip", "Dest_Windspeed", "Dest_Precip"]
# if set(new_columns).issubset(flight_df.columns):
#     print("Weather data already added")
# else:
#     # New Columns
#     flight_df["Origin_Windspeed"] = None
#     flight_df["Origin_Precip"] = None
#     flight_df["Dest_Windspeed"] = None
#     flight_df["Dest_Precip"] = None

In [28]:

airport_url = "https://api.weather.com/v3/location/search"

def retrieve_timezone(airport_code, timeout=30):
    """
    Retrieves the timezone and ICAO code for a given airport code.

    The location-search endpoint is queried for airports matching ``airport_code``;
    the first result whose type is 'airport' and whose ICAO code contains
    ``airport_code`` (e.g. 'ATL' in 'KATL') is returned.

    :param airport_code: The IATA or ICAO airport code.
    :param timeout: Seconds to wait for the HTTP request before giving up
        (without it a stalled connection would block forever).
    :return: A tuple containing the IANA timezone and ICAO code of the airport,
        or (None, None) if not found or if the request failed.
    """
    # NOTE(review): the API key is hardcoded; consider loading it from an
    # environment variable before sharing this notebook.
    params = {
        "apiKey": "e1f10a1e78da46f5b10a1e78da96f525",
        "language": "en-US",
        "locationType": "airport",
        "format": "json",
        "countryCode": "US",
        "query": airport_code
    }
    try:
        response = requests.get(airport_url, params=params, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for bad responses
        response_data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print(f"Request error: {e}")
        return None, None

    if 'errors' in response_data:
        print("Error: ", response_data['errors'][0]['error']['message'])
        return None, None

    # The response stores results as parallel lists; tolerate a missing
    # 'location' section instead of raising KeyError.
    location = response_data.get('location') or {}
    candidates = zip(
        location.get('type') or [],
        location.get('icaoCode') or [],
        location.get('ianaTimeZone') or [],
    )
    for loc_type, icao_code, local_timezone in candidates:
        if loc_type == 'airport' and icao_code is not None and airport_code in icao_code:
            return local_timezone, icao_code

    print("No Valid Airport Found")
    return None, None

In [29]:
def convert_to_gmt(flight_date, local_time, local_timezone):
    """
    Translate a local flight date and clock time into a GMT Unix timestamp.

    :param flight_date: Date of the flight in 'YYYY-MM-DD' format.
    :param local_time: Local clock time as an HHMM number (e.g. 5, 930, 2400).
    :param local_timezone: Name of the timezone the local time is expressed in.
    :return: GMT Unix timestamp (whole seconds) of the flight.
    """
    # Left-pad to four digits so that 5 -> "0005" and 930 -> "0930".
    hhmm = str(int(local_time)).rjust(4, "0")

    if hhmm[:2] == "24":
        # An hour of "24" means midnight at the start of the following day.
        naive_local = datetime.strptime(flight_date, "%Y-%m-%d") + timedelta(days=1)
    else:
        naive_local = datetime.strptime(
            f"{flight_date} {hhmm[0:2]}:{hhmm[2:4]}", "%Y-%m-%d %H:%M"
        )

    # Attach the local zone; an aware datetime's timestamp() is already
    # expressed in seconds since the UTC epoch.
    zone = pytz.timezone(local_timezone)
    aware_local = zone.localize(naive_local)
    return int(aware_local.timestamp())

In [42]:
from requests.exceptions import ReadTimeout


In [43]:
def retrieve_weather_info(airport_code, desired_timestamp, max_retries=3, timeout=30):
    """
    Look up wind speed and hourly precipitation at an airport near a given time.

    The historical-observations endpoint is queried for the UTC calendar day
    containing ``desired_timestamp``, and the observation whose
    ``valid_time_gmt`` is closest to that timestamp is used.

    :param airport_code: Airport code inserted into the API location path
        (callers in this notebook pass the ICAO code).
    :param desired_timestamp: Unix timestamp (seconds, GMT) to match.
    :param max_retries: How many times to attempt the request when it times out.
    :param timeout: Per-attempt timeout in seconds.
    :return: Tuple ``(wspd, precip_hrly)`` from the closest observation (either
        may be None if the observation lacks that field), or a sentinel pair:
        ``(-1, -1)`` no historical data for the airport/date,
        ``(-2, -2)`` any other API error,
        ``(-3, -3)`` every attempt timed out.
    """
    api_url = f"https://api.weather.com/v1/location/{airport_code}:9:US/observations/historical.json"

    # Format the timestamp's UTC date as "YYYYMMDD" for the query.
    search_date = datetime.fromtimestamp(desired_timestamp, tz=timezone.utc).strftime("%Y%m%d")
    # NOTE(review): the API key is hardcoded; consider loading it from an
    # environment variable before sharing this notebook.
    params = {
        "apiKey": "e1f10a1e78da46f5b10a1e78da96f525",
        "units": "e",
        "startDate": search_date
    }

    response = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(api_url, params=params, timeout=timeout)
            break  # Request completed; stop retrying
        except requests.exceptions.Timeout:
            # Previously the first timeout returned immediately, so the loop
            # never actually retried. Only give up after the last attempt.
            if attempt < max_retries:
                print("Request timed out. Retrying...")
            else:
                print("Request timed out. Giving up.")
    if response is None:
        return -3, -3

    response_data = response.json()
    if response_data['metadata']['status_code'] != 200:
        print("error response", response_data)
        # Catch case where there is no historical data for the airport
        if 'errors' in response_data and response_data['errors'][0]['error']['code'] == "NDF-0001":
            print("No historical data for airport")
            return -1, -1
        else:
            return -2, -2

    # min() raises on an empty sequence, so treat "no observations" like the
    # API's own no-data answer.
    observations = response_data.get("observations") or []
    if not observations:
        print("No historical data for airport")
        return -1, -1

    # Find the observation closest to the desired time
    closest_observation = min(
        observations,
        key=lambda obs: abs(obs["valid_time_gmt"] - desired_timestamp),
    )

    # Extract wspd and precip_hrly from the closest observation
    return closest_observation.get("wspd"), closest_observation.get("precip_hrly")

In [31]:
def extract_weather_info(row_index):
    """
    Extracts and updates weather information for a specific flight.

    Reads one row of the module-level ``flight_df``, converts the local
    departure and arrival times to GMT using the row's pre-computed origin and
    destination timezones, fetches windspeed and precipitation for both
    airports via ``retrieve_weather_info``, and writes the four values back
    into ``flight_df`` in place.

    ``FlightDate`` is used as the calendar date for both times. When that puts
    the arrival before the departure (an overnight flight), the arrival is
    moved to the following calendar day so the destination weather is looked
    up for the right date.

    :param row_index: Zero-based position of the row in ``flight_df``.
    :return: None; ``flight_df`` is modified in place.
    """
    row = flight_df.iloc[row_index]
    # Write through the label at the same position so reads (iloc) and
    # writes (.at) always hit the same row, whatever the index is.
    row_label = flight_df.index[row_index]
    flight_date, dep_time, arr_time = row["FlightDate"], row["DepTime"], row["ArrTime"]

    # Airport timezone and ICAO code
    origin_timezone, origin_code = row["origin_ianaTimeZone"], row["origin_icaoCode"]
    dest_timezone, dest_code = row["dest_ianaTimeZone"], row["dest_icaoCode"]

    dep_time_gmt = convert_to_gmt(flight_date, dep_time, origin_timezone)
    origin_windspeed, origin_precip = retrieve_weather_info(origin_code, dep_time_gmt)

    arr_time_gmt = convert_to_gmt(flight_date, arr_time, dest_timezone)
    if arr_time_gmt < dep_time_gmt:
        # Arrival cannot precede departure: the flight landed on the next
        # calendar day. Recompute from the date (rather than adding 24h of
        # seconds) so a DST change at the destination is still handled.
        next_day = (
            datetime.strptime(flight_date, "%Y-%m-%d") + timedelta(days=1)
        ).strftime("%Y-%m-%d")
        arr_time_gmt = convert_to_gmt(next_day, arr_time, dest_timezone)
    dest_windspeed, dest_precip = retrieve_weather_info(dest_code, arr_time_gmt)

    flight_df.at[row_label, 'Origin_Windspeed'] = origin_windspeed
    flight_df.at[row_label, 'Origin_Precip'] = origin_precip
    flight_df.at[row_label, 'Dest_Windspeed'] = dest_windspeed
    flight_df.at[row_label, 'Dest_Precip'] = dest_precip

In [32]:
# # Instead of querying the API for each row, Just query the unique timezone for each airport and add it to the dataframe
# flight_df["dest_icaoCode"] = None
# flight_df["dest_ianaTimeZone"] = None

# flight_df["origin_icaoCode"] = None
# flight_df["origin_ianaTimeZone"] = None

# unique_dest = set(flight_df['Dest'].unique())
# unique_origin = set(flight_df['Origin'].unique())

# unique_airports = unique_dest.union(unique_origin)

In [33]:
# # Lookup timezone and ICAO code for each airport 
# for airport_code in unique_airports:
#     ianaTimeZone, icaoCode = retrieve_timezone(airport_code)
#     # Set icaoCode where flight_df matches airport_code
#     flight_df.loc[flight_df['Dest'] == airport_code, 'dest_icaoCode'] = icaoCode
#     flight_df.loc[flight_df['Origin'] == airport_code, 'origin_icaoCode'] = icaoCode
#     # Set icaoTimeZone where flight_df matches airport_code
#     flight_df.loc[flight_df['Dest'] == airport_code, 'dest_ianaTimeZone'] = ianaTimeZone
#     flight_df.loc[flight_df['Origin'] == airport_code, 'origin_ianaTimeZone'] = ianaTimeZone

In [41]:
import time
from requests.exceptions import ReadTimeout

In [39]:
def save_output(df):
    """
    Persist the enriched flight data to ../data/flight_data_weather.csv.

    The destination is the same file the notebook loads its data from, so an
    interrupted write would corrupt the only copy. The frame is therefore
    written to a sibling ".tmp" file first and swapped into place with
    ``os.replace`` only after the write has finished.

    :param df: DataFrame to write (saved without the index column).
    :return: None.
    """
    output_file = "flight_data_weather.csv"
    dest_filepath = "../data/"
    final_path = dest_filepath + output_file
    temp_path = final_path + ".tmp"

    print(f"writing to {output_file}")
    try:
        df.to_csv(temp_path, index=False)
        os.replace(temp_path, final_path)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial temp file, keep the
        # previous CSV untouched, and let the caller see the interruption.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    print("saving at time: ", datetime.now().strftime("%H:%M:%S"))

In [44]:
# Backfill weather in batches, resuming from the first row that has no weather
# data yet (Dest_Precip is the last column extract_weather_info writes).
#
# Row numbers here are positions. flight_df comes straight from pd.read_csv,
# so positions and index labels coincide.
BATCH_SIZE = 1000      # rows enriched between saves
PAUSE_SECONDS = 10     # pause between batches

total_rows = len(flight_df)
cursor = 0  # every row before this position has already been handled
while cursor < total_rows:
    still_missing = flight_df['Dest_Precip'].iloc[cursor:].isna().to_numpy()
    if not still_missing.any():
        # Nothing left to fill. (idxmax() on an all-False mask returns the
        # first row, which used to restart the loop at row 0 forever.)
        break

    batch_start = cursor + int(still_missing.argmax())
    # Clamp to the end of the frame so the final, shorter batch does not
    # index past the last row.
    batch_end = min(batch_start + BATCH_SIZE, total_rows)

    print(f"updating row {batch_start}")
    for row_index in range(batch_start, batch_end):
        extract_weather_info(row_index)
    save_output(flight_df)

    # Always move forward, even if a lookup legitimately produced a missing
    # value, so a single row can never pin the loop in place.
    cursor = batch_end
    if cursor < total_rows:
        time.sleep(PAUSE_SECONDS)



updating row 12835
error response {'metadata': {'transaction_id': '1700311840389:1375199f7795709160904f645ee7e2b8', 'status_code': 400}, 'success': False, 'errors': [{'error': {'code': 'NDF-0001', 'message': 'There was no data found for your historical observations query.'}}]}
No historical data for airport
error response {'metadata': {'transaction_id': '1700311920647:152d25b4f5c26f53d8c76c34f4cc94f0', 'status_code': 400}, 'success': False, 'errors': [{'error': {'code': 'NDF-0001', 'message': 'There was no data found for your historical observations query.'}}]}
No historical data for airport
writing to flight_data_weather.csv
saving at time:  04:53:57
updating row 13835
writing to flight_data_weather.csv
saving at time:  04:57:55
updating row 14835
error response {'metadata': {'transaction_id': '1700312493264:c3a63af1abf43f8c9be2933259648a60', 'status_code': 400}, 'success': False, 'errors': [{'error': {'code': 'ILA-0001', 'message': 'The location supplied is invalid.'}}]}
writing to f

KeyboardInterrupt: 

In [53]:
flight_df.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,Origin,...,Part-time,Grand Total,Origin_Windspeed,Origin_Precip,Dest_Windspeed,Dest_Precip,dest_ianaTimeZone,dest_icaoCode,origin_ianaTimeZone,origin_icaoCode
0,2020.0,1.0,1.0,23.0,4.0,2020-01-23,OO,N868CA,3745.0,MOB,...,1910.0,14646.0,0,0,15,0.15,America/New_York,KATL,America/Chicago,KMOB
1,2020.0,1.0,1.0,4.0,6.0,2020-01-04,OO,N804SK,3905.0,DTW,...,1910.0,14646.0,8,0,6,0.05,America/New_York,KORF,America/Detroit,KDTW
2,2020.0,1.0,1.0,7.0,2.0,2020-01-07,OH,N218PS,5242.0,DCA,...,8.0,4766.0,0,0,5,0.02,America/New_York,KGSO,America/New_York,KDCA
3,2020.0,1.0,1.0,18.0,6.0,2020-01-18,UA,N57864,502.0,DEN,...,11892.0,92037.0,8,0,23,0.0,America/Chicago,KORD,America/Denver,KDEN
4,2020.0,1.0,1.0,10.0,5.0,2020-01-10,OO,N268SY,3666.0,RNO,...,1910.0,14646.0,16,0,17,0.0,America/Denver,KSLC,America/Los_Angeles,KRNO
