## Enrich Flight Delay Data with Weather
This only needs to be run once.
Currently the weather features I want to enrich our dataset with are:
- Origin_Windspeed: wind (mph) at time of departure (from departing Airport)
- Origin_Precip: precipitation (inches) at time of departure (from departing Airport)
- Dest_Windspeed: wind (mph) at time of arrival (from arriving Airport)
- Dest_Precip: precipitation (inches) at time of arrival (from arriving Airport)

In addition, I added the time zone of the departing and arriving airports, which could be a useful feature for future analysis.
- dest_ianaTimeZone: IANA time zone name for the destination airport
- origin_ianaTimeZone: IANA time zone name for the origin airport

Starting from roughly 360k rows in flight_data.csv, I ended up with 343120 rows of data after merging with the weather data.
Some rows were lost due to missing weather data and some due to missing flight data. A few airports were either outside the United States or not available in the Weather API.

In [None]:
import pandas as pd
import pytz
from datetime import datetime, timedelta
from requests.exceptions import ReadTimeout
import requests
import time

# Name and directory of the CSV this notebook loads into `flight_df`.
# save_output() writes to this same directory and file name, so the file
# doubles as the resume checkpoint for the enrichment loop.
filename = "flight_data_weather.csv"
filepath = "../data/"

In [None]:
# Load the flight table; the cells below read from and write weather columns
# into this module-level DataFrame.
flight_df = pd.read_csv(filepath + filename)

In [None]:
# Quick sanity check: summary statistics of the loaded table.
flight_df.describe()

In [None]:
# new_columns = ["Origin_Windspeed", "Origin_Precip", "Dest_Windspeed", "Dest_Precip"]
# if set(new_columns).issubset(flight_df.columns):
#     print("Weather data already added")
# else:
#     # New Columns
#     flight_df["Origin_Windspeed"] = None
#     flight_df["Origin_Precip"] = None
#     flight_df["Dest_Windspeed"] = None
#     flight_df["Dest_Precip"] = None

In [None]:

airport_url = "https://api.weather.com/v3/location/search"


def retrieve_timezone(airport_code):
    """
    Retrieves the timezone and ICAO code for a given airport code.

    Queries the location-search endpoint (restricted to US airports) and returns
    the first result of type 'airport' whose ICAO code contains ``airport_code``
    as a substring, so an IATA code such as "LAX" matches the ICAO code "KLAX".

    Failures are reported with ``print`` and signalled by ``(None, None)``; the
    function does not raise for network problems or malformed responses.

    :param airport_code: The IATA or ICAO airport code.
    :return: A tuple containing the IANA timezone and ICAO code of the airport, or (None, None) if not found.
    """
    params = {
        # NOTE(review): the API key is hard-coded here (and again in
        # retrieve_weather_info); consider loading it from configuration.
        "apiKey": "e1f10a1e78da46f5b10a1e78da96f525",
        "language": "en-US",
        "locationType": "airport",
        "format": "json",
        "countryCode": "US",
        "query": airport_code
    }
    try:
        # Bounded wait so a stalled connection cannot hang the notebook forever;
        # requests' timeout errors are RequestException subclasses.
        response = requests.get(airport_url, params=params, timeout=30)
        response.raise_for_status()  # Raises HTTPError for bad responses
        response_data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON on older requests versions
        print(f"Request error: {e}")
        return None, None

    if 'errors' in response_data:
        print("Error: ", response_data['errors'][0]['error']['message'])
        return None, None

    # The payload stores results as parallel lists keyed by field name;
    # tolerate a response that lacks them instead of raising KeyError.
    location = response_data.get('location') or {}
    loc_types = location.get('type') or []
    icao_codes = location.get('icaoCode') or []
    time_zones = location.get('ianaTimeZone') or []

    for loc_type, icao_code, local_timezone in zip(loc_types, icao_codes, time_zones):
        if loc_type == 'airport' and icao_code is not None and airport_code in icao_code:
            return local_timezone, icao_code

    print("No Valid Airport Found")
    return None, None

In [None]:
def convert_to_gmt(flight_date, local_time, local_timezone):
    """
    Translate a local wall-clock flight time into a GMT Unix timestamp.

    A time whose zero-padded form starts with "24" is interpreted as 00:00 on
    the day after ``flight_date``.

    :param flight_date: Date of the flight in 'YYYY-MM-DD' format.
    :param local_time: Time of the flight in 'HHMM' format (numeric; leading zeros may be missing).
    :param local_timezone: Name of the timezone the local time is expressed in.
    :return: GMT Unix timestamp of the flight, as an int.
    """
    # Left-pad with zeros to a four-character 'HHMM' string
    hhmm = str(int(local_time)).rjust(4, "0")

    if hhmm.startswith("24"):
        # Roll over to midnight of the following calendar day
        rolled_date = datetime.strptime(flight_date, "%Y-%m-%d") + timedelta(days=1)
        flight_date = rolled_date.strftime("%Y-%m-%d")
        clock = "00:00"
    else:
        clock = f"{hhmm[0:2]}:{hhmm[2:4]}"

    # Parse the naive local date-time, then attach the airport's timezone
    naive_local = datetime.strptime(f"{flight_date} {clock}", "%Y-%m-%d %H:%M")
    aware_local = pytz.timezone(local_timezone).localize(naive_local)

    # Express the instant in UTC and return it as whole seconds since the epoch
    return int(aware_local.astimezone(pytz.utc).timestamp())

In [None]:
def retrieve_weather_info(airport_code, desired_timestamp):
    """
    Look up the wind speed and hourly precipitation closest to a given time.

    Requests the historical observations for ``airport_code`` on the UTC
    calendar date of ``desired_timestamp`` and returns the readings of the
    observation whose ``valid_time_gmt`` is nearest to that timestamp.

    :param airport_code: Airport code placed into the ``{code}:9:US`` location id of the request URL.
    :param desired_timestamp: Target time as a GMT Unix timestamp in seconds.
    :return: ``(wspd, precip_hrly)`` of the closest observation (either value is
        None when the observation lacks that field), or a negative sentinel pair:
        ``(-1, -1)`` no historical data (API error code NDF-0001 or no observations returned),
        ``(-2, -2)`` any other non-200 API status,
        ``(-3, -3)`` every request attempt timed out.
    """
    api_url = f"https://api.weather.com/v1/location/{airport_code}:9:US/observations/historical.json"

    # UTC calendar date of the target time, formatted as "YYYYMMDD"
    # (time.gmtime avoids the deprecated datetime.utcfromtimestamp)
    search_date = time.strftime("%Y%m%d", time.gmtime(desired_timestamp))
    params = {
        # NOTE(review): the API key is hard-coded here (and again in
        # retrieve_timezone); consider loading it from configuration.
        "apiKey": "e1f10a1e78da46f5b10a1e78da96f525",
        # presumably selects imperial units (the notebook expects mph / inches) — confirm
        "units": "e",
        "startDate": search_date
    }

    for _ in range(3):  # Number of attempts
        try:
            response = requests.get(api_url, params=params, timeout=30)
            break  # Got a response; stop retrying
        except ReadTimeout:
            print("Request timed out. Retrying...")
    else:
        # Only reached when no attempt succeeded (the loop never hit `break`)
        return -3, -3

    response_data = response.json()
    if response_data['metadata']['status_code'] != 200:
        print("error response", response_data)
        # Catch case where there is no historical data for the airport
        if 'errors' in response_data and response_data['errors'][0]['error']['code'] == "NDF-0001":
            print("No historical data for airport")
            return -1, -1
        else:
            return -2, -2

    observations = response_data.get("observations")
    if not observations:
        # min() would raise on an empty list; report it like missing history
        print("No observations returned for airport")
        return -1, -1

    # Find the observation closest to the desired time
    closest_observation = min(
        observations,
        key=lambda obs: abs(obs["valid_time_gmt"] - desired_timestamp),
    )

    # Extract wspd and precip_hrly from the closest observation
    return closest_observation.get("wspd"), closest_observation.get("precip_hrly")

In [None]:
def extract_weather_info(row_index):
    """
    Extracts and updates weather information for a specific flight.

    For the given row of the module-level ``flight_df``, this function reads the
    pre-computed timezone and ICAO code of the origin and destination airports,
    converts the departure and arrival times to GMT, and retrieves the
    corresponding weather information (windspeed and precipitation). The four
    weather columns of that row are updated in place in ``flight_df``.

    ``FlightDate`` is used as the departure date. When the arrival time,
    interpreted on that same date, would fall before the departure, the flight
    is taken to have landed on the following calendar day.

    :param row_index: Index label of the row in ``flight_df`` (equal to its
        position when the frame has the default integer index).
    """
    # Read by label so the lookup matches the label-based `.at` writes below
    row = flight_df.loc[row_index]
    flight_date, dep_time, arr_time = row["FlightDate"], row["DepTime"], row["ArrTime"]

    # Airport timezone and ICAO code
    origin_timezone, origin_code = row["origin_ianaTimeZone"], row["origin_icaoCode"]
    dest_timezone, dest_code = row["dest_ianaTimeZone"], row["dest_icaoCode"]

    dep_time_gmt = convert_to_gmt(flight_date, dep_time, origin_timezone)
    origin_windspeed, origin_precip = retrieve_weather_info(origin_code, dep_time_gmt)

    arr_time_gmt = convert_to_gmt(flight_date, arr_time, dest_timezone)
    if arr_time_gmt < dep_time_gmt:
        # An arrival cannot precede its departure: the flight crossed midnight,
        # so re-evaluate the arrival time on the next local calendar day.
        next_day = datetime.strptime(flight_date, "%Y-%m-%d") + timedelta(days=1)
        arr_time_gmt = convert_to_gmt(next_day.strftime("%Y-%m-%d"), arr_time, dest_timezone)
    dest_windspeed, dest_precip = retrieve_weather_info(dest_code, arr_time_gmt)

    flight_df.at[row_index, 'Origin_Windspeed'] = origin_windspeed
    flight_df.at[row_index, 'Origin_Precip'] = origin_precip
    flight_df.at[row_index, 'Dest_Windspeed'] = dest_windspeed
    flight_df.at[row_index, 'Dest_Precip'] = dest_precip

In [None]:
# # Instead of querying the API for each row, Just query the unique timezone for each airport and add it to the dataframe
# flight_df["dest_icaoCode"] = None
# flight_df["dest_ianaTimeZone"] = None

# flight_df["origin_icaoCode"] = None
# flight_df["origin_ianaTimeZone"] = None

# unique_dest = set(flight_df['Dest'].unique())
# unique_origin = set(flight_df['Origin'].unique())

# unique_airports = unique_dest.union(unique_origin)

In [None]:
# # Lookup timezone and ICAO code for each airport 
# for airport_code in unique_airports:
#     ianaTimeZone, icaoCode = retrieve_timezone(airport_code)
#     # Set icaoCode where flight_df matches airport_code
#     flight_df.loc[flight_df['Dest'] == airport_code, 'dest_icaoCode'] = icaoCode
#     flight_df.loc[flight_df['Origin'] == airport_code, 'origin_icaoCode'] = icaoCode
#     # Set icaoTimeZone where flight_df matches airport_code
#     flight_df.loc[flight_df['Dest'] == airport_code, 'dest_ianaTimeZone'] = ianaTimeZone
#     flight_df.loc[flight_df['Origin'] == airport_code, 'origin_ianaTimeZone'] = ianaTimeZone

In [None]:
def save_output(df, output_file="flight_data_weather.csv", dest_filepath="../data/"):
    """
    Write a DataFrame to CSV (without the index) and log when it was saved.

    :param df: DataFrame to persist.
    :param output_file: Name of the CSV file to write. Defaults to the
        notebook's checkpoint file.
    :param dest_filepath: Directory to write into, with or without a trailing
        separator.
    """
    import os  # local import: the notebook does not import os at the top

    # os.path.join accepts the directory with or without a trailing separator
    target_path = os.path.join(dest_filepath, output_file)
    print(f"writing to {output_file}")
    df.to_csv(target_path, index=False)
    print("saving at time: ", datetime.now().strftime("%H:%M:%S"))

In [None]:
# Resume from the first row that has no weather data yet and work through the
# rest of the table in batches of 1000 rows, saving a checkpoint after each batch.
# Assumes flight_df still has the default 0..n-1 index from read_csv, so index
# labels and row positions coincide.
missing_weather = flight_df['Dest_Precip'].isna()
if missing_weather.any():
    # idxmax() returns the first label even when nothing is missing, so it is
    # only consulted after confirming that at least one row needs work.
    first_none_index = missing_weather.idxmax()
    total_rows = len(flight_df)
    while first_none_index < total_rows:
        print(f"updating row {first_none_index}")
        # Clamp the batch so the final one does not run past the last row
        batch_end = min(first_none_index + 1000, total_rows)
        for row_index in range(first_none_index, batch_end):
            extract_weather_info(row_index)
        save_output(flight_df)
        time.sleep(1)
        # Advance by position rather than re-scanning for NaN: a row whose
        # lookup legitimately yields no precipitation value would otherwise
        # pin the loop on the same batch forever.
        first_none_index = batch_end



In [None]:
# Find rows with invalid windspeeds, this is usually due to invalid API queries:
#  e.g. no historical data for the airport. We lost about 2% of the rows to this.
# retrieve_weather_info reports a failed lookup with a negative sentinel pair,
# for the origin or the destination airport independently, so both windspeed
# columns are checked before a row is kept.
invalid_rows = flight_df[(flight_df.Origin_Windspeed < 0) | (flight_df.Dest_Windspeed < 0)]
print(len(invalid_rows))
# Keep only rows where both lookups produced a real (non-negative) windspeed;
# rows with a missing windspeed fail the comparison and are left out as well.
trimmed_df = flight_df[(flight_df.Origin_Windspeed >= 0) & (flight_df.Dest_Windspeed >= 0)].copy()
trimmed_df.describe()

In [None]:
# Column listing of the trimmed table
trimmed_df.columns
# Discard every row that still has a missing value in any column
trimmed_count_df = trimmed_df.dropna(axis=0, how="any")

In [None]:
# Order flights by date and then by departure time so that the running count
# below follows each aircraft's day in sequence.
trimmed_count_df = trimmed_count_df.sort_values(by=['FlightDate', 'DepTime'])

# 1-based running count of flights per (FlightDate, Tail_Number) group:
# the n-th flight of a given aircraft on a given day gets the value n.
trimmed_count_df['Aircraft_Daily_Flight_Count'] = (
    trimmed_count_df.groupby(['FlightDate', 'Tail_Number']).cumcount().add(1)
)

In [None]:
# Remove the ICAO lookup codes and the 'Flights' column from the final dataset
trimmed_count_df = trimmed_count_df.drop(
    columns=['dest_icaoCode', 'origin_icaoCode', 'Flights'],
)

In [None]:
# Persist the cleaned table. save_output writes to ../data/flight_data_weather.csv,
# the same file this notebook loads at the top, so the untrimmed checkpoint
# (including the ICAO code columns dropped above) is overwritten.
save_output(trimmed_count_df)