# Imports

In [1]:
import pandas as pd
import time
import requests
import json
import numpy as np

# Load data

In [2]:
# Read the cleaned, non-cancelled flights. The four timestamp columns are
# parsed with an explicit day-month-year + HHMM format, and the stray
# 'Unnamed: 0' column is discarded in the same expression.
df_unprocessed = pd.read_csv(
    'df_cleaned_non_cancelled.csv',
    parse_dates=['DepDateTime', 'ArrDateTime', 'CRSArrDateTime', 'CRSDepDateTime'],
    date_format='%d-%m-%Y %H%M',
).drop(columns=['Unnamed: 0'])

# Add Airport Coord + Region

In [16]:
# Load the airport-code -> {latitude, longitude, region} lookup from disk.
# The context manager closes the handle even when json.load raises. The name
# airport_coord_file stays bound afterwards, so the explicit .close() at the
# end of this cell still runs (a no-op on an already-closed handle).
with open("airport_coord.json") as airport_coord_file:
    coord_data = json.load(airport_coord_file)

# for airport in cleaned_df["Origin"]:
#     for code, info in coord_data.items():
#         if code == airport:
#             cleaned_df["Lat"] = info.get("latitude")
#             cleaned_df["Long"] = info.get("longitude")
#             cleaned_df["Region"] = info.get("region")

def add_airport_info(cleaned_df, coord_data):
    """Attach origin-airport latitude, longitude and region to each flight.

    Parameters
    ----------
    cleaned_df : pandas.DataFrame
        Flights table with an ``Origin`` column of airport codes. It is
        modified in place: ``Lat``, ``Long`` and ``Region`` are written for
        every row whose origin code is a key of ``coord_data``.
    coord_data : dict
        Maps airport code -> dict with ``'latitude'``, ``'longitude'`` and
        ``'region'`` entries. A missing entry yields ``None`` for that field.

    Returns
    -------
    pandas.DataFrame
        The same ``cleaned_df`` object that was passed in.

    Notes
    -----
    Rows whose origin is not in ``coord_data`` are not written to, so a newly
    created column holds NaN there. The same mapped path is used whatever the
    dtype of ``Origin``; the previous non-string branch only set ``Region``
    for the final row and raised ``KeyError`` when that row was unknown.
    """
    origins = cleaned_df['Origin']
    # Only rows with a known airport code are assigned.
    known = origins.isin(list(coord_data))

    for column, field in (('Lat', 'latitude'), ('Long', 'longitude'), ('Region', 'region')):
        # One flat code -> value dict per output column, applied in one pass.
        values_by_code = {code: info.get(field) for code, info in coord_data.items()}
        cleaned_df.loc[known, column] = origins[known].map(values_by_code)

    return cleaned_df

# Copies of the loaded frame and of the lookup are passed in, so the
# enrichment never writes into df_unprocessed or coord_data themselves.
cleaned_df = add_airport_info(
    cleaned_df=df_unprocessed.copy(),
    coord_data=coord_data.copy(),
)
print(cleaned_df)

# Release the lookup file handle opened at the top of this cell.
airport_coord_file.close()

         TotalDelayDuration  ActualElapsedTime  AirTime  ArrDelay  ArrTime  \
0                       9.0              154.0    122.0      90.0   1850.0   
1                      -1.0              293.0    272.0      -1.0   1543.0   
2                       1.0              121.0    101.0      -6.0    809.0   
3                       6.0              162.0    142.0       5.0   1001.0   
4                     -21.0              113.0     92.0     -25.0   1534.0   
...                     ...                ...      ...       ...      ...   
2681775                 7.0              127.0    111.0      14.0   1624.0   
2681776                12.0              228.0    205.0      20.0   1651.0   
2681777               -19.0              183.0    164.0     -23.0   1604.0   
2681778                 1.0              124.0     98.0      -6.0   1042.0   
2681779                 7.0              157.0    124.0      18.0      3.0   

         CRSArrTime  CRSDepTime  CRSElapsedTime  CancellationCo

In [18]:
cleaned_df.iloc[1]

TotalDelayDuration                   -1.0
ActualElapsedTime                   293.0
AirTime                             272.0
ArrDelay                             -1.0
ArrTime                            1543.0
CRSArrTime                           1544
CRSDepTime                            750
CRSElapsedTime                      294.0
CancellationCode                      NaN
Cancelled                               0
CarrierDelay                          0.0
DayOfWeek                               2
DayOfMonthDep                           8
DepDelay                              0.0
DepTime                             750.0
Dest                                  IAD
Distance                           2288.0
Diverted                                0
FlightNum                             946
LateAircraftDelay                     0.0
MonthDep                                5
NASDelay                              0.0
Origin                                LAX
SecurityDelay                     

In [None]:
# Independent working copy for the cells below; cleaned_df stays as produced above.
df = cleaned_df.copy(deep=True)

# Airport Coord

In [None]:
import os

import requests

# The API key is read from the environment rather than stored in the notebook:
# export API_NINJAS_KEY before running this cell. Any key that was previously
# committed with this file should be treated as leaked and rotated.
header = {"X-Api-Key": os.environ["API_NINJAS_KEY"]}

airport_coord = {}   # IATA code -> {'latitude', 'longitude', 'region'}
list_failed = []     # IATA codes for which no usable record came back

for iata in list(df.Origin.unique()):
    url = f'https://api.api-ninjas.com/v1/airports?iata={iata}'

    data = requests.get(url, headers=header).json()
    try:
        # Only the first record of the response is used.
        first_match = data[0]
        airport_coord[iata] = {
            'latitude' : first_match['latitude'],
            'longitude' : first_match['longitude'],
            'region' : first_match['region']
        }
    except (IndexError, KeyError, TypeError):
        # Empty result list, a non-list payload, or a record missing a field:
        # report the code and remember it so it can be filled in by hand.
        print(f"iata : {iata}")
        list_failed.append(iata)
airport_coord

In [None]:
list_failed

In [None]:
# Hand-entered coordinates for one airport code.
# NOTE(review): presumably a code the API lookup could not resolve — confirm against list_failed.
code = 'MTH'

airport_coord[code] = dict(
    latitude='24.722330444',
    longitude='-81.05083313',
    region='Florida',
)

In [None]:
# Print every origin code that still has no entry in airport_coord.
for i in df.Origin.unique():
    if i in airport_coord:
        continue
    print(i)

In [None]:
airport_coord

In [None]:
import json

# Persist the coordinate lookup as JSON text (note the .txt extension).
with open('airport_coord.txt', 'w') as f:
    payload = json.dumps(airport_coord)
    f.write(payload)

# Weather

2003 - 2008

In [None]:
# {
#     'airport': [
#         'date' : {
#             'weather_code'
#             'precipitation'
#             'snowfall'
#             'windspeed'
#         }
#     ]
# }

In [None]:
import json

# Reload the saved coordinate lookup so this section can run without the API cell.
with open('airport_coord.txt', 'r') as f:
    airport_coord = json.loads(f.read())

In [None]:
import time

# airport code -> {date string -> daily readings}
airport_weather_dict = {}

for airport in airport_coord:
#     if airport not in list_airports_remaining:
#         continue
    lat = airport_coord[airport]['latitude']
    long = airport_coord[airport]['longitude']
    print(f"Getting data for {airport} at {lat}, {long}...")
    url = f'https://archive-api.open-meteo.com/v1/archive?start_date=2003-01-01&end_date=2009-01-01&latitude={lat}&longitude={long}&daily=weather_code,precipitation_hours,snowfall_sum,wind_speed_10m_max'

    # Up to three attempts: immediately, after 65 s, then after one hour.
    # Every attempt must yield a 'daily' section; a payload without it is
    # treated like any other failed attempt, so the long back-off is reached.
    daily = None
    last_error = None
    for wait_seconds, notice in (
        (0, None),
        (65, "\n\nAPI limit hit...sleeping for 1 min...\n\n"),
        (3600, "\n\nAPI limit hit...sleeping for 1 hour...\n\n"),
    ):
        if notice is not None:
            print(notice)
            time.sleep(wait_seconds)
        try:
            daily = requests.get(url).json()['daily']
            break
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as error:
            # Network failure, non-JSON body, or a payload without 'daily'.
            last_error = error

    if daily is None:
        raise RuntimeError(f"No daily weather returned for {airport} after retries") from last_error

    # The entry is stored only once the data is complete, so a failed airport
    # never leaves an empty placeholder behind.
    airport_weather_dict[airport] = {
        date: {
            'weather_code' : weather_code,
            'precipitation_hours': precipitation_hours,
            'snowfall_sum' : snowfall_sum,
            'wind_speed' : wind_speed
        }
        for date, weather_code, precipitation_hours, snowfall_sum, wind_speed in zip(
            daily['time'],
            daily['weather_code'],
            daily['precipitation_hours'],
            daily['snowfall_sum'],
            daily['wind_speed_10m_max'],
        )
    }

In [None]:
len(airport_weather_dict)

In [None]:
# Origin codes in order of first appearance, then the tail starting at 'PWM'
# (the slice includes 'PWM' itself).
list_airports = df.Origin.unique().tolist()
i = list_airports.index('PWM')
list_airports_remaining = list_airports[i:]
len(list_airports_remaining)

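The API limit was hit during the download above. The cell below resumes fetching weather only for the airports from `PWM` onward.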

In [None]:
# Resume the weather download: only airports listed in
# list_airports_remaining are fetched; existing entries for other airports
# in airport_weather_dict are left as they are.
for airport in airport_coord:
    if airport not in list_airports_remaining:
        continue

    lat = airport_coord[airport]['latitude']
    long = airport_coord[airport]['longitude']
    print(f"Getting data for {airport} at {lat}, {long}...")
    url = f'https://archive-api.open-meteo.com/v1/archive?start_date=2003-01-01&end_date=2009-01-01&latitude={lat}&longitude={long}&daily=weather_code,precipitation_hours,snowfall_sum,wind_speed_10m_max'

    # Up to three attempts: immediately, after 65 s, then after one hour.
    # Every attempt must yield a 'daily' section; a payload without it is
    # treated like any other failed attempt, so the long back-off is reached.
    daily = None
    last_error = None
    for wait_seconds, notice in (
        (0, None),
        (65, "\n\nAPI limit hit...sleeping for 1 min...\n\n"),
        (3600, "\n\nAPI limit hit...sleeping for 1 hour...\n\n"),
    ):
        if notice is not None:
            print(notice)
            time.sleep(wait_seconds)
        try:
            daily = requests.get(url).json()['daily']
            break
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as error:
            # Network failure, non-JSON body, or a payload without 'daily'.
            last_error = error

    if daily is None:
        raise RuntimeError(f"No daily weather returned for {airport} after retries") from last_error

    # The entry is stored only once the data is complete, so a failed airport
    # never leaves an empty placeholder behind.
    airport_weather_dict[airport] = {
        date: {
            'weather_code' : weather_code,
            'precipitation_hours': precipitation_hours,
            'snowfall_sum' : snowfall_sum,
            'wind_speed' : wind_speed
        }
        for date, weather_code, precipitation_hours, snowfall_sum, wind_speed in zip(
            daily['time'],
            daily['weather_code'],
            daily['precipitation_hours'],
            daily['snowfall_sum'],
            daily['wind_speed_10m_max'],
        )
    }

In [None]:
import json

# Save the per-airport weather lookup so the download does not need to be repeated.
with open('airport_weather_final.json', 'w') as f:
    f.write(json.dumps(airport_weather_dict))

# Map weather to flights

In [4]:
# Reduce the flights table to the columns kept for the weather join.
# CRSDepDateTime is deliberately retained: the weather lookup below keys on
# its calendar date.
df_weather = df.drop(
    labels=[
        # delay totals and per-cause breakdowns
        'TotalDelayDuration', 'ArrDelay', 'CarrierDelay', 'LateAircraftDelay',
        'NASDelay', 'SecurityDelay', 'WeatherDelay',
        # clock times and durations
        'ArrTime', 'DepTime', 'AirTime', 'CRSArrTime',
        'ActualElapsedTime', 'CRSElapsedTime',
        # flight identity / status
        'Cancelled', 'CancellationCode', 'DayOfWeek', 'Dest', 'Diverted',
        'FlightNum', 'TailNum', 'TaxiIn', 'UniqueCarrier',
        # arrival date parts and full timestamps
        'MonthArr', 'YearArr', 'DepDateTime', 'ArrDateTime', 'CRSArrDateTime',
        #    'CRSDepDateTime',
        'CRSDayOfMonthArr', 'CRSMonthArr', 'CRSYearArr',
    ],
    axis='columns',
)

# Departure delay, pulled out of the reduced frame.
target = df_weather['DepDelay']

In [None]:
df.iloc[1]

In [5]:
# Create the four weather columns filled with '' — the add_* helpers below
# treat '' as "not looked up yet".
for weather_column in ('weather_code', 'precipitation_hours', 'snowfall_sum', 'wind_speed'):
    df_weather[weather_column] = ''

In [6]:
df_weather.iloc[1]

# Drop types of delays -> focus only on dep delay
# Drop all date time columns -> Delay unlikely to be affected
# Drop tail num, Taxi in, Flight num, Dest, Diverted

CRSDepTime                             750
DayOfMonthDep                            8
DepDelay                               0.0
Distance                            2288.0
MonthDep                                 5
Origin                                 LAX
TaxiOut                               13.0
YearDep                               2007
DayOfMonthArr                            8
CRSDayOfMonthDep                         8
CRSMonthDep                              5
CRSYearDep                            2007
CRSDepDateTime         2007-05-08 07:50:00
weather_code                              
precipitation_hours                       
snowfall_sum                              
wind_speed                                
Name: 1, dtype: object

In [7]:
str(df_weather.iloc[1]['CRSDepDateTime'].date())

'2007-05-08'

In [8]:
# Load the saved per-airport weather lookup.
with open('airport_weather_final.json', 'r') as f:
    airport_weather_dict = json.loads(f.read())

In [None]:
airport_weather_dict['ORD']

In [11]:
def add_weather_code(airport_weather_dict, Origin, CRSDepDateTime, weather_code):
    """Return the weather code for an origin airport on the scheduled departure date.

    Parameters
    ----------
    airport_weather_dict : dict
        Maps airport code to that airport's weather. The per-airport value may
        be a single ``{date: readings}`` dict or a list of such dicts.
    Origin : hashable
        Airport code used to index ``airport_weather_dict``.
    CRSDepDateTime : datetime-like
        Object with a ``.date()`` method; ``str()`` of that date is the lookup key.
    weather_code : object
        Current value. Anything other than ``''`` is returned unchanged.

    Returns
    -------
    object
        The stored ``'weather_code'`` for that date, or the incoming value when
        it is already filled or the date is not present.

    Raises
    ------
    KeyError
        If ``Origin`` is not a key of ``airport_weather_dict``.
    """
    if weather_code != '':
        return weather_code

    per_airport = airport_weather_dict[Origin]
    # A bare {date: readings} dict used to be iterated key by key and failed
    # with TypeError; wrap it so both layouts take the same path.
    date_maps = [per_airport] if isinstance(per_airport, dict) else per_airport
    dep_date = str(CRSDepDateTime.date())

    # Scan from the end so the last mapping holding the date wins, and stop
    # at the first hit instead of walking the whole list.
    for date_map in reversed(date_maps):
        if dep_date in date_map:
            return date_map[dep_date]['weather_code']

    return weather_code

# Fill weather_code row by row through np.vectorize (object output) and
# report the wall-clock time of the pass.
start_time = time.time()
v_add_weather = np.vectorize(add_weather_code, otypes=[object])

df_weather['weather_code'] = v_add_weather(
    airport_weather_dict,
    df_weather['Origin'],
    df_weather['CRSDepDateTime'],
    df_weather['weather_code'],
)
print(f"TOTAL TIME TAKEN: {time.time() - start_time}")

TOTAL TIME TAKEN: 461.5535967350006


In [12]:
def add_precipitation_hours(airport_weather_dict, Origin, CRSDepDateTime, precipitation_hours):
    """Return the precipitation hours for an origin airport on the scheduled departure date.

    Parameters
    ----------
    airport_weather_dict : dict
        Maps airport code to that airport's weather. The per-airport value may
        be a single ``{date: readings}`` dict or a list of such dicts.
    Origin : hashable
        Airport code used to index ``airport_weather_dict``.
    CRSDepDateTime : datetime-like
        Object with a ``.date()`` method; ``str()`` of that date is the lookup key.
    precipitation_hours : object
        Current value. Anything other than ``''`` is returned unchanged.

    Returns
    -------
    object
        The stored ``'precipitation_hours'`` for that date, or the incoming
        value when it is already filled or the date is not present.

    Raises
    ------
    KeyError
        If ``Origin`` is not a key of ``airport_weather_dict``.
    """
    if precipitation_hours != '':
        return precipitation_hours

    per_airport = airport_weather_dict[Origin]
    # A bare {date: readings} dict used to be iterated key by key and failed
    # with TypeError; wrap it so both layouts take the same path.
    date_maps = [per_airport] if isinstance(per_airport, dict) else per_airport
    dep_date = str(CRSDepDateTime.date())

    # Scan from the end so the last mapping holding the date wins, and stop
    # at the first hit instead of walking the whole list.
    for date_map in reversed(date_maps):
        if dep_date in date_map:
            return date_map[dep_date]['precipitation_hours']

    return precipitation_hours

# Fill precipitation_hours row by row through np.vectorize (object output)
# and report the wall-clock time of the pass.
start_time = time.time()
v_add_weather = np.vectorize(add_precipitation_hours, otypes=[object])

df_weather['precipitation_hours'] = v_add_weather(
    airport_weather_dict,
    df_weather['Origin'],
    df_weather['CRSDepDateTime'],
    df_weather['precipitation_hours'],
)
print(f"TOTAL TIME TAKEN: {time.time() - start_time}")

TOTAL TIME TAKEN: 490.17233967781067


In [13]:
def add_snowfall_sum(airport_weather_dict, Origin, CRSDepDateTime, snowfall_sum):
    """Return the snowfall sum for an origin airport on the scheduled departure date.

    Parameters
    ----------
    airport_weather_dict : dict
        Maps airport code to that airport's weather. The per-airport value may
        be a single ``{date: readings}`` dict or a list of such dicts.
    Origin : hashable
        Airport code used to index ``airport_weather_dict``.
    CRSDepDateTime : datetime-like
        Object with a ``.date()`` method; ``str()`` of that date is the lookup key.
    snowfall_sum : object
        Current value. Anything other than ``''`` is returned unchanged.

    Returns
    -------
    object
        The stored ``'snowfall_sum'`` for that date, or the incoming value when
        it is already filled or the date is not present.

    Raises
    ------
    KeyError
        If ``Origin`` is not a key of ``airport_weather_dict``.
    """
    if snowfall_sum != '':
        return snowfall_sum

    per_airport = airport_weather_dict[Origin]
    # A bare {date: readings} dict used to be iterated key by key and failed
    # with TypeError; wrap it so both layouts take the same path.
    date_maps = [per_airport] if isinstance(per_airport, dict) else per_airport
    dep_date = str(CRSDepDateTime.date())

    # Scan from the end so the last mapping holding the date wins, and stop
    # at the first hit instead of walking the whole list.
    for date_map in reversed(date_maps):
        if dep_date in date_map:
            return date_map[dep_date]['snowfall_sum']

    return snowfall_sum

# Fill snowfall_sum row by row through np.vectorize (object output) and
# report the wall-clock time of the pass.
start_time = time.time()
v_add_weather = np.vectorize(add_snowfall_sum, otypes=[object])

df_weather['snowfall_sum'] = v_add_weather(
    airport_weather_dict,
    df_weather['Origin'],
    df_weather['CRSDepDateTime'],
    df_weather['snowfall_sum'],
)
print(f"TOTAL TIME TAKEN: {time.time() - start_time}")

TOTAL TIME TAKEN: 481.87537455558777


In [14]:
def add_wind_speed(airport_weather_dict, Origin, CRSDepDateTime, wind_speed):
    """Return the wind speed for an origin airport on the scheduled departure date.

    Parameters
    ----------
    airport_weather_dict : dict
        Maps airport code to that airport's weather. The per-airport value may
        be a single ``{date: readings}`` dict or a list of such dicts.
    Origin : hashable
        Airport code used to index ``airport_weather_dict``.
    CRSDepDateTime : datetime-like
        Object with a ``.date()`` method; ``str()`` of that date is the lookup key.
    wind_speed : object
        Current value. Anything other than ``''`` is returned unchanged.

    Returns
    -------
    object
        The stored ``'wind_speed'`` for that date, or the incoming value when
        it is already filled or the date is not present.

    Raises
    ------
    KeyError
        If ``Origin`` is not a key of ``airport_weather_dict``.
    """
    if wind_speed != '':
        return wind_speed

    per_airport = airport_weather_dict[Origin]
    # A bare {date: readings} dict used to be iterated key by key and failed
    # with TypeError; wrap it so both layouts take the same path.
    date_maps = [per_airport] if isinstance(per_airport, dict) else per_airport
    dep_date = str(CRSDepDateTime.date())

    # Scan from the end so the last mapping holding the date wins, and stop
    # at the first hit instead of walking the whole list.
    for date_map in reversed(date_maps):
        if dep_date in date_map:
            return date_map[dep_date]['wind_speed']

    return wind_speed

# Fill wind_speed row by row through np.vectorize (object output) and
# report the wall-clock time of the pass.
start_time = time.time()
v_add_weather = np.vectorize(add_wind_speed, otypes=[object])
df_weather['wind_speed'] = v_add_weather(
    airport_weather_dict,
    df_weather['Origin'],
    df_weather['CRSDepDateTime'],
    df_weather['wind_speed'],
)
print(f"TOTAL TIME TAKEN: {time.time() - start_time}")

TOTAL TIME TAKEN: 515.2350189685822


In [15]:
df_weather.iloc[1]

CRSDepTime                             750
DayOfMonthDep                            8
DepDelay                               0.0
Distance                            2288.0
MonthDep                                 5
Origin                                 LAX
TaxiOut                               13.0
YearDep                               2007
DayOfMonthArr                            8
CRSDayOfMonthDep                         8
CRSMonthDep                              5
CRSYearDep                            2007
CRSDepDateTime         2007-05-08 07:50:00
weather_code                             0
precipitation_hours                    0.0
snowfall_sum                           0.0
wind_speed                            27.4
Name: 1, dtype: object

In [29]:
df_weather

Unnamed: 0,CRSDepTime,DayOfMonthDep,DepDelay,Distance,MonthDep,Origin,TaxiOut,YearDep,DayOfMonthArr,CRSDayOfMonthDep,CRSMonthDep,CRSYearDep,CRSDepDateTime,weather_code,precipitation_hours,snowfall_sum,wind_speed
0,1455,21,81.0,802.0,6,ORD,24.0,2006,21,21,6,2006,2006-06-21 14:55:00,55,5.0,0.0,25.2
1,750,8,0.0,2288.0,5,LAX,13.0,2007,8,8,5,2007,2007-05-08 07:50:00,0,0.0,0.0,27.4
2,715,16,-7.0,677.0,3,HOU,6.0,2007,16,16,3,2007,2007-03-16 07:15:00,2,0.0,0.0,23.3
3,820,22,-1.0,872.0,10,MEM,11.0,2006,22,22,10,2006,2006-10-22 08:20:00,3,0.0,0.0,20.1
4,1345,15,-4.0,745.0,8,ATL,10.0,2004,15,15,8,2004,2004-08-15 13:45:00,51,2.0,0.0,10.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2681775,1310,12,7.0,717.0,12,ORD,13.0,2006,12,12,12,2006,2006-12-12 13:10:00,53,7.0,0.0,19.9
2681776,955,24,8.0,1587.0,10,PHX,13.0,2007,24,24,10,2007,2007-10-24 09:55:00,0,0.0,0.0,24.0
2681777,1405,2,-4.0,1235.0,4,BWI,10.0,2004,2,2,4,2004,2004-04-02 14:05:00,53,23.0,0.0,12.5
2681778,745,30,-7.0,804.0,8,RNO,10.0,2006,30,30,8,2006,2006-08-30 07:45:00,0,0.0,0.0,17.5


In [19]:
# df_weather.to_csv("delayed flights with region and weather.csv")