## Setup

In [151]:
import pandas as pd

## Read data

### Weather stations

Weather stations dataset contains ..

In [152]:
weather_stations = pd.read_fwf('https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt', 
                               header=None, colspecs=[(0, 11), (12, 20), (21, 30), (31, 37), (38, 40), (41, 71), (72, 75), (76, 79), (80, 85)], 
                               names=['id', 'lat', 'lon', 'elev', 'state', 'name', 'gsn_flag', 'hcn_flag', 'wmo_id'])

In [153]:
weather_stations = weather_stations.loc[weather_stations['state'].notna()].reset_index(drop=True)
weather_stations.dropna(subset=['lat', 'lon'], inplace=True)
weather_stations.drop(weather_stations[weather_stations['lat'] == 0].index, inplace=True)
weather_stations.drop(weather_stations[weather_stations['lon'] == 0].index, inplace=True)
weather_stations.dropna(subset=['gsn_flag', 'hcn_flag'], how='all', inplace=True)

### Additional airport information
The additional airport information contains geographic coordinates which will be used to determine the closest weather station.

Source: https://data.world/ourairports/989444cc-447b-4030-a866-57fcd6c2d3ee

In [154]:
airport_loc_df = pd.read_csv('https://query.data.world/s/umfxdv54vxyao3jnuhkfr7g6a7ely7?dws=00000', usecols=['latitude_deg', 'longitude_deg', 'iso_country', 'iso_region', 'iata_code'])

Since we are only interested in USA, we filter out other countries.

In [155]:
airport_loc_df = airport_loc_df.loc[airport_loc_df['iso_country'] == 'US'].reset_index(drop=True)
airport_loc_df['state'] = airport_loc_df['iso_region'].str.split('-').str[-1]
airport_loc_df.drop(columns=['iso_country', 'iso_region'], inplace=True)

The capital of USA is Wahington D.C., which is technically not a state, but a federal district is registered as being.

In [156]:
airport_loc_df['state'] = airport_loc_df['state'].replace('DC', 'VA')

In [157]:
airport_loc_df.rename({'latitude_deg': 'Latitude', 'longitude_deg': 'Longitude', 'iata_code': '', 'state': 'State'}, axis=1, inplace=True)

### Flight data

We read the flight data and 

In [150]:
combined_data_full = pd.read_parquet('../datasets/processed/combined_without_encoding.parquet')

In [158]:
combined_data = combined_data_full.head(100000)

## Find closest weather stations 

In [159]:
combined_data = combined_data.merge(airport_loc_df.add_prefix('Origin'), left_on=['Origin', 'OriginState'], right_on=['Origin', 'OriginState'], how='left')
combined_data = combined_data.merge(airport_loc_df.add_prefix('Dest'), left_on=['Dest', 'DestState'], right_on=['Dest', 'DestState'], how='left')

In [160]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102645 entries, 0 to 102644
Data columns (total 28 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   OriginCityName     102645 non-null  category
 1   DayofMonth         102645 non-null  uint8   
 2   OriginAirport      102645 non-null  category
 3   Cancelled          102645 non-null  bool    
 4   DestCityName       102645 non-null  category
 5   Airline            102645 non-null  category
 6   OriginState        102645 non-null  object  
 7   Origin             102645 non-null  object  
 8   DestState          102645 non-null  object  
 9   DestAirport        102645 non-null  category
 10  ArrDelay           102645 non-null  uint16  
 11  Dest               102645 non-null  object  
 12  DayOfWeek          102645 non-null  uint8   
 13  Diverted           102645 non-null  bool    
 14  CRSElapsedTime     102645 non-null  int16   
 15  ArrTime            102645 non-null

In [161]:
combined_data = combined_data[combined_data['Year'] == 2018]
combined_data

Unnamed: 0,OriginCityName,DayofMonth,OriginAirport,Cancelled,DestCityName,Airline,OriginState,Origin,DestState,DestAirport,...,CRSArrTime,Distance,CRSDepTime,Year,Month,ActualElapsedTime,OriginLatitude,OriginLongitude,DestLatitude,DestLongitude
0,Albany,23,Southwest Georgia Regional,False,Atlanta,Endeavor Air Inc.,GA,ABY,GA,Hartsfield-Jackson Atlanta International,...,1304,145,1202,2018,1,59,31.535499572753906,-84.19450378417969,33.63669967651367,-84.4281005859375
1,Albany,24,Southwest Georgia Regional,False,Atlanta,Endeavor Air Inc.,GA,ABY,GA,Hartsfield-Jackson Atlanta International,...,1304,145,1202,2018,1,61,31.535499572753906,-84.19450378417969,33.63669967651367,-84.4281005859375
2,Albany,25,Southwest Georgia Regional,False,Atlanta,Endeavor Air Inc.,GA,ABY,GA,Hartsfield-Jackson Atlanta International,...,1304,145,1202,2018,1,69,31.535499572753906,-84.19450378417969,33.63669967651367,-84.4281005859375
3,Albany,26,Southwest Georgia Regional,False,Atlanta,Endeavor Air Inc.,GA,ABY,GA,Hartsfield-Jackson Atlanta International,...,1304,145,1202,2018,1,63,31.535499572753906,-84.19450378417969,33.63669967651367,-84.4281005859375
4,Albany,27,Southwest Georgia Regional,False,Atlanta,Endeavor Air Inc.,GA,ABY,GA,Hartsfield-Jackson Atlanta International,...,1500,145,1400,2018,1,64,31.535499572753906,-84.19450378417969,33.63669967651367,-84.4281005859375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102640,Atlanta,10,Hartsfield-Jackson Atlanta International,False,Sarasota/Bradenton,Delta Air Lines Inc.,GA,ATL,FL,Sarasota/Bradenton International,...,1157,444,1025,2018,1,86,33.63669967651367,-84.4281005859375,27.39539909362793,-82.55439758300781
102641,Sarasota/Bradenton,10,Sarasota/Bradenton International,False,Atlanta,Delta Air Lines Inc.,FL,SRQ,GA,Hartsfield-Jackson Atlanta International,...,1420,444,1237,2018,1,92,27.39539909362793,-82.55439758300781,33.63669967651367,-84.4281005859375
102642,Washington,10,Ronald Reagan Washington National,False,Los Angeles,Delta Air Lines Inc.,VA,DCA,CA,Los Angeles International,...,2035,2311,1730,2018,1,349,38.8521,-77.037697,33.94250107,-118.4079971
102643,Minneapolis,10,Minneapolis-St Paul International,False,Fort Myers,Delta Air Lines Inc.,MN,MSP,FL,Southwest Florida International,...,1734,1416,1306,2018,1,185,44.881999969499994,-93.22180175780001,26.53619956970215,-81.75520324707031


In [162]:
weather = pd.read_csv('~/Downloads/2018.csv', header=None, names=['id', 'date', 'element', 'value', 'm_flag', 'q_flag', 's_flag', 'obs_time'])

In [163]:
weather['date'] = weather['date'].astype(str)
weather['Year'] = weather['date'].str[:4].astype(int)
weather['Month'] = weather['date'].str[4:6].astype(int)
weather['DayofMonth'] = weather['date'].str[6:8].astype(int)

In [164]:
weather.drop(columns=['date', 'm_flag', 'q_flag', 's_flag', 'obs_time'], inplace=True)

In [165]:
weather = weather.pivot_table('value', ['id', 'Year', 'Month', 'DayofMonth'], 'element').reset_index()

In [166]:
weather = weather[['id', 'Year', 'Month', 'DayofMonth', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN']]

In [167]:
weather

element,id,Year,Month,DayofMonth,PRCP,SNOW,SNWD,TMAX,TMIN
0,AE000041196,2018,1,1,,,,259.0,112.0
1,AE000041196,2018,1,2,,,,,
2,AE000041196,2018,1,3,,,,,
3,AE000041196,2018,1,4,,,,,
4,AE000041196,2018,1,5,,,,,
...,...,...,...,...,...,...,...,...,...
11882852,ZI000067983,2018,9,29,,,,281.0,166.0
11882853,ZI000067983,2018,10,20,0.0,,,270.0,152.0
11882854,ZI000067983,2018,11,12,,,,249.0,119.0
11882855,ZI000067983,2018,11,17,,,,,176.0


In [168]:
weather.tail()

element,id,Year,Month,DayofMonth,PRCP,SNOW,SNWD,TMAX,TMIN
11882852,ZI000067983,2018,9,29,,,,281.0,166.0
11882853,ZI000067983,2018,10,20,0.0,,,270.0,152.0
11882854,ZI000067983,2018,11,12,,,,249.0,119.0
11882855,ZI000067983,2018,11,17,,,,,176.0
11882856,ZI000067983,2018,12,3,,,,,155.0


In [169]:
def get_closest_station(lanitude, longitude):
    temp = weather_stations
    temp['distance'] = ((weather_stations['lat'] - lanitude)**2 + (weather_stations['lon'] - longitude)**2)**0.5
    return temp.loc[temp['distance'].idxmin()]['id']

In [170]:
combined_data.dropna(subset=['OriginLatitude', 'OriginLongitude', 'DestLatitude', 'DestLongitude'], inplace=True)
weather_stations.dropna(subset=['lat', 'lon'], inplace=True)

In [171]:
combined_data['OriginLatitude'] = combined_data['OriginLatitude'].astype(float)
combined_data['OriginLongitude'] = combined_data['OriginLongitude'].astype(float)

combined_data['DestLatitude'] = combined_data['DestLatitude'].astype(float)
combined_data['DestLongitude'] = combined_data['DestLongitude'].astype(float)

In [172]:
weather_stations['lat'] = weather_stations['lat'].astype(float)
weather_stations['lon'] = weather_stations['lon'].astype(float)

In [173]:
combined_data['OriginStationId'] = combined_data.apply(lambda x: get_closest_station(x['OriginLatitude'], x['OriginLongitude']), axis=1)

In [174]:
combined_data['DestStationId'] = combined_data.apply(lambda x: get_closest_station(x['DestLatitude'], x['DestLongitude']), axis=1)

In [175]:
combined_data = combined_data.join(weather.set_index(['id', 'Year', 'Month', 'DayofMonth']).add_prefix('Origin'), on=['OriginStationId', 'Year', 'Month', 'DayofMonth'], how='left')

In [176]:
combined_data = combined_data.join(weather.set_index(['id', 'Year', 'Month', 'DayofMonth']).add_prefix('Dest'), on=['DestStationId', 'Year', 'Month', 'DayofMonth'], how='left')

In [177]:
combined_data

Unnamed: 0,OriginCityName,DayofMonth,OriginAirport,Cancelled,DestCityName,Airline,OriginState,Origin,DestState,DestAirport,...,OriginPRCP,OriginSNOW,OriginSNWD,OriginTMAX,OriginTMIN,DestPRCP,DestSNOW,DestSNWD,DestTMAX,DestTMIN
0,Albany,23,Southwest Georgia Regional,False,Atlanta,Endeavor Air Inc.,GA,ABY,GA,Hartsfield-Jackson Atlanta International,...,5.0,0.0,0.0,217.0,94.0,191.0,0.0,0.0,172.0,33.0
1,Albany,24,Southwest Georgia Regional,False,Atlanta,Endeavor Air Inc.,GA,ABY,GA,Hartsfield-Jackson Atlanta International,...,0.0,0.0,0.0,217.0,22.0,0.0,0.0,0.0,139.0,-11.0
2,Albany,25,Southwest Georgia Regional,False,Atlanta,Endeavor Air Inc.,GA,ABY,GA,Hartsfield-Jackson Atlanta International,...,0.0,0.0,0.0,133.0,22.0,0.0,0.0,0.0,133.0,-39.0
3,Albany,26,Southwest Georgia Regional,False,Atlanta,Endeavor Air Inc.,GA,ABY,GA,Hartsfield-Jackson Atlanta International,...,0.0,0.0,0.0,161.0,-17.0,0.0,0.0,0.0,156.0,-39.0
4,Albany,27,Southwest Georgia Regional,False,Atlanta,Endeavor Air Inc.,GA,ABY,GA,Hartsfield-Jackson Atlanta International,...,0.0,0.0,0.0,178.0,-11.0,0.0,0.0,0.0,161.0,-39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102640,Atlanta,10,Hartsfield-Jackson Atlanta International,False,Sarasota/Bradenton,Delta Air Lines Inc.,GA,ATL,FL,Sarasota/Bradenton International,...,0.0,0.0,0.0,172.0,,0.0,0.0,0.0,261.0,194.0
102641,Sarasota/Bradenton,10,Sarasota/Bradenton International,False,Atlanta,Delta Air Lines Inc.,FL,SRQ,GA,Hartsfield-Jackson Atlanta International,...,0.0,0.0,0.0,261.0,194.0,0.0,0.0,0.0,172.0,
102642,Washington,10,Ronald Reagan Washington National,False,Los Angeles,Delta Air Lines Inc.,VA,DCA,CA,Los Angeles International,...,0.0,0.0,0.0,61.0,-27.0,254.0,,,122.0,67.0
102643,Minneapolis,10,Minneapolis-St Paul International,False,Fort Myers,Delta Air Lines Inc.,MN,MSP,FL,Southwest Florida International,...,0.0,0.0,30.0,33.0,6.0,0.0,,,272.0,167.0


In [178]:
combined_data.isna().sum()

OriginCityName           0
DayofMonth               0
OriginAirport            0
Cancelled                0
DestCityName             0
Airline                  0
OriginState              0
Origin                   0
DestState                0
DestAirport              0
ArrDelay                 0
Dest                     0
DayOfWeek                0
Diverted                 0
CRSElapsedTime           0
ArrTime                  0
DepDelay                 0
DepTime                  0
CRSArrTime               0
Distance                 0
CRSDepTime               0
Year                     0
Month                    0
ActualElapsedTime        0
OriginLatitude           0
OriginLongitude          0
DestLatitude             0
DestLongitude            0
OriginStationId          0
DestStationId            0
OriginPRCP           16900
OriginSNOW           50729
OriginSNWD           50840
OriginTMAX           16823
OriginTMIN           17187
DestPRCP             17001
DestSNOW             50999
D

In [179]:
len(combined_data)

101925

In [183]:
combined_data.drop(columns=['OriginSNOW', 'OriginSNWD', 'DestSNOW', 'DestSNWD'], inplace=True)

In [184]:
combined_data.drop(columns=['Cancelled', 'Diverted'], inplace=True)

In [185]:
combined_data.drop(columns=['OriginStationId', 'DestStationId'], inplace=True)

In [186]:
combined_data.dropna(inplace=True)

In [187]:
for column in combined_data.select_dtypes(include=['object']).columns:
    combined_data[column] = combined_data[column].astype('category')

In [188]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69717 entries, 0 to 102644
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   OriginCityName     69717 non-null  category
 1   DayofMonth         69717 non-null  uint8   
 2   OriginAirport      69717 non-null  category
 3   DestCityName       69717 non-null  category
 4   Airline            69717 non-null  category
 5   OriginState        69717 non-null  category
 6   Origin             69717 non-null  category
 7   DestState          69717 non-null  category
 8   DestAirport        69717 non-null  category
 9   ArrDelay           69717 non-null  uint16  
 10  Dest               69717 non-null  category
 11  DayOfWeek          69717 non-null  uint8   
 12  CRSElapsedTime     69717 non-null  int16   
 13  ArrTime            69717 non-null  uint16  
 14  DepDelay           69717 non-null  uint16  
 15  DepTime            69717 non-null  uint16  
 16  CRS

In [189]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}

for column in combined_data.select_dtypes(include=['category']).columns:
    encoder = LabelEncoder()
    combined_data[column] = encoder.fit_transform(combined_data[column])
    label_encoders[column] = encoder

## Split dependant and independant variables

In [190]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69717 entries, 0 to 102644
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   OriginCityName     69717 non-null  int64  
 1   DayofMonth         69717 non-null  uint8  
 2   OriginAirport      69717 non-null  int64  
 3   DestCityName       69717 non-null  int64  
 4   Airline            69717 non-null  int64  
 5   OriginState        69717 non-null  int64  
 6   Origin             69717 non-null  int64  
 7   DestState          69717 non-null  int64  
 8   DestAirport        69717 non-null  int64  
 9   ArrDelay           69717 non-null  uint16 
 10  Dest               69717 non-null  int64  
 11  DayOfWeek          69717 non-null  uint8  
 12  CRSElapsedTime     69717 non-null  int16  
 13  ArrTime            69717 non-null  uint16 
 14  DepDelay           69717 non-null  uint16 
 15  DepTime            69717 non-null  uint16 
 16  CRSArrTime         69

In [191]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(combined_data, test_size=0.2, random_state=42)

In [193]:
x_train, y_train = train.drop(columns=['ArrDelay', 'ArrTime', 'DepTime', 'DepDelay', 'OriginLatitude', 'OriginLongitude', 'DestLatitude', 'DestLongitude', 'ActualElapsedTime', 'OriginPRCP', 'OriginTMAX', 'OriginTMIN', 'DestPRCP', 'DestTMAX', 'DestTMIN']), train['ArrDelay']
x_test, y_test = test.drop(columns=['ArrDelay', 'ArrTime', 'DepTime', 'DepDelay', 'OriginLatitude', 'OriginLongitude', 'DestLatitude', 'DestLongitude', 'ActualElapsedTime', 'OriginPRCP', 'OriginTMAX', 'OriginTMIN', 'DestPRCP', 'DestTMAX', 'DestTMIN']), test['ArrDelay']
print('Independant features', ", ".join(x_train.columns.to_list()))
print('Dependant features: ', y_train.name)

Independant features OriginCityName, DayofMonth, OriginAirport, DestCityName, Airline, OriginState, Origin, DestState, DestAirport, Dest, DayOfWeek, CRSElapsedTime, CRSArrTime, Distance, CRSDepTime, Year, Month
Dependant features:  ArrDelay


## Model

In [194]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

In [195]:
rf.fit(x_train, y_train)

In [196]:
predicted = rf.predict(x_test)

In [197]:
y_test

22655      0
7709       6
66482      0
34592      5
93211      4
          ..
56496      0
102404     0
20965     39
13637     26
99952      0
Name: ArrDelay, Length: 13944, dtype: uint16

In [198]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

print('Mean Squared Error:', mean_squared_error(y_test, predicted))
print('Mean Absolute Error:', mean_absolute_error(y_test, predicted))

Mean Squared Error: 2086.6620649261954
Mean Absolute Error: 17.166132029355595


## Model w/ weather

In [200]:
x_train_weather, y_train_weather = train.drop(columns=['ArrDelay', 'ArrTime', 'DepTime', 'DepDelay', 'OriginLatitude', 'OriginLongitude', 'DestLatitude', 'DestLongitude', 'ActualElapsedTime']), train['ArrDelay']
x_test_weather, y_test_weather = test.drop(columns=['ArrDelay', 'ArrTime', 'DepTime', 'DepDelay', 'OriginLatitude', 'OriginLongitude', 'DestLatitude', 'DestLongitude', 'ActualElapsedTime']), test['ArrDelay']
print('Independant features', ", ".join(x_train_weather.columns.to_list()))
print('Dependant features: ', y_train_weather.name)

Independant features OriginCityName, DayofMonth, OriginAirport, DestCityName, Airline, OriginState, Origin, DestState, DestAirport, Dest, DayOfWeek, CRSElapsedTime, CRSArrTime, Distance, CRSDepTime, Year, Month, OriginPRCP, OriginTMAX, OriginTMIN, DestPRCP, DestTMAX, DestTMIN
Dependant features:  ArrDelay


In [201]:
rf_weather = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

In [202]:
rf_weather.fit(x_train_weather, y_train_weather)

In [203]:
predicted = rf_weather.predict(x_test_weather)

In [204]:
print('Mean Squared Error:', mean_squared_error(y_test_weather, predicted))
print('Mean Absolute Error:', mean_absolute_error(y_test_weather, predicted))

Mean Squared Error: 2073.3955867626805
Mean Absolute Error: 16.84577458502321
