In [1]:
#load packages
import pandas as pd 
import numpy as np
import fuzzymatcher
pd.set_option('display.max_columns', 999)

In [2]:
#load data and convert dates to datetime
fw = pd.read_csv("../Datasets/29Nov20-28Feb21/FW_tail_numbers (29Nov20-28Feb21).csv")
fw.head()

Unnamed: 0,tail_number,date,aircraft,origin,origin_location,destination,destination_location,departure,arrival,duration,medical_service
0,N7025P,28-Feb-21,Unknown,"Near Bend, OR",L 43.95194 -121.28694,,Unknown,First seen 04:59PM PST,,En Route,A PRECIOUS LIFE FLIGHT LLC ...
1,N7025P,28-Feb-21,Unknown,Tews Fld (CA53),"Tews Fld (Redding, CA) - CA53","Near Bend, OR",L 43.81944 -121.37583,01:36PM PST,Last seen 03:01PM PST,1:24,A PRECIOUS LIFE FLIGHT LLC ...
2,N7025P,21-Feb-21,Unknown,"Near Lincoln, CA",L 38.93333 -121.41667,Redding Muni (KRDD),"Redding Muni (Redding, CA) - KRDD",First seen 03:02PM PST,Last seen 03:49PM PST,0:47,A PRECIOUS LIFE FLIGHT LLC ...
3,N7025P,21-Feb-21,Unknown,"Near Red Bluff, CA",L 40.24000 -122.19083,"Near Marysville, CA",L 38.94750 -121.52250,First seen 12:46PM PST,Last seen 01:20PM PST,0:33,A PRECIOUS LIFE FLIGHT LLC ...
4,N7025P,24-Dec-20,Unknown,"Near Red Bluff, CA",L 40.28861 -122.05861,"Near Emigrant Gap, CA",L 39.77972 -120.52333,First seen 09:25AM PST,Last seen 10:02AM PST,0:36,A PRECIOUS LIFE FLIGHT LLC ...


In [3]:
# Remove useless words
fw = fw.replace(regex={'Near ':'', 'First seen ':'', 'Last seen ':'', 'En Route':None, 'Unknown':None, 'Diverted':None, '¬†':' '})

In [4]:
# check for duplicates
fw.duplicated().sum()

0

In [5]:
# check for null values
fw.isna().sum()

tail_number               0
date                      0
aircraft                 26
origin                    0
origin_location           0
destination               2
destination_location      2
departure                 0
arrival                 229
duration                232
medical_service           0
dtype: int64

In [6]:
# split origin_Latitude and origin_Longitude
fw['origin_Latitude'] = fw[fw.origin_location.str.startswith('L ')].origin_location.replace(regex={'L ':''}).str.split(" ", n = 1, expand = True)[0]
fw['origin_Longitude'] = fw[fw.origin_location.str.startswith('L ')].origin_location.replace(regex={'L ':''}).str.split(" ", n = 1, expand = True)[1]

In [7]:
# split destination_Latitude and destination_Longitude
fw['destination_Latitude'] = fw[fw.destination_location.str.startswith('L ', na=False)].destination_location.replace(regex={'L ':''}).str.split(" ", n = 1, expand = True)[0]
fw['destination_Longitude'] = fw[fw.destination_location.str.startswith('L ', na=False)].destination_location.replace(regex={'L ':''}).str.split(" ", n = 1, expand = True)[1]

In [8]:
# convert to numeric
fw['origin_Latitude'] = pd.to_numeric(fw.origin_Latitude, errors='coerce')
fw['origin_Longitude'] = pd.to_numeric(fw.origin_Longitude, errors='coerce')

fw['destination_Latitude'] = pd.to_numeric(fw.destination_Latitude, errors='coerce')
fw['destination_Longitude'] = pd.to_numeric(fw.destination_Longitude, errors='coerce')

In [9]:
# split ICAO_code
fw['ICAO_code_origin'] = fw[fw.origin_Latitude.isna()].origin_location.str.rsplit(" - ", n = 1, expand = True)[1]
fw['ICAO_code_destination'] = fw[fw.destination_Latitude.isna()].destination_location.str.rsplit(" - ", n = 1, expand = True)[1]

In [10]:
fw.head()

Unnamed: 0,tail_number,date,aircraft,origin,origin_location,destination,destination_location,departure,arrival,duration,medical_service,origin_Latitude,origin_Longitude,destination_Latitude,destination_Longitude,ICAO_code_origin,ICAO_code_destination
0,N7025P,28-Feb-21,,"Bend, OR",L 43.95194 -121.28694,,,04:59PM PST,,,A PRECIOUS LIFE FLIGHT LLC ...,43.95194,-121.28694,,,,
1,N7025P,28-Feb-21,,Tews Fld (CA53),"Tews Fld (Redding, CA) - CA53","Bend, OR",L 43.81944 -121.37583,01:36PM PST,03:01PM PST,1:24,A PRECIOUS LIFE FLIGHT LLC ...,,,43.81944,-121.37583,CA53,
2,N7025P,21-Feb-21,,"Lincoln, CA",L 38.93333 -121.41667,Redding Muni (KRDD),"Redding Muni (Redding, CA) - KRDD",03:02PM PST,03:49PM PST,0:47,A PRECIOUS LIFE FLIGHT LLC ...,38.93333,-121.41667,,,,KRDD
3,N7025P,21-Feb-21,,"Red Bluff, CA",L 40.24000 -122.19083,"Marysville, CA",L 38.94750 -121.52250,12:46PM PST,01:20PM PST,0:33,A PRECIOUS LIFE FLIGHT LLC ...,40.24,-122.19083,38.9475,-121.5225,,
4,N7025P,24-Dec-20,,"Red Bluff, CA",L 40.28861 -122.05861,"Emigrant Gap, CA",L 39.77972 -120.52333,09:25AM PST,10:02AM PST,0:36,A PRECIOUS LIFE FLIGHT LLC ...,40.28861,-122.05861,39.77972,-120.52333,,


In [11]:
# load airport code data
airport_codes = pd.read_csv("../Datasets/Airports/airport-codes.csv")
airport_codes.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [12]:
# check duplicates
airport_codes.duplicated(subset=['ident']).sum()

0

In [13]:
# split coordinates and convert to numeric
airport_codes["Latitude"] = pd.to_numeric(airport_codes.coordinates.str.split(', ', n=1, expand=True)[1])
airport_codes["Longitude"] = pd.to_numeric(airport_codes.coordinates.str.split(', ', n=1, expand=True)[0])

In [14]:
# merge origin airport information
fw = fw.merge(airport_codes[['ident', 'iata_code', 'Latitude', 'Longitude']].add_suffix('_origin'),
         how='left', left_on='ICAO_code_origin', right_on='ident_origin')

In [15]:
# merge destination airport information
fw = fw.merge(airport_codes[['ident', 'iata_code', 'Latitude', 'Longitude']].add_suffix('_destination'),
         how='left', left_on='ICAO_code_destination', right_on='ident_destination')

In [16]:
# merge 2 columns
fw.origin_Latitude = fw.origin_Latitude.fillna(0) + fw.Latitude_origin.fillna(0)
fw.origin_Longitude = fw.origin_Longitude.fillna(0) + fw.Longitude_origin.fillna(0)

fw.destination_Latitude = fw.destination_Latitude.fillna(0) + fw.Latitude_destination.fillna(0)
fw.destination_Longitude = fw.destination_Longitude.fillna(0) + fw.Longitude_destination.fillna(0)

In [17]:
# drop duplicated columns
fw = fw.drop(['ident_origin', 'ident_destination', 'Latitude_origin', 'Longitude_origin', 'Latitude_destination', 'Longitude_destination'], axis=1)

In [18]:
fw.head()

Unnamed: 0,tail_number,date,aircraft,origin,origin_location,destination,destination_location,departure,arrival,duration,medical_service,origin_Latitude,origin_Longitude,destination_Latitude,destination_Longitude,ICAO_code_origin,ICAO_code_destination,iata_code_origin,iata_code_destination
0,N7025P,28-Feb-21,,"Bend, OR",L 43.95194 -121.28694,,,04:59PM PST,,,A PRECIOUS LIFE FLIGHT LLC ...,43.95194,-121.28694,0.0,0.0,,,,
1,N7025P,28-Feb-21,,Tews Fld (CA53),"Tews Fld (Redding, CA) - CA53","Bend, OR",L 43.81944 -121.37583,01:36PM PST,03:01PM PST,1:24,A PRECIOUS LIFE FLIGHT LLC ...,40.672501,-122.342003,43.81944,-121.37583,CA53,,,
2,N7025P,21-Feb-21,,"Lincoln, CA",L 38.93333 -121.41667,Redding Muni (KRDD),"Redding Muni (Redding, CA) - KRDD",03:02PM PST,03:49PM PST,0:47,A PRECIOUS LIFE FLIGHT LLC ...,38.93333,-121.41667,40.508999,-122.292999,,KRDD,,RDD
3,N7025P,21-Feb-21,,"Red Bluff, CA",L 40.24000 -122.19083,"Marysville, CA",L 38.94750 -121.52250,12:46PM PST,01:20PM PST,0:33,A PRECIOUS LIFE FLIGHT LLC ...,40.24,-122.19083,38.9475,-121.5225,,,,
4,N7025P,24-Dec-20,,"Red Bluff, CA",L 40.28861 -122.05861,"Emigrant Gap, CA",L 39.77972 -120.52333,09:25AM PST,10:02AM PST,0:36,A PRECIOUS LIFE FLIGHT LLC ...,40.28861,-122.05861,39.77972,-120.52333,,,,


In [19]:
# load airport data
airports = pd.read_csv("..//Datasets/Airports/BTS_Airports_LAT_LON.csv")

In [20]:
airports.head()

Unnamed: 0,AIRPORT_CODE,AIRPORT_NAME,CITY_NAME,COUNTRY_NAME,STATE_NAME,STATE_CODE,LATITUDE,LONGITUDE
0,01A,Afognak Lake Airport,Afognak Lake - AK,United States,Alaska,AK,58.109444,-152.906667
1,03A,Bear Creek Mining Strip,Granite Mountain - AK,United States,Alaska,AK,65.548056,-161.071667
2,04A,Lik Mining Camp,Lik - AK,United States,Alaska,AK,68.083333,-163.166667
3,05A,Little Squaw Airport,Little Squaw - AK,United States,Alaska,AK,67.57,-148.183889
4,06A,Kizhuyak Bay,Kizhuyak - AK,United States,Alaska,AK,57.745278,-152.882778


In [21]:
# replace abbreviations to full name for the best matching
fw = fw.replace(regex={r'Muni': 'Municipal', r'Rgnl':'Regional', r'Intl':'International', r"Int'l":'International',
                 r'Trml':'Terminal', r'Fld':'Field'})

In [22]:
# split states fot the best matching
fw['origin_state'] = fw.origin.str.split(', ', n=1, expand=True)[1]
fw.loc[fw.origin_state.isna(), 'origin_state'] = fw.origin_location.str.split(', ', n=1, expand=True)[1].str.rsplit(') ', n=1, expand=True)[0]

fw['destination_state'] = fw.destination.str.split(', ', n=1, expand=True)[1]
fw.loc[fw.destination_state.isna(), 'destination_state'] = fw.destination_location.str.split(', ', n=1, expand=True)[1].str.rsplit(') ', n=1, expand=True)[0]

In [23]:
# replace location back to NA for the best matching
fw.loc[fw.origin_Latitude == 0, 'origin_Latitude'] = np.nan
fw.loc[fw.origin_Longitude == 0, 'origin_Longitude'] = np.nan

fw.loc[fw.destination_Latitude == 0, 'destination_Latitude'] = np.nan
fw.loc[fw.destination_Longitude == 0, 'destination_Longitude'] = np.nan

In [24]:
fw.head()

Unnamed: 0,tail_number,date,aircraft,origin,origin_location,destination,destination_location,departure,arrival,duration,medical_service,origin_Latitude,origin_Longitude,destination_Latitude,destination_Longitude,ICAO_code_origin,ICAO_code_destination,iata_code_origin,iata_code_destination,origin_state,destination_state
0,N7025P,28-Feb-21,,"Bend, OR",L 43.95194 -121.28694,,,04:59PM PST,,,A PRECIOUS LIFE FLIGHT LLC ...,43.95194,-121.28694,,,,,,,OR,
1,N7025P,28-Feb-21,,Tews Field (CA53),"Tews Field (Redding, CA) - CA53","Bend, OR",L 43.81944 -121.37583,01:36PM PST,03:01PM PST,1:24,A PRECIOUS LIFE FLIGHT LLC ...,40.672501,-122.342003,43.81944,-121.37583,CA53,,,,CA,OR
2,N7025P,21-Feb-21,,"Lincoln, CA",L 38.93333 -121.41667,Redding Municipal (KRDD),"Redding Municipal (Redding, CA) - KRDD",03:02PM PST,03:49PM PST,0:47,A PRECIOUS LIFE FLIGHT LLC ...,38.93333,-121.41667,40.508999,-122.292999,,KRDD,,RDD,CA,CA
3,N7025P,21-Feb-21,,"Red Bluff, CA",L 40.24000 -122.19083,"Marysville, CA",L 38.94750 -121.52250,12:46PM PST,01:20PM PST,0:33,A PRECIOUS LIFE FLIGHT LLC ...,40.24,-122.19083,38.9475,-121.5225,,,,,CA,CA
4,N7025P,24-Dec-20,,"Red Bluff, CA",L 40.28861 -122.05861,"Emigrant Gap, CA",L 39.77972 -120.52333,09:25AM PST,10:02AM PST,0:36,A PRECIOUS LIFE FLIGHT LLC ...,40.28861,-122.05861,39.77972,-120.52333,,,,,CA,CA


In [25]:
# fuzzy merge on locations with origin airports
fw = fuzzymatcher.fuzzy_left_join(fw, airports.add_suffix('_origin'), left_on = ['origin_state', 'origin_Latitude', 'origin_Longitude', 'origin'], 
                                  right_on = ['STATE_CODE_origin', 'LATITUDE_origin', 'LONGITUDE_origin','AIRPORT_NAME_origin']).iloc[:,3:].reset_index(drop=True)

In [26]:
# drop LONGITUDE_origin mismatching
fw.loc[(fw.LONGITUDE_origin > 0) & (fw.origin_state.notnull()) & (fw.destination_state.notnull()), 
       ['AIRPORT_CODE_origin', 'AIRPORT_NAME_origin', 'CITY_NAME_origin', 'COUNTRY_NAME_origin', 
        'STATE_NAME_origin', 'STATE_CODE_origin', 'LATITUDE_origin', 'LONGITUDE_origin']] = np.nan

In [27]:
# drop origin_state mismatching
fw.loc[(fw.origin_state != fw.STATE_CODE_origin) & (fw.STATE_CODE_origin.notnull()), 
       ['AIRPORT_CODE_origin', 'AIRPORT_NAME_origin', 'CITY_NAME_origin', 'COUNTRY_NAME_origin', 
        'STATE_NAME_origin', 'STATE_CODE_origin', 'LATITUDE_origin', 'LONGITUDE_origin']] = np.nan

In [28]:
# replace null values in iata_code_origin
fw.loc[fw.iata_code_origin.isna(),'iata_code_origin']  = fw.loc[fw.iata_code_origin.isna(), 'AIRPORT_CODE_origin']

In [29]:
# fuzzy merge on locations with destination airports
fw = fuzzymatcher.fuzzy_left_join(fw, airports.add_suffix('_destination'), left_on = ['destination_state', 'destination_Latitude', 'destination_Longitude', 'destination'], 
                                  right_on = ['STATE_CODE_destination', 'LATITUDE_destination', 'LONGITUDE_destination','AIRPORT_NAME_destination']).iloc[:,3:].reset_index(drop=True)

In [30]:
# drop LONGITUDE_destination mismatching
fw.loc[(fw.LONGITUDE_destination > 0) & (fw.origin_state.notnull()) & (fw.destination_state.notnull()), 
       ['AIRPORT_CODE_destination', 'AIRPORT_NAME_destination', 'CITY_NAME_destination', 'COUNTRY_NAME_destination', 
        'STATE_NAME_destination', 'STATE_CODE_destination', 'LATITUDE_destination', 'LONGITUDE_destination']] = np.nan

In [31]:
# drop destination_state mismatching
fw.loc[(fw.destination_state != fw.STATE_CODE_destination) & (fw.STATE_CODE_destination.notnull()), 
       ['AIRPORT_CODE_destination', 'AIRPORT_NAME_destination', 'CITY_NAME_destination', 'COUNTRY_NAME_destination', 
        'STATE_NAME_destination', 'STATE_CODE_destination', 'LATITUDE_destination', 'LONGITUDE_destination']] = np.nan

In [32]:
# replace null values in iata_code_destination
fw.loc[fw.iata_code_destination.isna(),'iata_code_destination']  = fw.loc[fw.iata_code_destination.isna(), 'AIRPORT_CODE_destination']

In [33]:
# drop duplicated rows
fw = fw.drop(['STATE_CODE_origin', 'AIRPORT_CODE_origin', 'LATITUDE_origin', 'LONGITUDE_origin'], axis=1)
fw = fw.drop(['STATE_CODE_destination', 'AIRPORT_CODE_destination', 'LATITUDE_destination', 'LONGITUDE_destination'], axis=1)

In [34]:
# rearrange for convenience
fw = fw.iloc[:,:15].join([fw.iloc[:,15], fw.iloc[:,17], fw.iloc[:, 21:25], fw.iloc[:,19],
                    fw.iloc[:,16], fw.iloc[:,18], fw.iloc[:, 25:29], fw.iloc[:,20]])

In [35]:
# complete origin ICAO codes
fw = fw.merge(airport_codes.drop_duplicates(subset=['iata_code'])[['iata_code','ident']], how='left', left_on='iata_code_origin', right_on='iata_code')
fw.loc[(fw.ICAO_code_origin.isna()) & (fw.iata_code_origin.notnull()), 'ICAO_code_origin'] = fw.loc[:,'ident']
fw = fw.drop(['ident','iata_code'], axis=1)

In [37]:
# complete destination ICAO codes
fw = fw.merge(airport_codes.drop_duplicates(subset=['iata_code'])[['iata_code','ident']], how='left', left_on='iata_code_destination', right_on='iata_code')
fw.loc[(fw.ICAO_code_destination.isna()) & (fw.iata_code_destination.notnull()), 'ICAO_code_destination'] = fw.loc[:,'ident']
fw = fw.drop(['ident','iata_code'], axis=1)

In [36]:
fw

Unnamed: 0,tail_number,date,aircraft,origin,origin_location,destination,destination_location,departure,arrival,duration,medical_service,origin_Latitude,origin_Longitude,destination_Latitude,destination_Longitude,ICAO_code_origin,iata_code_origin,AIRPORT_NAME_origin,CITY_NAME_origin,COUNTRY_NAME_origin,STATE_NAME_origin,origin_state,ICAO_code_destination,iata_code_destination,AIRPORT_NAME_destination,CITY_NAME_destination,COUNTRY_NAME_destination,STATE_NAME_destination,destination_state
0,N7025P,28-Feb-21,,"Bend, OR",L 43.95194 -121.28694,,,04:59PM PST,,,A PRECIOUS LIFE FLIGHT LLC ...,43.951940,-121.286940,,,,OR4,Bend Municipal,Bend - OR,United States,Oregon,OR,,,,,,,
1,N7025P,28-Feb-21,,Tews Field (CA53),"Tews Field (Redding, CA) - CA53","Bend, OR",L 43.81944 -121.37583,01:36PM PST,03:01PM PST,1:24,A PRECIOUS LIFE FLIGHT LLC ...,40.672501,-122.342003,43.819440,-121.375830,CA53,O85,Benton Field,Redding - CA,United States,California,CA,,OR4,Bend Municipal,Bend - OR,United States,Oregon,OR
2,N7025P,21-Feb-21,,"Lincoln, CA",L 38.93333 -121.41667,Redding Municipal (KRDD),"Redding Municipal (Redding, CA) - KRDD",03:02PM PST,03:49PM PST,0:47,A PRECIOUS LIFE FLIGHT LLC ...,38.933330,-121.416670,40.508999,-122.292999,KSMF,SMF,Sacramento International,Sacramento - CA,United States,California,CA,KRDD,RDD,Redding Municipal,Redding - CA,United States,California,CA
3,N7025P,21-Feb-21,,"Red Bluff, CA",L 40.24000 -122.19083,"Marysville, CA",L 38.94750 -121.52250,12:46PM PST,01:20PM PST,0:33,A PRECIOUS LIFE FLIGHT LLC ...,40.240000,-122.190830,38.947500,-121.522500,KRBL,RBL,Red Bluff Municipal,Red Bluff - CA,United States,California,CA,,SMF,Sacramento International,Sacramento - CA,United States,California,CA
4,N7025P,24-Dec-20,,"Red Bluff, CA",L 40.28861 -122.05861,"Emigrant Gap, CA",L 39.77972 -120.52333,09:25AM PST,10:02AM PST,0:36,A PRECIOUS LIFE FLIGHT LLC ...,40.288610,-122.058610,39.779720,-120.523330,KRBL,RBL,Red Bluff Municipal,Red Bluff - CA,United States,California,CA,,,,,,,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11236,N977TC,30-Nov-20,C680,El Paso International (KELP),"El Paso International (El Paso, TX) - KELP",William P Hobby (KHOU),"William P Hobby (Houston, TX) - KHOU",03:30PM MST,05:54PM CST,1:23,TEXAS CHILDRENS HOSPITAL ...,31.807199,-106.377998,29.645399,-95.278900,KELP,ELP,El Paso International,El Paso - TX,United States,Texas,TX,KHOU,HOU,William P Hobby,Houston - TX,United States,Texas,TX
11237,N977TC,30-Nov-20,C680,William P Hobby (KHOU),"William P Hobby (Houston, TX) - KHOU",El Paso International (KELP),"El Paso International (El Paso, TX) - KELP",11:11AM CST,11:52AM MST,1:40,TEXAS CHILDRENS HOSPITAL ...,29.645399,-95.278900,31.807199,-106.377998,KHOU,HOU,William P Hobby,Houston - TX,United States,Texas,TX,KELP,ELP,El Paso International,El Paso - TX,United States,Texas,TX
11238,N102WK,12-Feb-21,BE9T,Smyrna (KMQY),"Smyrna (Smyrna, TN) - KMQY",Shreveport Regional (KSHV),"Shreveport Regional (Shreveport, LA) - KSHV",11:19AM CST,01:34PM CST,2:14,WILLIS KNIGHTON MEDICAL CENTER ...,36.008999,-86.520103,32.446602,-93.825600,KMQY,MQY,Smyrna Airport,Smyrna - TN,United States,Tennessee,TN,KSHV,SHV,Shreveport Regional,Shreveport - LA,United States,Louisiana,LA
11239,N102WK,28-Jan-21,BE9T,Shreveport Regional (KSHV),"Shreveport Regional (Shreveport, LA) - KSHV",Smyrna (KMQY),"Smyrna (Smyrna, TN) - KMQY",12:30PM CST,02:25PM CST,1:55,WILLIS KNIGHTON MEDICAL CENTER ...,32.446602,-93.825600,36.008999,-86.520103,KSHV,SHV,Shreveport Regional,Shreveport - LA,United States,Louisiana,LA,KMQY,MQY,Smyrna Airport,Smyrna - TN,United States,Tennessee,TN


In [38]:
fw.to_csv('..//Datasets/29Nov20-28Feb21/FW_with_airports(29Nov20-28Feb21 revised).csv', index=False)