In [1]:
#load packages
import pandas as pd 
import numpy as np
import fuzzymatcher
pd.set_option('display.max_columns', 999)

In [4]:
#load data and convert dates to datetime
fw = pd.read_csv("../Datasets/29Nov20-28Feb21/FW_tail_numbers (29Nov20-28Feb21).csv")
fw.head()

Unnamed: 0,tail_number,date,aircraft,origin,origin_location,destination,destination_location,departure,arrival,duration,medical_service
0,N7025P,28-Feb-2021,Unknown,"Near Bend, OR",L 43.95194 -121.28694,,Unknown,First seen 04:59PM PST,,En Route,A PRECIOUS LIFE FLIGHT LLC ...
1,N7025P,28-Feb-2021,Unknown,Tews Fld (CA53),"Tews Fld (Redding, CA) - CA53","Near Bend, OR",L 43.81944 -121.37583,01:36PM PST,Last seen 03:01PM PST,1:24,A PRECIOUS LIFE FLIGHT LLC ...
2,N7025P,21-Feb-2021,Unknown,"Near Lincoln, CA",L 38.93333 -121.41667,Redding Muni (KRDD),"Redding Muni (Redding, CA) - KRDD",First seen 03:02PM PST,Last seen 03:49PM PST,0:47,A PRECIOUS LIFE FLIGHT LLC ...
3,N7025P,21-Feb-2021,Unknown,"Near Red Bluff, CA",L 40.24000 -122.19083,"Near Marysville, CA",L 38.94750 -121.52250,First seen 12:46PM PST,Last seen 01:20PM PST,0:33,A PRECIOUS LIFE FLIGHT LLC ...
4,N7025P,24-Dec-2020,Unknown,"Near Red Bluff, CA",L 40.28861 -122.05861,"Near Emigrant Gap, CA",L 39.77972 -120.52333,First seen 09:25AM PST,Last seen 10:02AM PST,0:36,A PRECIOUS LIFE FLIGHT LLC ...


In [5]:
# Remove useless words
fw = fw.replace(regex={'Near ':'', 'First seen ':'', 'Last seen ':'', 'En Route':None, 'Unknown':None, 'Diverted':None})

In [6]:
# check for duplicates
fw.duplicated().sum()

0

In [7]:
# check for null values
fw.isna().sum()

tail_number               0
date                      0
aircraft                 26
origin                    0
origin_location           0
destination               2
destination_location      2
departure                 0
arrival                 229
duration                232
medical_service           0
dtype: int64

In [8]:
# split origin_Latitude and origin_Longitude
fw['origin_Latitude'] = fw[fw.origin_location.str.startswith('L ')].origin_location.replace(regex={'L ':''}).str.split(" ", n = 1, expand = True)[0]
fw['origin_Longitude'] = fw[fw.origin_location.str.startswith('L ')].origin_location.replace(regex={'L ':''}).str.split(" ", n = 1, expand = True)[1]

In [9]:
# split destination_Latitude and destination_Longitude
fw['destination_Latitude'] = fw[fw.destination_location.str.startswith('L ', na=False)].destination_location.replace(regex={'L ':''}).str.split(" ", n = 1, expand = True)[0]
fw['destination_Longitude'] = fw[fw.destination_location.str.startswith('L ', na=False)].destination_location.replace(regex={'L ':''}).str.split(" ", n = 1, expand = True)[1]

In [10]:
# convert to numeric
fw['origin_Latitude'] = pd.to_numeric(fw.origin_Latitude, errors='coerce')
fw['origin_Longitude'] = pd.to_numeric(fw.origin_Longitude, errors='coerce')

fw['destination_Latitude'] = pd.to_numeric(fw.destination_Latitude, errors='coerce')
fw['destination_Longitude'] = pd.to_numeric(fw.destination_Longitude, errors='coerce')

In [11]:
# split ICAO_code
fw['ICAO_code_origin'] = fw[fw.origin_Latitude.isna()].origin_location.str.rsplit(" - ", n = 1, expand = True)[1]
fw['ICAO_code_destination'] = fw[fw.destination_Latitude.isna()].destination_location.str.rsplit(" - ", n = 1, expand = True)[1]

In [13]:
fw.head()

Unnamed: 0,tail_number,date,aircraft,origin,origin_location,destination,destination_location,departure,arrival,duration,medical_service,origin_Latitude,origin_Longitude,destination_Latitude,destination_Longitude,ICAO_code_origin,ICAO_code_destination
0,N7025P,28-Feb-2021,,"Bend, OR",L 43.95194 -121.28694,,,04:59PM PST,,,A PRECIOUS LIFE FLIGHT LLC ...,43.95194,-121.28694,,,,
1,N7025P,28-Feb-2021,,Tews Fld (CA53),"Tews Fld (Redding, CA) - CA53","Bend, OR",L 43.81944 -121.37583,01:36PM PST,03:01PM PST,1:24,A PRECIOUS LIFE FLIGHT LLC ...,,,43.81944,-121.37583,CA53,
2,N7025P,21-Feb-2021,,"Lincoln, CA",L 38.93333 -121.41667,Redding Muni (KRDD),"Redding Muni (Redding, CA) - KRDD",03:02PM PST,03:49PM PST,0:47,A PRECIOUS LIFE FLIGHT LLC ...,38.93333,-121.41667,,,,KRDD
3,N7025P,21-Feb-2021,,"Red Bluff, CA",L 40.24000 -122.19083,"Marysville, CA",L 38.94750 -121.52250,12:46PM PST,01:20PM PST,0:33,A PRECIOUS LIFE FLIGHT LLC ...,40.24,-122.19083,38.9475,-121.5225,,
4,N7025P,24-Dec-2020,,"Red Bluff, CA",L 40.28861 -122.05861,"Emigrant Gap, CA",L 39.77972 -120.52333,09:25AM PST,10:02AM PST,0:36,A PRECIOUS LIFE FLIGHT LLC ...,40.28861,-122.05861,39.77972,-120.52333,,


In [14]:
# load airport code data
airport_codes = pd.read_csv("../Datasets/Airports/airport-codes.csv")
airport_codes.head()

FileNotFoundError: [Errno 2] No such file or directory: '../Datasets/Airports/airport-codes.csv'

In [84]:
# check duplicates
airport_codes.duplicated().sum()

0

In [85]:
# split coordinates and convert to numeric
airport_codes["Latitude"] = pd.to_numeric(airport_codes.coordinates.str.split(', ', n=1, expand=True)[1])
airport_codes["Longitude"] = pd.to_numeric(airport_codes.coordinates.str.split(', ', n=1, expand=True)[0])

In [86]:
# merge origin airport information
fw = fw.merge(airport_codes[['ident', 'iata_code', 'Latitude', 'Longitude']].add_suffix('_origin'),
         how='left', left_on='ICAO_code_origin', right_on='ident_origin')

In [87]:
# merge destination airport information
fw = fw.merge(airport_codes[['ident', 'iata_code', 'Latitude', 'Longitude']].add_suffix('_destination'),
         how='left', left_on='ICAO_code_destination', right_on='ident_destination')

In [88]:
# merge 2 columns
fw.origin_Latitude = fw.origin_Latitude.fillna(0) + fw.Latitude_origin.fillna(0)
fw.origin_Longitude = fw.origin_Longitude.fillna(0) + fw.Longitude_origin.fillna(0)

fw.destination_Latitude = fw.destination_Latitude.fillna(0) + fw.Latitude_destination.fillna(0)
fw.destination_Longitude = fw.destination_Longitude.fillna(0) + fw.Longitude_destination.fillna(0)

# drop duplicated columns
fw = fw.drop(['ident_origin', 'ident_destination', 'Latitude_origin', 'Longitude_origin', 'Latitude_destination', 'Longitude_destination'], axis=1)

In [89]:
# load airport data
airports = pd.read_csv("..//Datasets/Airports/BTS_Airports_LAT_LON.csv")

In [90]:
airports.head()

Unnamed: 0,AIRPORT_CODE,AIRPORT_NAME,CITY_NAME,COUNTRY_NAME,STATE_NAME,STATE_CODE,LATITUDE,LONGITUDE
0,01A,Afognak Lake Airport,Afognak Lake - AK,United States,Alaska,AK,58.109444,-152.906667
1,03A,Bear Creek Mining Strip,Granite Mountain - AK,United States,Alaska,AK,65.548056,-161.071667
2,04A,Lik Mining Camp,Lik - AK,United States,Alaska,AK,68.083333,-163.166667
3,05A,Little Squaw Airport,Little Squaw - AK,United States,Alaska,AK,67.57,-148.183889
4,06A,Kizhuyak Bay,Kizhuyak - AK,United States,Alaska,AK,57.745278,-152.882778


In [91]:
airports = airports.loc[airports.LONGITUDE < 0]

In [92]:
# replace abbreviations to full name for the best matching
fw = fw.replace(regex={r'Muni': 'Municipal', r'Rgnl':'Regional', r'Intl':'International', r"Int'l":'International',
                 r'Trml':'Terminal', r'Fld':'Field'})

# split states fot the best matching
fw['origin_state'] = fw.origin.str.split(', ', n=1, expand=True)[1]
fw['destination_state'] = fw.destination.str.split(', ', n=1, expand=True)[1]

In [93]:
# fuzzy merge on locations with origin airports
fw = fuzzymatcher.fuzzy_left_join(fw, airports.add_suffix('_origin'), left_on = ['origin_state', 'origin_Latitude', 'origin_Longitude', 'origin'], 
                                  right_on = ['STATE_CODE_origin', 'LATITUDE_origin', 'LONGITUDE_origin','AIRPORT_NAME_origin']).iloc[:,3:].reset_index(drop=True)

In [94]:
# replace null values in iata_code_origin
fw.loc[fw.iata_code_origin.isna(),'iata_code_origin']  = fw.loc[fw.iata_code_origin.isna(), 'AIRPORT_CODE_origin']

In [95]:
# fuzzy merge on locations with destination airports
fw = fuzzymatcher.fuzzy_left_join(fw, airports.add_suffix('_destination'), left_on = ['destination_state', 'destination_Latitude', 'destination_Longitude', 'destination'], 
                                  right_on = ['STATE_CODE_destination', 'LATITUDE_destination', 'LONGITUDE_destination','AIRPORT_NAME_destination']).iloc[:,3:].reset_index(drop=True)

In [96]:
# replace null values in iata_code_destination
fw.loc[fw.iata_code_destination.isna(),'iata_code_destination']  = fw.loc[fw.iata_code_destination.isna(), 'AIRPORT_CODE_destination']

In [97]:
# drop duplicated rows
fw = fw.drop(['destination_state', 'AIRPORT_CODE_destination', 'LATITUDE_destination', 'LONGITUDE_destination'], axis=1)
fw = fw.drop(['origin_state', 'AIRPORT_CODE_origin', 'LATITUDE_origin', 'LONGITUDE_origin'], axis=1)
fw.iloc[[0,1373], -5:] = None
fw.loc[fw.origin_Longitude > 0] = fw.loc[fw.origin_Longitude > 0].drop(5463, axis=0)

In [99]:
# complete ICAO codes 
fw.loc[fw.ICAO_code_origin.isna(), 'ICAO_code_origin'] = fw.merge(airport_codes[['iata_code','ident']], how='left', left_on='iata_code_origin', right_on='iata_code').ident
fw.loc[fw.ICAO_code_destination.isna(), 'ICAO_code_destination'] = fw.merge(airport_codes[['iata_code','ident']], how='left', left_on='iata_code_destination', right_on='iata_code').ident

In [103]:
fw.head()

Unnamed: 0,tail_number,date,aircraft,origin,origin_location,destination,destination_location,departure,arrival,duration,medical_service,origin_Latitude,origin_Longitude,destination_Latitude,destination_Longitude,ICAO_code_origin,ICAO_code_destination,iata_code_origin,iata_code_destination,AIRPORT_NAME_origin,CITY_NAME_origin,COUNTRY_NAME_origin,STATE_NAME_origin,STATE_CODE_origin,AIRPORT_NAME_destination,CITY_NAME_destination,COUNTRY_NAME_destination,STATE_NAME_destination,STATE_CODE_destination
0,N7025P,28-Feb-2021,,"Bend, OR",L 43.95194 -121.28694,,,04:59PM PST,,,A PRECIOUS LIFE FLIGHT LLC ...,43.95194,-121.28694,0.0,0.0,,LFRK,OR4,CFR,Bend Municipal,Bend - OR,United States,Oregon,OR,,,,,
1,N7025P,28-Feb-2021,,Tews Field (CA53),"Tews Field (Redding, CA) - CA53","Bend, OR",L 43.81944 -121.37583,01:36PM PST,03:01PM PST,1:24,A PRECIOUS LIFE FLIGHT LLC ...,40.672501,-122.342003,43.81944,-121.37583,CA53,,O85,OR4,Benton Field,Redding - CA,United States,California,CA,Bend Municipal,Bend - OR,United States,Oregon,OR
2,N7025P,21-Feb-2021,,"Lincoln, CA",L 38.93333 -121.41667,Redding Municipal (KRDD),"Redding Municipal (Redding, CA) - KRDD",03:02PM PST,03:49PM PST,0:47,A PRECIOUS LIFE FLIGHT LLC ...,38.93333,-121.41667,40.508999,-122.292999,KSMF,KRDD,SMF,RDD,Sacramento International,Sacramento - CA,United States,California,CA,Redding Municipal,Redding - CA,United States,California,CA
3,N7025P,21-Feb-2021,,"Red Bluff, CA",L 40.24000 -122.19083,"Marysville, CA",L 38.94750 -121.52250,12:46PM PST,01:20PM PST,0:33,A PRECIOUS LIFE FLIGHT LLC ...,40.24,-122.19083,38.9475,-121.5225,KRBL,KSMF,RBL,SMF,Red Bluff Municipal,Red Bluff - CA,United States,California,CA,Sacramento International,Sacramento - CA,United States,California,CA
4,N7025P,24-Dec-2020,,"Red Bluff, CA",L 40.28861 -122.05861,"Emigrant Gap, CA",L 39.77972 -120.52333,09:25AM PST,10:02AM PST,0:36,A PRECIOUS LIFE FLIGHT LLC ...,40.28861,-122.05861,39.77972,-120.52333,KRBL,,RBL,FVP,Red Bluff Municipal,Red Bluff - CA,United States,California,CA,Gansner Field,Quincy - CA,United States,California,CA


In [104]:
#fw.to_csv('..//Datasets/FW/FW_with_airports.csv', index=False)