## Load modules and given input

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree

In [2]:
# Load the airport reference table and add radian versions of the coordinates,
# because the haversine metric used later works on radians rather than degrees.
optd = pd.read_csv(r"data/optd-airports-sample.csv.gz")
for column in ("latitude", "longitude"):
    optd[f'{column}_rad'] = np.deg2rad(optd[column].to_numpy())
optd.head()

Unnamed: 0,iata_code,latitude,longitude,latitude_rad,longitude_rad
0,AAA,-17.352606,-145.509956,-0.30286,-2.539628
1,AAB,-26.69317,141.0478,-0.465884,2.461749
2,AAC,31.07333,33.83583,0.542332,0.590547
3,AAD,6.09682,46.63825,0.10641,0.813991
4,AAE,36.822225,7.809167,0.642669,0.136296


In [3]:
# Load the user geo-IP samples (one row per uuid, coordinates in degrees)
# and preview the first rows.
geo_samples = pd.read_csv(
    r"data/user-geo-sample.csv.gz",
)
geo_samples.head()

Unnamed: 0,uuid,geoip_latitude,geoip_longitude
0,DDEFEBEA-98ED-49EB-A4E7-9D7BFDB7AA0B,-37.833302,145.050003
1,DAEF2221-14BE-467B-894A-F101CDCC38E4,52.516701,4.6667
2,31971B3E-2F80-4F8D-86BA-1F2077DF36A2,35.685001,139.751404
3,1A29A45C-D560-43D8-ADAB-C2F0AD068FFE,44.840401,-0.5805
4,A6EC281B-B8EC-465A-8933-F127472DB0A3,51.963299,4.4997


## Define functions

In [10]:
# Number of nearest airports to return for each queried point.
number_final_neightbors = 1

# Spatial index over the airport coordinates (latitude, longitude in radians).
# Built once at module level so every query reuses the same tree.
airport_coords_rad = optd[["latitude_rad", "longitude_rad"]].to_numpy()
ball = BallTree(airport_coords_rad, metric='haversine')
def closest_airport_v3(geo_sample):
    """
    Find the nearest airport(s) for every row of ``geo_sample``.

    Each point is looked up in the module-level ``ball`` BallTree (a spatial
    index built with the haversine metric over the airports in ``optd``), and
    the returned row positions are mapped back to IATA codes.

    Args:
        geo_sample (pd.DataFrame): frame containing the columns ``uuid``,
            ``geoip_latitude_rad`` and ``geoip_longitude_rad``; both
            coordinates must be expressed in radians.
    Returns:
        pd.DataFrame: columns ``uuid`` and ``iata_code``, indexed like
        ``geo_sample``. When ``number_final_neightbors`` is greater than 1,
        each uuid appears once per returned neighbor, in the order the tree
        returned them.
    """
    if len(geo_sample) == 0:
        # Nothing to query; avoid handing an empty array to the tree.
        return pd.DataFrame(columns=['uuid', 'iata_code'], index=geo_sample.index, dtype=object)

    coords = geo_sample[['geoip_latitude_rad', 'geoip_longitude_rad']].to_numpy()
    _, indices = ball.query(coords, k=number_final_neightbors)
    neighbors_per_point = indices.shape[1]

    # The tree returns row *positions* in the array it was built from, so the
    # lookup must be positional rather than by optd's index labels.
    iata_codes = optd['iata_code'].to_numpy()[indices.ravel()]

    # Build the result by position and reuse the caller's index, so a
    # geo_sample with a non-default index (e.g. a filtered subset) still gets
    # each uuid paired with its own airport.
    return pd.DataFrame(
        {
            'uuid': np.repeat(geo_sample['uuid'].to_numpy(), neighbors_per_point),
            'iata_code': iata_codes,
        },
        index=geo_sample.index.repeat(neighbors_per_point),
    )

## Testing the implementation
The code runs in 112 ms for 1000 data points. <br>
In one second it can process around 15000 data points. <br>
The complete dataset runs in approximately 58 seconds.

In [18]:
%%time
# Add radian versions of the geo-IP coordinates (the haversine lookup works in
# radians), then resolve the closest airport for every sample.
for column in ("geoip_latitude", "geoip_longitude"):
    geo_samples[f'{column}_rad'] = np.deg2rad(geo_samples[column].to_numpy())
query_columns = ['uuid', 'geoip_latitude_rad', 'geoip_longitude_rad']
closest_airport_v3(geo_samples[query_columns])

CPU times: total: 56.9 s
Wall time: 58 s


Unnamed: 0,uuid,iata_code
0,DDEFEBEA-98ED-49EB-A4E7-9D7BFDB7AA0B,MBW
1,DAEF2221-14BE-467B-894A-F101CDCC38E4,AMS
2,31971B3E-2F80-4F8D-86BA-1F2077DF36A2,HND
3,1A29A45C-D560-43D8-ADAB-C2F0AD068FFE,BOD
4,A6EC281B-B8EC-465A-8933-F127472DB0A3,RTM
...,...,...
999995,E54ECDFB-AB67-44A5-B493-618335B1F53C,BRE
999996,2D6F2AEF-2D07-40EC-BFA4-37E5CEA6BB71,OSL
999997,A43AEDEA-9B98-4551-929A-DF9DAB109FFF,BGO
999998,CA5985E2-3507-491C-86DF-DC4A99CB1684,RTM
