In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler
from keplergl import KeplerGl
import geopandas as gpd
import math
import datetime

from st_dbscan import ST_DBSCAN

In [2]:
# We switched to fr data
# bl = pd.read_csv('../../../../og_data/extract_clean_with_country.csv', index_col='Unnamed: 0', parse_dates=['ts'], low_memory=False)

fr = pd.DataFrame()

flights_to_add = ['HBAL102-N235LB', 'HBAL116-N252LB', 'HBAL131-N271LB', 'HBAL132-N211LB', \
                  'HBAL136-N238LB']

for flight in flights_to_add:
    
    aux = pd.read_csv('../../../../og_data/' + flight + '.csv', parse_dates=['ts'], low_memory=False)
    
    fr = fr.append(aux)

fr['trip_id'] = fr['aircraft_id']

bl = fr

In [3]:
#Visualize with Kepler
#Create a basemap 
map = KeplerGl(height=700, width=800)#show the map
map


User Guide: https://github.com/keplergl/kepler.gl/blob/master/docs/keplergl-jupyter/user-guide.md


KeplerGl(height=700)

In [4]:
'First attempt with only DBSCAN'
'Third attempt with only DBSCAN + filters'

'Third attempt with only DBSCAN + filters'

In [5]:
# First let's study the frequency of records
freq_df = pd.DataFrame()

freq_df = bl.loc[bl.registration_id == 'N238LB'].copy()
freq_df = freq_df.sort_values('ts')

freq_df['time_since_last_record'] = freq_df['ts'] - freq_df['ts'].shift()
freq_df['time_since_last_record'] = freq_df['time_since_last_record'].fillna(0)

# Go for the median since it is less sensitive to outliers
freq_median = freq_df['time_since_last_record'].median()
print('Median is: ' + str(freq_median))


stay_duration = pd.Timedelta('1day')

# Min number of samples there should be for the duration of what we consider a stay
min_stay_records = stay_duration / freq_median



Median is: 0 days 00:01:01


  


In [6]:
min_stay_records

1416.3934426229507

In [11]:
#sample_trip = pd.DataFrame()
#sample_trip = bl.loc[bl.trip_id == 'HBAL027-BA1462'].copy()


trips_to_study = ['N238LB']
#['N235LB', 'N252LB', 'N271LB', 'N211LB', 'N238LB']

'''['HBAL438-BA10EF', 'HBAL11-BA116D', 'HBAL14-BA116F', \
                     'HBAL17-BA1182', 'HBAL464-BA115A', 'HBAL465-BA115B', \
                     'HBAL390-BA1114', 'HBAL19-BA1190', 'HBAL0329-A27A9E', \
                     'HBAL470-BA118E', 'HBAL448-BA1162']
                     
                     
                     
 ['HBAL026-BA144D','HBAL027-BA1462','HBAL029-BA1440','HBAL034-BA1475', \
                            'HBAL036-BA147A','HBAL045-BA14B7','HBAL046-A20D33','HBAL047-A22EA2', \
                            'HBAL049-A22734','HBAL076-A1D836','HBAL085-A0D299','HBAL086-A19E24', \
                            'HBAL095-A2B654','HBAL097-A16317','HBAL187-A2E18A','HBAL187-BA13C2', \
                            'HBAL209-BA1493','HBAL218-BA13EB','HBAL231-BA1460' ]
'''

trips_with_cluster = pd.DataFrame()

for trip in trips_to_study:
    
    trip_df_temp = bl.loc[bl.registration_id == trip].copy()
    
    trip_df_temp_clus = StandardScaler().fit_transform(trip_df_temp[['lat', 'lon']])
    db = DBSCAN(eps=0.05, min_samples=1500, metric='euclidean').fit(trip_df_temp_clus)

    trip_df_temp['cluster'] = db.labels_
    print(str(trip) + ': ' + str(np.unique(db.labels_)))
    
    trips_with_cluster = trips_with_cluster.append(trip_df_temp)
    

N238LB: [-1  0  1  2  3  4  5  6  7  8  9 10 11]


In [12]:
gdf = gpd.GeoDataFrame(trips_with_cluster, \
                       geometry=gpd.points_from_xy(trips_with_cluster.lon, trips_with_cluster.lat))

map.add_data(data=gdf, name="loon_traces")

In [None]:
# Save to file
trips_with_cluster.to_csv('trips_with_cluster.csv', index=False, encoding='utf-8-sig')

In [None]:
'Second attempt with spatiotemporal DBSCAN' DESCARTADO

In [None]:
DESCARTADO


trips_to_study = ['N271LB']


'''
['HBAL026-BA144D','HBAL027-BA1462','HBAL029-BA1440','HBAL034-BA1475', \
                            'HBAL036-BA147A','HBAL045-BA14B7','HBAL046-A20D33','HBAL047-A22EA2', \
                            'HBAL049-A22734','HBAL076-A1D836','HBAL085-A0D299','HBAL086-A19E24', \
                            'HBAL095-A2B654','HBAL097-A16317','HBAL187-A2E18A','HBAL187-BA13C2', \
                            'HBAL209-BA1493','HBAL218-BA13EB','HBAL231-BA1460' ]

['HBAL072', 'HBAL102', 'HBAL131', 'HBAL132', \
                  'HBAL136', 'HBAL116']
'''


    
trips_with_cluster = pd.DataFrame()



for trip in trips_to_study:
    print(str(trip))
    trip_df = bl.loc[(bl.registration_id == trip)].copy()

    # Normalize the data
    trip_df['lat_normal'] = (trip_df['lat'] - trip_df['lat'].min()) / (trip_df['lat'].max() - trip_df['lat'].min())
    trip_df['lon_normal'] = (trip_df['lon'] - trip_df['lon'].min()) / (trip_df['lon'].max() - trip_df['lon'].min())
    trip_df['ts_delta'] = (trip_df['ts'] - trip_df['ts'].min())  / np.timedelta64(1,'D')

    # transform to numpy array
    data = trip_df.loc[:, ['ts_delta','lat_normal','lon_normal']].values

    # fit
    st_dbscan = ST_DBSCAN(eps1 = 0.03, eps2 = 5, min_samples = 1400) 
    st_dbscan.fit(data)
    #st_dbscan.fit_frame_split(data, frame_size = 50)

    # Add cluster data to the df
    trip_df['cluster'] = st_dbscan.labels
    print(str(trip) + ': ' + str(np.unique(st_dbscan.labels)))

    trips_with_cluster = trips_with_cluster.append(trip_df)



N271LB


In [55]:
gdf = gpd.GeoDataFrame(trips_with_cluster, \
                       geometry=gpd.points_from_xy(trips_with_cluster.lon, trips_with_cluster.lat))

map.add_data(data=gdf, name="loon_traces")