In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
import seaborn as sns

from pyproj import Proj, transform

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import DBSCAN

import bokeh
import bokeh.plotting as plotting
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.tile_providers import CARTODBPOSITRON
plotting.output_notebook()

sns.set_style('whitegrid')

# Problem definition

Cluster US regions based on flight delay time
Dataset for clustering is "FlightsProcessedShort.csv" and "airports.csv".

https://drive.google.com/open?id=1ZUHNIgxMqRbeDUS4PE2W-vVss_BZMA8G

# Load the data

In [3]:
df1=pd.read_csv("FlightsProcessed.csv")
print(df1.head())
df2=pd.read_csv("airports.csv")
print(df2.head())
# #input
# df = pd.read_csv('data/interventionscitoyendo.csv', encoding='latin_1')
# df['DATE'] = pd.to_datetime(df['DATE'])
# print(df.columns)
# print(df['CATEGORIE'].value_counts())
# df.head()

   Unnamed: 0  Unnamed: 0.1  Month  DayofMonth  DayOfWeek  SchedDepTime  \
0           0             0      1           3          4          20.0   
1           1             1      1           3          4           8.0   
2           2             2      1           3          4           7.0   
3           3             3      1           3          4          18.0   
4           4             4      1           3          4          20.0   

  Carrier TailNum  DepDelay Origin  Distance  
0      WN  N712SW       8.0    IAD         2  
1      WN  N772SW      19.0    IAD         2  
2      WN  N428WN       8.0    IND         2  
3      WN  N464WN      34.0    IND         2  
4      WN  N726SW      25.0    IND         2  
  iata               airport              city state country        lat  \
0  00M              Thigpen        Bay Springs    MS     USA  31.953765   
1  00R  Livingston Municipal        Livingston    TX     USA  30.685861   
2  00V           Meadow Lake  Colorado Spr

In [30]:
# extract from full database of airports those that are included in our dataset

df3=df1.groupby("Origin")["DepDelay"].mean().reset_index(name ="DepDelay")
df3.head()

for i in range(len(df3['Origin'])):
    for j in range(len(df2['iata'])):
                   if df3.loc[i,'Origin']==df2.loc[j,'iata']:
                       df3.loc[i,'lat']=df2.loc[j,'lat']
                       df3.loc[i,'long']=df2.loc[j,'long']

df3.head()


Unnamed: 0,Origin,DepDelay,lat,long
0,ABE,59.461538,40.652363,-75.440402
1,ABI,66.469388,32.41132,-99.681897
2,ABQ,36.358254,35.040222,-106.609194
3,ABY,49.44863,31.535515,-84.194473
4,ACK,66.47449,41.253052,-70.060181


# Feature Engineering 

In [31]:
# # adapt X and Y to the visualization
# df3['X'] = df3.apply(lambda x: transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), x['long'], x['lat'])[1], axis=1)
# df3['Y'] = df3.apply(lambda x: transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), x['long'], x['lat'])[0], axis=1)

# X_columns = ['lat', 'long']
# df3 = df3[X_columns]

df3['X'] = df3.apply(lambda x: transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), x['long'], x['lat'])[1], axis=1)
df3['Y'] = df3.apply(lambda x: transform(Proj(init='epsg:4326'), Proj(init='epsg:3857'), x['long'], x['lat'])[0], axis=1)

X_columns = ['X', 'Y']
df3 = df3[X_columns]

# Model Training

In [32]:
df3.head()

Unnamed: 0,X,Y
0,4961200.0,-8397987.0
1,3817425.0,-11096540.0
2,4169349.0,-11867680.0
3,3702493.0,-9372486.0
4,5049739.0,-7799064.0


In [77]:
model = DBSCAN(eps=150000.0, min_samples=4)
model.fit(df3[['X', 'Y']])

cluster_labels = model.labels_
n_clusters = len(set(cluster_labels))
print(collections.Counter(cluster_labels))

df3['cluster'] = cluster_labels

Counter({-1: 217, 6: 14, 0: 9, 4: 8, 2: 8, 8: 7, 9: 6, 5: 5, 11: 5, 1: 4, 3: 4, 13: 4, 7: 4, 10: 4, 12: 4})


In [83]:
p = figure(y_range=(9641788.0, 1751788.0), x_range=(-14152883, -8252883))
p.add_tile(CARTODBPOSITRON)

latitude  = list(df3[df3['cluster']>-1]['X'].values)
longitude = list(df3[df3['cluster']>-1]['Y'].values)

colormap = list(bokeh.palettes.viridis(n_clusters))
colors = [colormap[x] for x in df3[df3['cluster']>-1]['cluster']]
source = ColumnDataSource(data=dict(longitude=longitude, latitude=latitude))
p.circle(x=longitude, y=latitude, color=colors, fill_alpha=0.2, size=5)
show(p)

# Model Evaluation

In [80]:
# Inter-Cluster
centroids = []
for cluster in sorted(set(model.labels_)):
    centroids.append(df3[df3['cluster']==cluster][X_columns].mean().values)
distances = []
for c1 in centroids:
    for c2 in centroids:
        distances.append(euclidean_distances(c1.reshape(-1, 1), c2.reshape(-1, 1))[0][0])
print('Inter Cluster distance', np.mean(distances))

# Intra-Cluster
distances = []
for cluster in sorted(set(model.labels_)):
    df_filter = df3[df3['cluster']==cluster]
    centroid = df_filter[X_columns].mean().values
    for k, v in df_filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(-1, 1), v.values.reshape(-1, 1))[0][0])
print('Intra Cluster distance', np.mean(distances))

# Inertia
distances = []
for cluster in sorted(set(model.labels_)):
    df_filter = df3[df3['cluster']==cluster]
    centroid = df_filter[X_columns].mean().values
    for k, v in df_filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(1, -1), v.values.reshape(1, -1), squared=True)[0][0])
print('Inertia', np.sum(distances))

Inter Cluster distance 935630.2560916647
Intra Cluster distance 779798.7175865289
Inertia 1830782337968314.5


<font color=blue>***In fact, present DBSCAN clustering algorithm that we applied for these data does not bring any significant information - we made simple clustering of airports by its geolocation. Further analysis could provide information on average delay time in each cluster or flight prices. However, it is a subject of future analysis and will not be done in the frame of current project***</font>