In [1]:
# Import Librairies

import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import  silhouette_score

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
pio.renderers.default = "iframe_connected"

In [2]:
uberdata = pd.read_csv("uber-raw-data-apr14.csv")

In [3]:
# Basic statistics

print("Number of rows : {}".format(uberdata.shape[0]))
print()

print("Display of dataset: ")
display(uberdata.head())
print()

print("Basics statistics: ")
data_desc = uberdata.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*uberdata.isnull().sum()/uberdata.shape[0])
print()

print("Details of dataset with info: ")
display(uberdata.info())
print()


Number of rows : 564516

Display of dataset: 


Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512



Basics statistics: 


Unnamed: 0,Date/Time,Lat,Lon,Base
count,564516,564516.0,564516.0,564516
unique,41999,,,5
top,4/7/2014 20:21:00,,,B02682
freq,97,,,227808
mean,,40.740005,-73.976817,
std,,0.036083,0.050426,
min,,40.0729,-74.7733,
25%,,40.7225,-73.9977,
50%,,40.7425,-73.9848,
75%,,40.7607,-73.97,



Percentage of missing values: 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64


Details of dataset with info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564516 entries, 0 to 564515
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Date/Time  564516 non-null  object 
 1   Lat        564516 non-null  float64
 2   Lon        564516 non-null  float64
 3   Base       564516 non-null  object 
dtypes: float64(2), object(2)
memory usage: 17.2+ MB


None




In [4]:
#convert date/time

uberdata['Date/Time'] = pd.to_datetime(uberdata['Date/Time'])
uberdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564516 entries, 0 to 564515
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Date/Time  564516 non-null  datetime64[ns]
 1   Lat        564516 non-null  float64       
 2   Lon        564516 non-null  float64       
 3   Base       564516 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 17.2+ MB


In [5]:
# Focus on the 2nd of April bewteen 17h to 18 h PM

mask = (uberdata['Date/Time'] > '2014-04-02 17:00:00') & (uberdata['Date/Time'] <= '2014-04-02 18:00:00')
onehour=uberdata.loc[mask]
print(onehour)

                 Date/Time      Lat      Lon    Base
1837   2014-04-02 17:01:00  40.7574 -73.9862  B02512
1838   2014-04-02 17:01:00  40.7605 -73.9821  B02512
1839   2014-04-02 17:01:00  40.7518 -73.9758  B02512
1840   2014-04-02 17:02:00  40.7174 -74.0027  B02512
1841   2014-04-02 17:02:00  40.7717 -73.9827  B02512
...                    ...      ...      ...     ...
555160 2014-04-02 17:52:00  40.7495 -73.9887  B02764
555161 2014-04-02 17:54:00  40.7331 -73.9899  B02764
555162 2014-04-02 17:58:00  40.7596 -73.9765  B02764
555163 2014-04-02 17:58:00  40.7616 -73.9728  B02764
555164 2014-04-02 18:00:00  40.7065 -74.0056  B02764

[1589 rows x 4 columns]


In [6]:
# witch to dataframe

onehour=pd.DataFrame(onehour)
onehour.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
1837,2014-04-02 17:01:00,40.7574,-73.9862,B02512
1838,2014-04-02 17:01:00,40.7605,-73.9821,B02512
1839,2014-04-02 17:01:00,40.7518,-73.9758,B02512
1840,2014-04-02 17:02:00,40.7174,-74.0027,B02512
1841,2014-04-02 17:02:00,40.7717,-73.9827,B02512


In [7]:
#Size of the database

onehour.shape

(1589, 4)

In [8]:
# vizualisation of the observations 

fig = px.scatter_mapbox(
        onehour, 
        lat="Lat", 
        lon="Lon",
        color="Base",
        mapbox_style="carto-positron"
)

fig.show()

# We can see a mixture of points on the map according to the indicated base

In [9]:
#pre processing

numeric_features = [1,2] 
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

X= preprocessor.fit_transform(onehour)
print(X[0:5, :])


[[ 0.43701056 -0.12766065]
 [ 0.54071123 -0.01598008]
 [ 0.24968032  0.15562665]
 [-0.90106257 -0.57710684]
 [ 0.9153717  -0.03232358]]


### Kmeans model ###

In [10]:
# In order to select K optimal cluster, we re going to use two models for Kmeans

# Elbow model

wcss =  []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    
print(wcss)

[2316.3743716204854, 1505.8144861727228, 1159.2703386915402, 935.4065775342615, 797.5973766655769, 647.1458436088635, 515.1706651656947, 425.9386062492953, 364.8530873544916]


In [11]:
#graph Elbow

fig = px.line(x = range(2,11), y = wcss)
fig.show()


In [12]:
# Silhouette score

s_score = []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    s_score.append(silhouette_score(X, kmeans.predict(X)))

print(s_score)

[0.7699729901683824, 0.47556460002719847, 0.5177720022570272, 0.5224301461279867, 0.5153453937758494, 0.39674740148001797, 0.41442189072636165, 0.42399834942752, 0.4296520159518935]


In [13]:
# Graph Silhouette score 

fig = px.bar(x = range(2,11), y = s_score)
fig.show()

In [14]:
# Retraining with optimized K = 4

kmeans = KMeans(n_clusters= 4)
kmeans.fit(X)

KMeans(n_clusters=4)

In [15]:
#addition of Cluster_KMeans column and check number of observations per cluster

onehour.loc[:,'Cluster_KMeans'] = kmeans.predict(X)
onehour.head()
display(onehour.head())
print()
onehour["Cluster_KMeans"].value_counts()
display (onehour["Cluster_KMeans"].value_counts())

Unnamed: 0,Date/Time,Lat,Lon,Base,Cluster_KMeans
1837,2014-04-02 17:01:00,40.7574,-73.9862,B02512,0
1838,2014-04-02 17:01:00,40.7605,-73.9821,B02512,0
1839,2014-04-02 17:01:00,40.7518,-73.9758,B02512,0
1840,2014-04-02 17:02:00,40.7174,-74.0027,B02512,1
1841,2014-04-02 17:02:00,40.7717,-73.9827,B02512,0





0    977
1    553
2     35
3     24
Name: Cluster_KMeans, dtype: int64

In [16]:
# Display the clusters on the map

fig = px.scatter_mapbox(onehour, lat='Lat', lon='Lon', opacity=0.5, mapbox_style="carto-positron",color = "Cluster_KMeans",
  zoom=10)
fig.show()

### DBscan model ###

In [17]:
# Instanciate DBSCAN with LOW min_samples

db = DBSCAN(eps=0.2, min_samples=10, metric="manhattan")

db.fit(X)

DBSCAN(eps=0.2, metric='manhattan', min_samples=10)

In [18]:
#Find out how many clusters DBSCAN created.

np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5])

In [19]:
#Add a new column "cluster" to data_sample where each observations are going to be the label of the corresponding cluster.

onehour["cluster_DBSCAN"] = db.labels_

onehour.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Cluster_KMeans,cluster_DBSCAN
1837,2014-04-02 17:01:00,40.7574,-73.9862,B02512,0,0
1838,2014-04-02 17:01:00,40.7605,-73.9821,B02512,0,0
1839,2014-04-02 17:01:00,40.7518,-73.9758,B02512,0,0
1840,2014-04-02 17:02:00,40.7174,-74.0027,B02512,1,0
1841,2014-04-02 17:02:00,40.7717,-73.9827,B02512,0,0


In [20]:
#Visualize all the clusters on a map except all the ones that DBSCAN considered as outliers.

fig = px.scatter_mapbox(onehour[onehour.cluster_DBSCAN != -1], lat='Lat', lon='Lon', opacity=0.5, mapbox_style="carto-positron",color = "cluster_DBSCAN",
  zoom=10)
fig.show(renderer="iframe")

## Generalize clustering for each day of April between 17 to 18 hours ##

In [21]:
#Rename first column
uberdata.rename(columns = {'Date/Time':'Timestamp'}, inplace = True)

In [22]:
uberdata['weekday'] = uberdata.Timestamp.dt.day_name()
uberdata['month'] = uberdata.Timestamp.dt.month
uberdata['day'] = uberdata.Timestamp.dt.day
uberdata['hour'] = uberdata.Timestamp.dt.hour
uberdata['minute'] = uberdata.Timestamp.dt.minute
uberdata


Unnamed: 0,Timestamp,Lat,Lon,Base,weekday,month,day,hour,minute
0,2014-04-01 00:11:00,40.7690,-73.9549,B02512,Tuesday,4,1,0,11
1,2014-04-01 00:17:00,40.7267,-74.0345,B02512,Tuesday,4,1,0,17
2,2014-04-01 00:21:00,40.7316,-73.9873,B02512,Tuesday,4,1,0,21
3,2014-04-01 00:28:00,40.7588,-73.9776,B02512,Tuesday,4,1,0,28
4,2014-04-01 00:33:00,40.7594,-73.9722,B02512,Tuesday,4,1,0,33
...,...,...,...,...,...,...,...,...,...
564511,2014-04-30 23:22:00,40.7640,-73.9744,B02764,Wednesday,4,30,23,22
564512,2014-04-30 23:26:00,40.7629,-73.9672,B02764,Wednesday,4,30,23,26
564513,2014-04-30 23:31:00,40.7443,-73.9889,B02764,Wednesday,4,30,23,31
564514,2014-04-30 23:32:00,40.6756,-73.9405,B02764,Wednesday,4,30,23,32


In [23]:

mask = (uberdata['hour'] >= 17) & (uberdata['hour'] < 18)
day17_18=uberdata.loc[mask]
print(day17_18)

                 Timestamp      Lat      Lon    Base    weekday  month  day  \
634    2014-04-01 17:00:00  40.7591 -73.9670  B02512    Tuesday      4    1   
635    2014-04-01 17:00:00  40.7701 -73.9625  B02512    Tuesday      4    1   
636    2014-04-01 17:02:00  40.7789 -73.9559  B02512    Tuesday      4    1   
637    2014-04-01 17:02:00  40.7789 -73.9559  B02512    Tuesday      4    1   
638    2014-04-01 17:02:00  40.7330 -73.9824  B02512    Tuesday      4    1   
...                    ...      ...      ...     ...        ...    ...  ...   
564281 2014-04-30 17:51:00  40.7587 -73.9816  B02764  Wednesday      4   30   
564282 2014-04-30 17:54:00  40.7617 -73.9788  B02764  Wednesday      4   30   
564283 2014-04-30 17:54:00  40.7640 -73.9558  B02764  Wednesday      4   30   
564284 2014-04-30 17:57:00  40.7401 -74.0021  B02764  Wednesday      4   30   
564285 2014-04-30 17:58:00  40.7498 -73.9733  B02764  Wednesday      4   30   

        hour  minute  
634       17       0  
635  

In [24]:
#pre processing

numeric_features = [1,2] 
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

X= preprocessor.fit_transform(day17_18)
print(X[0:5, :])


[[ 0.51122456  0.27436615]
 [ 0.85223172  0.3764301 ]
 [ 1.12503745  0.52612389]
 [ 1.12503745  0.52612389]
 [-0.29789243 -0.07491937]]


### Kmeans model ###

In [25]:
# In order to select K optimal cluster, we re going to use two models for Kmeans

# Elbow model

wcss =  []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    
print(wcss)

[64356.30739192415, 42153.10091499711, 33510.45261166789, 28130.663337466773, 21838.748485648266, 18414.59541292935, 15452.863690764363, 13653.441226539031, 12387.984178718834]


In [26]:
#graph Elbow

fig = px.line(x = range(2,11), y = wcss)
fig.show()



In [27]:
# Silhouette score

s_score = []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    s_score.append(silhouette_score(X, kmeans.predict(X)))

print(s_score)

[0.770673004661724, 0.45217615220636037, 0.4930649838571283, 0.499759319687176, 0.4812023523980493, 0.4844629445145224, 0.4576302531786767, 0.4178092846090457, 0.422417441486769]


In [28]:
# Graph Silhouette score 

fig = px.bar(x = range(2,11), y = s_score)
fig.show()

In [29]:
# Retraining with optimized K = 4


kmeans = KMeans(n_clusters= 4)
kmeans.fit(X)

KMeans(n_clusters=4)

In [30]:
#addition of Cluster_KMeans column and check number of observations per cluster

day17_18.loc[:,'Cluster_KMeans'] = kmeans.predict(X)
day17_18.head()
display(day17_18.head())
print()
day17_18["Cluster_KMeans"].value_counts()
display (day17_18["Cluster_KMeans"].value_counts())



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Timestamp,Lat,Lon,Base,weekday,month,day,hour,minute,Cluster_KMeans
634,2014-04-01 17:00:00,40.7591,-73.967,B02512,Tuesday,4,1,17,0,1
635,2014-04-01 17:00:00,40.7701,-73.9625,B02512,Tuesday,4,1,17,0,1
636,2014-04-01 17:02:00,40.7789,-73.9559,B02512,Tuesday,4,1,17,2,1
637,2014-04-01 17:02:00,40.7789,-73.9559,B02512,Tuesday,4,1,17,2,1
638,2014-04-01 17:02:00,40.733,-73.9824,B02512,Tuesday,4,1,17,2,0





1    27655
0    15377
3     1510
2      933
Name: Cluster_KMeans, dtype: int64

In [31]:
# Display the clusters on the map

fig = px.scatter_mapbox(day17_18, lat='Lat', lon='Lon', opacity=0.5, mapbox_style="carto-positron",color = "Cluster_KMeans",
  zoom=10)
fig.show()

### DBscan model ###

In [32]:
# Instanciate DBSCAN with LOW min_samples

db = DBSCAN(eps=0.2, min_samples=30, metric="manhattan")

db.fit(X)

DBSCAN(eps=0.2, metric='manhattan', min_samples=30)

In [33]:
#Find out how many clusters DBSCAN created.

np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [34]:
#Add a new column "cluster" to data_sample where each observations are going to be the label of the corresponding cluster.
day17_18["cluster_DBSCAN"] = db.labels_

day17_18.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Timestamp,Lat,Lon,Base,weekday,month,day,hour,minute,Cluster_KMeans,cluster_DBSCAN
634,2014-04-01 17:00:00,40.7591,-73.967,B02512,Tuesday,4,1,17,0,1,0
635,2014-04-01 17:00:00,40.7701,-73.9625,B02512,Tuesday,4,1,17,0,1,0
636,2014-04-01 17:02:00,40.7789,-73.9559,B02512,Tuesday,4,1,17,2,1,0
637,2014-04-01 17:02:00,40.7789,-73.9559,B02512,Tuesday,4,1,17,2,1,0
638,2014-04-01 17:02:00,40.733,-73.9824,B02512,Tuesday,4,1,17,2,0,0


In [35]:
#Visualize all the clusters on a map except all the ones that DBSCAN considered as outliers.

fig = px.scatter_mapbox(day17_18[day17_18.cluster_DBSCAN != -1], lat='Lat', lon='Lon', opacity=0.5, mapbox_style="carto-positron",color = "cluster_DBSCAN",
  zoom=10)
fig.show(renderer="iframe")

In [36]:
#Conclusion
# Kmeans is not appropriate for non-globular clusters
# DBscan is based on the density calculus but more sensible to the noise
# In my opinion, Dbscan shows only 1 cluster in the center of NY and covers a very large hot zone, whereas Kmeans 
# is more appriopriate for the center but not at all in the suburb (too spread out) due to the shape of observations




