In [11]:
# Librairies

! pip install plotly
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px

import pandas as pd
import numpy as np


from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import  silhouette_score

import warnings
# Hide DeprecationWarning messages so they do not clutter the cell outputs
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Default plotly renderer, used by every fig.show() call that does not pass its own renderer
pio.renderers.default = "iframe_connected"



In [22]:
# Load the raw April 2014 CSV (path is relative to the notebook's working directory)
uberdata = pd.read_csv("uber-raw-data-apr14.csv")

In [17]:
# Preview the freshly loaded frame (pandas truncates the display to the first and last rows)
uberdata

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.7690,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512
...,...,...,...,...
564511,4/30/2014 23:22:00,40.7640,-73.9744,B02764
564512,4/30/2014 23:26:00,40.7629,-73.9672,B02764
564513,4/30/2014 23:31:00,40.7443,-73.9889,B02764
564514,4/30/2014 23:32:00,40.6756,-73.9405,B02764


In [18]:
# Basic statistics: row count, preview, summary statistics, missing values and dtypes

print("Number of rows : {}".format(uberdata.shape[0]))
print()

print("Display of dataset: ")
display(uberdata.head())
print()

print("Basics statistics: ")
# include='all' also summarises the non-numeric columns (unique / top / freq)
data_desc = uberdata.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*uberdata.isnull().sum()/uberdata.shape[0])
print()

print("Details of dataset with info: ")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display() (that only adds a stray "None" to the output).
uberdata.info()
print()


Number of rows : 564516

Display of dataset: 


Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512



Basics statistics: 


Unnamed: 0,Date/Time,Lat,Lon,Base
count,564516,564516.0,564516.0,564516
unique,41999,,,5
top,4/7/2014 20:21:00,,,B02682
freq,97,,,227808
mean,,40.740005,-73.976817,
std,,0.036083,0.050426,
min,,40.0729,-74.7733,
25%,,40.7225,-73.9977,
50%,,40.7425,-73.9848,
75%,,40.7607,-73.97,



Percentage of missing values: 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64


Details of dataset with info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564516 entries, 0 to 564515
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Date/Time  564516 non-null  object 
 1   Lat        564516 non-null  float64
 2   Lon        564516 non-null  float64
 3   Base       564516 non-null  object 
dtypes: float64(2), object(2)
memory usage: 17.2+ MB


None




In [23]:
# Discard the columns that are not needed and convert 'Date/Time' to datetime

useless_cols = ['Base']

# errors='ignore' keeps this cell re-runnable: once 'Base' has been dropped,
# running the cell again is a no-op instead of raising a KeyError.
uberdata = uberdata.drop(columns=useless_cols, errors='ignore')
print(uberdata.head())

# Convert Date/Time. The raw strings look like "4/1/2014 0:11:00" (month/day/year);
# an explicit format removes any month/day ambiguity and avoids slow per-value inference.
uberdata['Date/Time'] = pd.to_datetime(uberdata['Date/Time'], format='%m/%d/%Y %H:%M:%S')


          Date/Time      Lat      Lon
0  4/1/2014 0:11:00  40.7690 -73.9549
1  4/1/2014 0:17:00  40.7267 -74.0345
2  4/1/2014 0:21:00  40.7316 -73.9873
3  4/1/2014 0:28:00  40.7588 -73.9776
4  4/1/2014 0:33:00  40.7594 -73.9722


In [24]:
# Derive two extra columns from the timestamp: the calendar day and the time of day

timestamps = uberdata['Date/Time']
uberdata['date'] = timestamps.dt.date
uberdata['time'] = timestamps.dt.time


In [25]:
# Re-parse the 'date' column as pandas datetimes, then show the resulting frame
date_as_datetime = pd.to_datetime(uberdata['date'])
uberdata['date'] = date_as_datetime
uberdata

Unnamed: 0,Date/Time,Lat,Lon,date,time
0,2014-04-01 00:11:00,40.7690,-73.9549,2014-04-01,00:11:00
1,2014-04-01 00:17:00,40.7267,-74.0345,2014-04-01,00:17:00
2,2014-04-01 00:21:00,40.7316,-73.9873,2014-04-01,00:21:00
3,2014-04-01 00:28:00,40.7588,-73.9776,2014-04-01,00:28:00
4,2014-04-01 00:33:00,40.7594,-73.9722,2014-04-01,00:33:00
...,...,...,...,...,...
564511,2014-04-30 23:22:00,40.7640,-73.9744,2014-04-30,23:22:00
564512,2014-04-30 23:26:00,40.7629,-73.9672,2014-04-30,23:26:00
564513,2014-04-30 23:31:00,40.7443,-73.9889,2014-04-30,23:31:00
564514,2014-04-30 23:32:00,40.6756,-73.9405,2014-04-30,23:32:00


In [26]:
# Focus on April 2nd, 2014 between 17:00 and 18:00 (5-6 PM).
# The lower bound is exclusive and the upper bound inclusive: 17:00:00 is left out, 18:00:00 is kept.

mask = (uberdata['Date/Time'] > '2014-04-02 17:00:00') & (uberdata['Date/Time'] <= '2014-04-02 18:00:00')
# .copy() makes onehour an independent frame, so columns added to it later are
# never written to (or flagged as) a slice of uberdata.
onehour = uberdata.loc[mask].copy()
print(onehour)

                 Date/Time      Lat      Lon       date      time
1837   2014-04-02 17:01:00  40.7574 -73.9862 2014-04-02  17:01:00
1838   2014-04-02 17:01:00  40.7605 -73.9821 2014-04-02  17:01:00
1839   2014-04-02 17:01:00  40.7518 -73.9758 2014-04-02  17:01:00
1840   2014-04-02 17:02:00  40.7174 -74.0027 2014-04-02  17:02:00
1841   2014-04-02 17:02:00  40.7717 -73.9827 2014-04-02  17:02:00
...                    ...      ...      ...        ...       ...
555160 2014-04-02 17:52:00  40.7495 -73.9887 2014-04-02  17:52:00
555161 2014-04-02 17:54:00  40.7331 -73.9899 2014-04-02  17:54:00
555162 2014-04-02 17:58:00  40.7596 -73.9765 2014-04-02  17:58:00
555163 2014-04-02 17:58:00  40.7616 -73.9728 2014-04-02  17:58:00
555164 2014-04-02 18:00:00  40.7065 -74.0056 2014-04-02  18:00:00

[1589 rows x 5 columns]


In [27]:
# Rebuild the selection through the DataFrame constructor and preview its first rows

onehour = pd.DataFrame(data=onehour)
onehour.head(n=5)

Unnamed: 0,Date/Time,Lat,Lon,date,time
1837,2014-04-02 17:01:00,40.7574,-73.9862,2014-04-02,17:01:00
1838,2014-04-02 17:01:00,40.7605,-73.9821,2014-04-02,17:01:00
1839,2014-04-02 17:01:00,40.7518,-73.9758,2014-04-02,17:01:00
1840,2014-04-02 17:02:00,40.7174,-74.0027,2014-04-02,17:02:00
1841,2014-04-02 17:02:00,40.7717,-73.9827,2014-04-02,17:02:00


In [28]:
# Size of the one-hour selection as (rows, columns)

onehour.shape

(1589, 5)

In [29]:
# Pre-processing: standardise the coordinates that feed the clustering models

# Select the features by name rather than by position, so the selection keeps
# pointing at latitude/longitude even if columns are added, dropped or reordered.
numeric_features = ['Lat', 'Lon']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# No categorical feature is used; the (empty) branch is kept so one can be added later.
categorical_features = []
categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first'))
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# X holds one row per pickup and one standardised column per numeric feature
X = preprocessor.fit_transform(onehour)
print(X[0:5, :])
print()


[[ 0.43701056 -0.12766065]
 [ 0.54071123 -0.01598008]
 [ 0.24968032  0.15562665]
 [-0.90106257 -0.57710684]
 [ 0.9153717  -0.03232358]]



### Kmeans model ###

In [34]:
# Elbow method: within-cluster sum of squares (inertia) for k = 2..10

wcss = []
for i in range(2, 11):
    # KMeans starts from random centroids: a fixed random_state makes the curve
    # reproducible across runs, and an explicit n_init makes the number of
    # restarts independent of the installed scikit-learn version's default.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

print(wcss)

[2316.3743716204854, 1505.8144861727226, 1159.2154210286922, 935.4065775342615, 765.1910662709124, 625.4681490361298, 510.49319649754705, 426.0039479978043, 363.45487943514905]


In [35]:
# Elbow graph: inertia against the number of clusters
fig = px.line(
    x=list(range(2, 11)),
    y=wcss,
    # title and axis labels so the figure can be read on its own
    title="Elbow method",
    labels={"x": "Number of clusters (k)", "y": "WCSS (inertia)"},
)
fig.show()

In [36]:
# Silhouette score for k = 2..10

s_score = []
for i in range(2, 11):
    # Fixed random_state / explicit n_init: reproducible scores across runs and versions
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    # fit_predict returns the label of every training sample directly,
    # which avoids a second pass over X with predict().
    cluster_labels = kmeans.fit_predict(X)
    s_score.append(silhouette_score(X, cluster_labels))

print(s_score)

[0.7699729901683824, 0.47556460002719847, 0.5177720022570272, 0.5224301461279867, 0.5152663780084531, 0.5182366823751762, 0.417701843156711, 0.42399834942752, 0.4302639371822176]


In [37]:
# Silhouette score graph: one bar per candidate number of clusters

fig = px.bar(
    x=list(range(2, 11)),
    y=s_score,
    # title and axis labels so the figure can be read on its own
    title="Silhouette score per number of clusters",
    labels={"x": "Number of clusters (k)", "y": "Silhouette score"},
)
fig.show()

In [38]:
# Retraining with the selected K
# NOTE(review): in the silhouette scores recorded in this notebook the maximum is at k=2;
# k=6 is kept from the original analysis — confirm the rationale for this choice.

# Fixed random_state / explicit n_init so the cluster assignment (and the label
# numbering shown further down) is the same on every run.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
kmeans.fit(X)

KMeans(n_clusters=6)

In [39]:
# Add the Cluster_KMeans column: the KMeans label of every row of onehour

onehour.loc[:, 'Cluster_KMeans'] = kmeans.predict(X)

# Only display() calls and the last expression of a cell produce output; the bare
# mid-cell expressions that duplicated these two calls were dead code and are removed.
display(onehour.head())
print()
display(onehour["Cluster_KMeans"].value_counts())
print()
onehour

Unnamed: 0,Date/Time,Lat,Lon,date,time,Cluster_KMeans
1837,2014-04-02 17:01:00,40.7574,-73.9862,2014-04-02,17:01:00,2
1838,2014-04-02 17:01:00,40.7605,-73.9821,2014-04-02,17:01:00,2
1839,2014-04-02 17:01:00,40.7518,-73.9758,2014-04-02,17:01:00,2
1840,2014-04-02 17:02:00,40.7174,-74.0027,2014-04-02,17:02:00,5
1841,2014-04-02 17:02:00,40.7717,-73.9827,2014-04-02,17:02:00,2





2    749
0    502
5    273
1     35
3     24
4      6
Name: Cluster_KMeans, dtype: int64




Unnamed: 0,Date/Time,Lat,Lon,date,time,Cluster_KMeans
1837,2014-04-02 17:01:00,40.7574,-73.9862,2014-04-02,17:01:00,2
1838,2014-04-02 17:01:00,40.7605,-73.9821,2014-04-02,17:01:00,2
1839,2014-04-02 17:01:00,40.7518,-73.9758,2014-04-02,17:01:00,2
1840,2014-04-02 17:02:00,40.7174,-74.0027,2014-04-02,17:02:00,5
1841,2014-04-02 17:02:00,40.7717,-73.9827,2014-04-02,17:02:00,2
...,...,...,...,...,...,...
555160,2014-04-02 17:52:00,40.7495,-73.9887,2014-04-02,17:52:00,0
555161,2014-04-02 17:54:00,40.7331,-73.9899,2014-04-02,17:54:00,0
555162,2014-04-02 17:58:00,40.7596,-73.9765,2014-04-02,17:58:00,2
555163,2014-04-02 17:58:00,40.7616,-73.9728,2014-04-02,17:58:00,2


In [40]:
# Display the KMeans clusters on the map

fig = px.scatter_mapbox(
    # Cluster ids are categories, not quantities: casting them to str (on a temporary
    # copy, onehour itself is untouched) makes plotly draw one discrete colour per
    # cluster instead of a continuous colour scale.
    onehour.assign(Cluster_KMeans=onehour["Cluster_KMeans"].astype(str)),
    lat='Lat', lon='Lon', opacity=0.5, mapbox_style="carto-positron",
    color="Cluster_KMeans", zoom=10)
fig.show()

### DBscan model ###

In [45]:
# Instantiate DBSCAN with a low min_samples, then fit it on the pre-processed matrix X

dbscan_settings = {"eps": 0.2, "min_samples": 5, "metric": "manhattan"}
db = DBSCAN(**dbscan_settings)

db.fit(X)

DBSCAN(eps=0.2, metric='manhattan')

In [48]:
# List the distinct labels DBSCAN assigned, to find out how many clusters it created.
# NOTE(review): per scikit-learn's DBSCAN documentation, label -1 marks noise points,
# so it should not be counted as a cluster — confirm.

np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5])

In [51]:
# Store the DBSCAN label of every observation of onehour in a new "cluster_DBSCAN" column

dbscan_labels = db.labels_
onehour["cluster_DBSCAN"] = dbscan_labels

onehour.head()

Unnamed: 0,Date/Time,Lat,Lon,date,time,Cluster_KMeans,cluster_DBSCAN
1837,2014-04-02 17:01:00,40.7574,-73.9862,2014-04-02,17:01:00,2,0
1838,2014-04-02 17:01:00,40.7605,-73.9821,2014-04-02,17:01:00,2,0
1839,2014-04-02 17:01:00,40.7518,-73.9758,2014-04-02,17:01:00,2,0
1840,2014-04-02 17:02:00,40.7174,-74.0027,2014-04-02,17:02:00,5,0
1841,2014-04-02 17:02:00,40.7717,-73.9827,2014-04-02,17:02:00,2,0


In [53]:
# Visualize all the clusters on a map except the points that DBSCAN considered as outliers.

# The cluster_DBSCAN values shown in this notebook include -1, which DBSCAN uses for
# outliers; drop those rows so the map really shows clustered points only.
clustered = onehour[onehour["cluster_DBSCAN"] != -1]

fig = px.scatter_mapbox(clustered, lat='Lat', lon='Lon', opacity=0.5, mapbox_style="carto-positron",color = "cluster_DBSCAN",
  zoom=10)
fig.show(renderer="iframe")

In [26]:
# conclusion (to be filled in)