In [1]:
import pandas as pd
!pip install plotly -q

In [2]:
# Load the raw pickup records from the CSV file and display (rows, columns).
raw_csv_path = 'uber-raw-data-aug14.csv'
df = pd.read_csv(raw_csv_path)
df.shape

(829275, 4)

In [3]:
# Downsample the dataset so the rest of the notebook runs quickly (the full
# file is too heavy to plot and cluster interactively).
# - random_state pins the sample, so every table, map and cluster below is
#   reproducible after a kernel restart.
# - min(...) avoids the ValueError pandas raises when asked for more rows than
#   the frame contains.
df = df.sample(n=min(10000, len(df)), random_state=42)

In [4]:
# Add calendar columns derived from the pickup timestamp: date, month,
# day of week and hour of day.
from datetime import datetime  # not used in this cell; kept in case other cells rely on it
df['Date/Time'] = pd.to_datetime(df['Date/Time'])

# The vectorised .dt accessor extracts each component for the whole column at
# once instead of calling a Python lambda once per row.
pickup_time = df['Date/Time'].dt
df['Date'] = pickup_time.date
df['Month'] = pickup_time.month
df['Day'] = pickup_time.weekday  # day of the week as an integer, Monday=0 ... Sunday=6
df['Hour'] = pickup_time.hour

df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Date,Month,Day,Hour
601224,2014-08-31 13:21:00,40.7326,-73.9912,B02617,2014-08-31,8,6,13
535292,2014-08-24 22:58:00,40.6885,-73.9806,B02617,2014-08-24,8,6,22
27194,2014-08-27 01:40:00,40.7493,-73.9798,B02512,2014-08-27,8,2,1
11432,2014-08-11 09:33:00,40.752,-73.9857,B02512,2014-08-11,8,0,9
411703,2014-08-14 07:36:00,40.7793,-73.9466,B02617,2014-08-14,8,3,7


In [5]:
# Round latitude and longitude to 2 decimals so that nearby pickups collapse
# onto the same approximate position, which gives denser points on the map.

for coord in ('Lat', 'Lon'):
    df[coord + '_round'] = df[coord].round(2)
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Date,Month,Day,Hour,Lat_round,Lon_round
601224,2014-08-31 13:21:00,40.7326,-73.9912,B02617,2014-08-31,8,6,13,40.73,-73.99
535292,2014-08-24 22:58:00,40.6885,-73.9806,B02617,2014-08-24,8,6,22,40.69,-73.98
27194,2014-08-27 01:40:00,40.7493,-73.9798,B02512,2014-08-27,8,2,1,40.75,-73.98
11432,2014-08-11 09:33:00,40.752,-73.9857,B02512,2014-08-11,8,0,9,40.75,-73.99
411703,2014-08-14 07:36:00,40.7793,-73.9466,B02617,2014-08-14,8,3,7,40.78,-73.95


In [6]:
# Add a text column identifying each rounded position: the rounded latitude
# and rounded longitude are converted to strings and concatenated directly
# (no separator is inserted between the two parts).

lat_text = df['Lat_round'].astype(str)
lon_text = df['Lon_round'].astype(str)
df['Lat_Lon'] = lat_text + lon_text
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Date,Month,Day,Hour,Lat_round,Lon_round,Lat_Lon
601224,2014-08-31 13:21:00,40.7326,-73.9912,B02617,2014-08-31,8,6,13,40.73,-73.99,40.73-73.99
535292,2014-08-24 22:58:00,40.6885,-73.9806,B02617,2014-08-24,8,6,22,40.69,-73.98,40.69-73.98
27194,2014-08-27 01:40:00,40.7493,-73.9798,B02512,2014-08-27,8,2,1,40.75,-73.98,40.75-73.98
11432,2014-08-11 09:33:00,40.752,-73.9857,B02512,2014-08-11,8,0,9,40.75,-73.99,40.75-73.99
411703,2014-08-14 07:36:00,40.7793,-73.9466,B02617,2014-08-14,8,3,7,40.78,-73.95,40.78-73.95


In [7]:
# Group by rounded position to obtain the number of requests per approximate
# location, most requested locations first.

new_df = (
    df[['Lat_round', 'Lon_round', 'Date']]
    .groupby(['Lat_round', 'Lon_round'])
    .count()                                  # 'Date' now holds the row count per position
    .sort_values(by='Date', ascending=False)
    .reset_index()
    .rename(columns={"Date": "number_of_demand"})
)
new_df.head()

Unnamed: 0,Lat_round,Lon_round,number_of_demand
0,40.76,-73.97,406
1,40.76,-73.98,400
2,40.75,-73.99,342
3,40.74,-73.99,339
4,40.73,-74.0,333


In [8]:
# Map of the number of requests per rounded position: marker size and colour
# both encode 'number_of_demand'.

import plotly.express as px
import matplotlib.pyplot as plt

demand_map_options = dict(
    lat="Lat_round",
    lon="Lon_round",
    mapbox_style="carto-positron",
    zoom=10,
    size='number_of_demand',
    color='number_of_demand',
    # RdYlGn colour list, reversed
    color_continuous_scale=px.colors.diverging.RdYlGn[::-1],
)
fig = px.scatter_mapbox(new_df, **demand_map_options)
fig.show("iframe_connected")

In [9]:
# Count rows per (hour, rounded position). After count() every remaining
# column holds the group size; the 'Lat' count is copied into an explicitly
# named 'number_of_demand' column.
hourly_counts = df.groupby(['Hour', 'Lat_round', 'Lon_round']).count()
hourly_counts = hourly_counts.sort_values(by='Hour', ascending=True)
new_df_1 = hourly_counts.reset_index()
new_df_1['number_of_demand'] = new_df_1['Lat']
new_df_1.head()

Unnamed: 0,Hour,Lat_round,Lon_round,Date/Time,Lat,Lon,Base,Date,Month,Day,Lat_Lon,number_of_demand
0,0,40.58,-73.96,1,1,1,1,1,1,1,1,1
1,0,40.76,-73.97,3,3,3,3,3,3,3,3,3
2,0,40.76,-73.98,6,6,6,6,6,6,6,6,6
3,0,40.76,-73.99,5,5,5,5,5,5,5,5,5
4,0,40.75,-73.98,4,4,4,4,4,4,4,4,4


In [10]:
# Requests per hour: same bubble map as before, animated with one frame per
# value of 'Hour'.

hourly_map_options = dict(
    lat="Lat_round",
    lon="Lon_round",
    mapbox_style="carto-positron",
    zoom=10,
    size='number_of_demand',
    animation_frame="Hour",
    color='number_of_demand',
    # RdYlGn colour list, reversed
    color_continuous_scale=px.colors.diverging.RdYlGn[::-1],
)
fig = px.scatter_mapbox(new_df_1, **hourly_map_options)
fig.show("iframe_connected")

In [11]:
# Quick look at the first raw longitude values.
df.loc[:, 'Lon'].head()

601224   -73.9912
535292   -73.9806
27194    -73.9798
11432    -73.9857
411703   -73.9466
Name: Lon, dtype: float64

In [12]:
# Keep only the columns relevant for clustering: the raw coordinates.
# They are selected by name rather than as "every numeric column minus a drop
# list", so X stays limited to Lat/Lon even if this cell is re-run after other
# numeric columns (for example cluster labels) have been added to df.
X = df[["Lat", "Lon"]]

In [13]:
# Show the first rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Lat,Lon
601224,40.7326,-73.9912
535292,40.6885,-73.9806
27194,40.7493,-73.9798
11432,40.752,-73.9857
411703,40.7793,-73.9466


1. Méthode KMEANS

In [14]:
# import KMeans from sklearn
from sklearn.cluster import KMeans

# Instantiate KMeans. The fixed random_state makes the centroid initialisation,
# and therefore the cluster numbering, reproducible from one run to the next.
kmeans = KMeans(n_clusters=10, random_state=42)

# Fit on the data
kmeans.fit(X)

# Visualisation with plotly
import plotly.graph_objects as go

fig = go.Figure()

# Predict once, outside the loop: the assignment of points to clusters does not
# change between iterations, so recomputing it per cluster is wasted work.
cluster_ids = kmeans.predict(X)

# Add one trace per cluster (10 clusters here, hence 10 add_trace calls)
for i in range(kmeans.n_clusters):
    label = X[cluster_ids == i]  # rows of X assigned to cluster i
    fig.add_trace(go.Scatter(x=label.iloc[:, 0], y=label.iloc[:, 1], mode="markers", name="Cluster {}".format(i)))

# Label the axes with the names of the plotted columns so the figure stands alone.
fig.update_layout(xaxis_title=X.columns[0], yaxis_title=X.columns[1])

fig.show('iframe_connected')

In [15]:
# Store the KMeans cluster index of every row in the dataframe.
kmeans_labels = kmeans.predict(X)
df['no_Cluster_km'] = kmeans_labels

In [16]:
# Check that the cluster column was added.
df.head(n=5)

Unnamed: 0,Date/Time,Lat,Lon,Base,Date,Month,Day,Hour,Lat_round,Lon_round,Lat_Lon,no_Cluster_km
601224,2014-08-31 13:21:00,40.7326,-73.9912,B02617,2014-08-31,8,6,13,40.73,-73.99,40.73-73.99,8
535292,2014-08-24 22:58:00,40.6885,-73.9806,B02617,2014-08-24,8,6,22,40.69,-73.98,40.69-73.98,3
27194,2014-08-27 01:40:00,40.7493,-73.9798,B02512,2014-08-27,8,2,1,40.75,-73.98,40.75-73.98,0
11432,2014-08-11 09:33:00,40.752,-73.9857,B02512,2014-08-11,8,0,9,40.75,-73.99,40.75-73.99,0
411703,2014-08-14 07:36:00,40.7793,-73.9466,B02617,2014-08-14,8,3,7,40.78,-73.95,40.78-73.95,0


In [17]:
# Maps for the KMeans method

# Order the rows by hour before plotting (presumably so that the animation
# frames appear in chronological order).
df = df.sort_values(by="Hour")

kmeans_map_options = dict(
    lat="Lat",
    lon="Lon",
    mapbox_style="carto-positron",
    zoom=10,
    animation_frame="Hour",
    color='no_Cluster_km',
)
fig = px.scatter_mapbox(df, **kmeans_map_options)
fig.show("iframe_connected")

2. Méthode DBSCAN

In [18]:
from sklearn.cluster import DBSCAN

# Instantiate DBSCAN. With the euclidean metric applied to raw Lat/Lon values,
# eps is expressed in the same unit as those columns.
db = DBSCAN(eps=0.005, min_samples=5, metric="euclidean", algorithm="brute")

# Rebuild the feature matrix from the current df so rows and labels stay aligned.
X = df[["Lat","Lon"]]

# Fit on the data
db.fit(X)

# Visualisation with plotly
import plotly.graph_objects as go
import numpy as np

fig = go.Figure()

# Add one trace per distinct label. Unlike KMeans, the number of clusters is not
# fixed in advance: it is whatever DBSCAN found, plus the special label -1 that
# scikit-learn assigns to noise points.

for i in np.unique(db.labels_):
    label = X[db.labels_ == i]
    # Noise is not a cluster, so give it its own legend entry.
    trace_name = "Noise" if i == -1 else "Cluster {}".format(i)
    fig.add_trace(go.Scatter(x=label.iloc[:, 0], y=label.iloc[:, 1], mode="markers", name=trace_name))

# Label the axes with the names of the plotted columns so the figure stands alone.
fig.update_layout(xaxis_title=X.columns[0], yaxis_title=X.columns[1])

fig.show('iframe_connected')

In [19]:
# Store the DBSCAN label of every row in the dataframe.
dbscan_labels = db.labels_
df['no_Cluster_db'] = dbscan_labels

In [22]:
# Maps for the DBSCAN method

# Order the rows by hour before plotting (presumably so that the animation
# frames appear in chronological order).
df = df.sort_values(by="Hour")

dbscan_map_options = dict(
    lat="Lat",
    lon="Lon",
    mapbox_style="carto-positron",
    animation_frame="Hour",
    zoom=9,
    color='no_Cluster_db',
)
fig = px.scatter_mapbox(df, **dbscan_map_options)
fig.show("iframe_connected")