In [1]:
import pandas as pd

!pip install plotly==4.9.0
import plotly.io as pio
pio.renderers.default = "iframe_connected"

import numpy as np 

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import  silhouette_score

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "colab"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting plotly==4.9.0
  Downloading plotly-4.9.0-py2.py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting retrying>=1.3.3 (from plotly==4.9.0)
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: retrying, plotly
  Attempting uninstall: plotly
    Found existing installation: plotly 5.13.1
    Uninstalling plotly-5.13.1:
      Successfully uninstalled plotly-5.13.1
Successfully installed plotly-4.9.0 retrying-1.3.4


In [3]:
df = pd.read_csv('uber-raw-data-jun14.csv')
df = df.sample(30000)

In [4]:
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
236067,6/25/2014 21:10:00,40.74,-73.9907,B02598
290585,6/4/2014 14:29:00,40.7375,-73.9911,B02617
577400,6/18/2014 16:06:00,40.7517,-73.9938,B02682
54257,6/3/2014 23:03:00,40.7335,-73.9973,B02598
332242,6/11/2014 18:49:00,40.7475,-73.9816,B02617


In [5]:
# Basic stats
print("Number of rows : {}".format(df.shape[0]))
print("Number of columns : {}".format(df.shape[1]))
print()


print("Percentage of missing values: ")
display(100*df.isnull().sum()/df.shape[0])
print()
print()
print('infos')
display(df.info())
print()
print("basics stats...")
display(df.describe(include = "all"))

Number of rows : 30000
Number of columns : 4

Percentage of missing values: 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64



infos
<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 236067 to 363301
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date/Time  30000 non-null  object 
 1   Lat        30000 non-null  float64
 2   Lon        30000 non-null  float64
 3   Base       30000 non-null  object 
dtypes: float64(2), object(2)
memory usage: 1.1+ MB


None


basics stats...


Unnamed: 0,Date/Time,Lat,Lon,Base
count,30000,30000.0,30000.0,30000
unique,19778,,,5
top,6/13/2014 17:59:00,,,B02598
freq,8,,,10950
mean,,40.739913,-73.97422,
std,,0.037997,0.056051,
min,,40.3486,-74.7553,
25%,,40.722,-73.9967,
50%,,40.7434,-73.9835,
75%,,40.7613,-73.9673,


In [6]:
df['Date/Time']= pd.to_datetime(df['Date/Time'])

print()
print()

df['Dayofweek'] = df['Date/Time'].dt.dayofweek
df['Day'] = df['Date/Time'].dt.day
df['Hours'] = df['Date/Time'].dt.hour

# Let's drop the column Date/Time
df = df.drop(["Date/Time"], axis=1)

df.dtypes






Lat          float64
Lon          float64
Base          object
Dayofweek      int64
Day            int64
Hours          int64
dtype: object

In [7]:
df.head()

Unnamed: 0,Lat,Lon,Base,Dayofweek,Day,Hours
236067,40.74,-73.9907,B02598,2,25,21
290585,40.7375,-73.9911,B02617,2,4,14
577400,40.7517,-73.9938,B02682,2,18,16
54257,40.7335,-73.9973,B02598,1,3,23
332242,40.7475,-73.9816,B02617,2,11,18


In [8]:
print(df["Dayofweek"].unique())
print()
print(df["Day"].unique())
print()
print(df["Hours"].unique())


[2 1 3 6 0 4 5]

[25  4 18  3 11 12 26 29  8  9  6 21 13 15 30 27 19 14 10  7  2  1  5 28
 22 16 23 17 24 20]

[21 14 16 23 18  3  7 17  2  9  1 22  0 20 19 10  6  4 11  8 15 12 13  5]


In [9]:
# Let's see how many days we have in dataset
days_week = df.groupby("Dayofweek").nunique()
display(days_week)

Unnamed: 0_level_0,Lat,Lon,Base,Day,Hours
Dayofweek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1261,1126,5,5,24
1,1161,971,5,4,24
2,1205,1026,5,4,24
3,1249,1090,5,4,24
4,1251,1073,5,4,24
5,1267,1079,5,4,24
6,1286,1183,5,5,24


In [10]:
df = df.drop(["Base"], axis=1)

In [11]:
df.head()

Unnamed: 0,Lat,Lon,Dayofweek,Day,Hours
236067,40.74,-73.9907,2,25,21
290585,40.7375,-73.9911,2,4,14
577400,40.7517,-73.9938,2,18,16
54257,40.7335,-73.9973,1,3,23
332242,40.7475,-73.9816,2,11,18


In [None]:
## map without clustering
fig = px.scatter_mapbox(
        df, 
        lat="Lat", 
        lon="Lon",
        color="Dayofweek",
        mapbox_style="carto-positron"
)

fig.show()


In [None]:
df.head()

Unnamed: 0,Lat,Lon,Dayofweek,Day,Hours
178368,40.7405,-73.984,2,18,15
265980,40.7193,-73.9888,6,29,14
63482,40.76,-73.9739,3,5,2
222332,40.7208,-73.9953,1,24,9
277061,40.752,-73.9799,6,1,13


In [12]:
df_0 = df.loc[df['Dayofweek'] == 0]
df_1 = df.loc[df['Dayofweek'] == 1]
df_2 = df.loc[df['Dayofweek'] == 2]
df_3 = df.loc[df['Dayofweek'] == 3]
df_4 = df.loc[df['Dayofweek'] == 4]
df_5 = df.loc[df['Dayofweek'] == 5]
df_6 = df.loc[df['Dayofweek'] == 6]

In [None]:
df_4

Unnamed: 0,Lat,Lon,Dayofweek,Day,Hours
137456,40.7714,-73.9637,4,13,8
142327,40.7639,-73.9764,4,13,17
144014,40.7650,-73.9827,4,13,19
75615,40.8134,-73.9371,4,6,7
74930,40.7576,-73.9740,4,6,6
...,...,...,...,...,...
301485,40.7752,-73.9451,4,6,7
196527,40.7601,-73.9721,4,20,14
144050,40.7610,-73.9754,4,13,19
301944,40.7325,-73.9760,4,6,9


###KMEANS

---



---



In [42]:
numeric_features = ["Lat", "Lon", "Dayofweek", "Day", "Hours"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(df_4.head())
X = preprocessor.fit_transform(df_4) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  Dayofweek  Day  Hours  cluster_DB  cluster_DB_Lat  \
300871  40.7598 -73.9009          4    6      1          -1       40.738695   
15549   40.6488 -73.6778          4   13     20          -1       40.738695   
253102  40.7459 -73.9901          4   27     18          -1       40.738695   
140348  40.7732 -73.9547          4   13     14          -1       40.738695   
74777   40.7659 -73.9889          4    6      4          -1       40.738695   

        cluster_DB_Lon  
300871       -73.97354  
15549        -73.97354  
253102       -73.97354  
140348       -73.97354  
74777        -73.97354  
...Terminé.
[[ 0.53118059  1.4598624   0.         -1.35457275 -2.29701468]
 [-2.56522699  5.78752134  0.         -0.43197774  0.94620398]
 [ 0.14343225 -0.27042526  0.          1.41321226  0.60481254]
 [ 0.90498114  0.41625841  0.         -0.43197774 -0.07797033]
 [ 0.70134353 -0.24714784  0.         -1.35457275 -1.78492752]]



In [None]:
import warnings

# ignorer tous les avertissements de scikit-learn
warnings.filterwarnings(action='ignore', category=FutureWarning, module='sklearn')


wcss =  []
k = []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i, random_state = 0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    k.append(i)
    print("WCSS for K={} --> {}".format(i, wcss[-1]))
    


WCSS for K=2 --> 13396.673148270776
WCSS for K=3 --> 11228.061571122733
WCSS for K=4 --> 9255.410154845715
WCSS for K=5 --> 8054.286129345525
WCSS for K=6 --> 7097.976627781125
WCSS for K=7 --> 6436.008575001641
WCSS for K=8 --> 5842.363807156739
WCSS for K=9 --> 5433.8099416047735
WCSS for K=10 --> 5112.820409148041


In [None]:
# Create DataFrame
wcss_frame = pd.DataFrame(wcss)
k_frame = pd.Series(k)

# Create figure
fig= px.line(
    wcss_frame,
    x=k_frame,
    y=wcss_frame.iloc[:,-1]
)

# Create title and axis labels
fig.update_layout(
    yaxis_title="Inertia",
    xaxis_title="# Clusters",
    title="Inertia per cluster"
)

# Render
#fig.show(renderer="notebook")
fig.show(renderer="colab") # if using workspace

In [None]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
sil = [] = []
k= []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i, random_state = 0)
    kmeans.fit(X)
    sil.append(silhouette_score(X, kmeans.predict(X)))
    k.append(i)
    print("Silhouette score for K={} is {}".format(i, sil[-1]))



Silhouette score for K=2 is 0.253843471898721
Silhouette score for K=3 is 0.27273057391057587
Silhouette score for K=4 is 0.2699038695062056
Silhouette score for K=5 is 0.252771219669122
Silhouette score for K=6 is 0.2741593591405563
Silhouette score for K=7 is 0.27703111505796335
Silhouette score for K=8 is 0.2894060705498939
Silhouette score for K=9 is 0.2667445249654008
Silhouette score for K=10 is 0.2491377913106152


In [None]:
# Create a data frame 
cluster_scores=pd.DataFrame(sil)
k_frame = pd.Series(k)

# Create figure
fig = px.bar(data_frame=cluster_scores,  
             x=k, 
             y=cluster_scores.iloc[:, -1]
            )

# Add title and axis labels
fig.update_layout(
    yaxis_title="Silhouette Score",
    xaxis_title="# Clusters",
    title="Silhouette Score per cluster"
)

# Render
#fig.show(renderer="notebook")
fig.show(renderer="colab") # if using workspace

In [None]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 7)
kmeans.fit(X)

In [None]:
df_4_copy = df_4.copy()
df_4_copy['Cluster_KMeans'] = kmeans.predict(X)
df_4_copy.head()

Unnamed: 0,Lat,Lon,Dayofweek,Day,Hours,Cluster_KMeans
137456,40.7714,-73.9637,4,13,8,3
142327,40.7639,-73.9764,4,13,17,2
144014,40.765,-73.9827,4,13,19,2
75615,40.8134,-73.9371,4,6,7,3
74930,40.7576,-73.974,4,6,6,3


In [None]:
# Now let's do a map that allow us to see the pickups depending of clusters on the map
fig = px.scatter_mapbox(df_4_copy, 
                        lat = "Lat", 
                        lon = "Lon", 
                        mapbox_style = "carto-positron",
                        color = "Cluster_KMeans",  
                        zoom=4.5)
fig.show()

DBSCAN

In [15]:
# Instanciate DBSCAN 
db = DBSCAN(eps=0.3, min_samples=20, metric="manhattan", algorithm="brute")

# Fit on data 
## No need to normalize data, it already is! 
db.fit(X)

In [62]:
np.unique(db.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9])

In [63]:
df_4["cluster_DB"] = db.labels_
df_4.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Lat,Lon,Dayofweek,Day,Hours,cluster_DB,cluster_DB_Lat,cluster_DB_Lon
300871,40.7598,-73.9009,4,6,1,-1,40.738695,-73.97354
15549,40.6488,-73.6778,4,13,20,-1,40.738695,-73.97354
253102,40.7459,-73.9901,4,27,18,-1,40.738695,-73.97354
140348,40.7732,-73.9547,4,13,14,-1,40.738695,-73.97354
74777,40.7659,-73.9889,4,6,4,-1,40.738695,-73.97354


In [18]:
df_4["cluster_DB"] = db.labels_
df_4.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Lat,Lon,Dayofweek,Day,Hours,cluster_DB
300871,40.7598,-73.9009,4,6,1,-1
15549,40.6488,-73.6778,4,13,20,-1
253102,40.7459,-73.9901,4,27,18,-1
140348,40.7732,-73.9547,4,13,14,-1
74777,40.7659,-73.9889,4,6,4,-1


In [19]:
db.components_

array([[ 0.55070748,  0.17184558,  0.        , -0.43197774,  0.43411682],
       [ 0.09879935, -0.14239949,  0.        , -1.35457275,  0.60481254],
       [ 0.29685785, -0.11136294,  0.        , -1.35457275,  0.26342111],
       ...,
       [-0.21642052, -0.54393485,  0.        ,  1.41321226,  1.28759542],
       [ 0.34706987, -0.08808553,  0.        ,  1.41321226,  0.26342111],
       [-0.33358189, -0.47022305,  0.        ,  0.49061726,  1.28759542]])

In [35]:
df_4['cluster_DB'].value_counts()

-1    3631
 0     395
 1     177
 2     123
 3     115
 5      77
 4      71
 6      52
 9      34
 8      26
 7      19
Name: cluster_DB, dtype: int64

In [27]:
# Let's visualize all the clusters on a map except all the ones that DBSCAN considered as outliers
fig = px.scatter_mapbox(
        df_4[df_4["cluster_DB"] != -1], 
        lat="Lat", 
        lon="Lon",
        color="cluster_DB",
        mapbox_style="carto-positron"
)

fig.show()

In [22]:
# Let's visualize these clusters on the map hour per hour
fig = px.scatter_mapbox(
        df_4[df_4["cluster_DB"] != -1], 
        lat="Lat", 
        lon="Lon",
        animation_frame="Hours",
        color="cluster_DB",
        mapbox_style="carto-positron"
)

fig.show()

In [31]:
numeric_features = ["Lat", "Lon", "Dayofweek", "Day", "Hours"] # Positions des colonnes quantitatives dans X
numeric_transformer = StandardScaler()

# On combine les transformers dans un ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(df_4.head())
Xx = preprocessor.fit_transform(df) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
            Lat      Lon  Dayofweek  Day  Hours  cluster_DB  cluster_DB_Lat  \
300871  40.7598 -73.9009          4    6      1          -1       40.738695   
15549   40.6488 -73.6778          4   13     20          -1       40.738695   
253102  40.7459 -73.9901          4   27     18          -1       40.738695   
140348  40.7732 -73.9547          4   13     14          -1       40.738695   
74777   40.7659 -73.9889          4    6      4          -1       40.738695   

        cluster_DB_Lon  
300871       -73.97354  
15549        -73.97354  
253102       -73.97354  
140348       -73.97354  
74777        -73.97354  
...Terminé.
[[ 0.0022791  -0.29401485 -0.48152198  1.13391069  1.15701698]
 [-0.06351718 -0.30115127 -0.48152198 -1.33651714 -0.03144908]
 [ 0.31020564 -0.34932211 -0.48152198  0.31043475  0.30811265]
 [-0.16879121 -0.4117658  -1.00139311 -1.45415656  1.49657871]
 [ 0.19966791 -0.13166126 -0.48152198 -0.5130412   0.64767438]]



In [32]:
# Instanciate DBSCAN 
db = DBSCAN(eps=0.3, min_samples=20, metric="manhattan", algorithm="brute")

# Fit on data 
## No need to normalize data, it already is! 
db.fit(Xx)

In [33]:
df["cluster_DB"] = db.labels_
df.head()

Unnamed: 0,Lat,Lon,Dayofweek,Day,Hours,cluster_DB
236067,40.74,-73.9907,2,25,21,3
290585,40.7375,-73.9911,2,4,14,-1
577400,40.7517,-73.9938,2,18,16,0
54257,40.7335,-73.9973,1,3,23,-1
332242,40.7475,-73.9816,2,11,18,15


In [34]:
# Let's visualize all the clusters on a map day per day
fig = px.scatter_mapbox(
        df[df["cluster_DB"] != -1],  
        lat="Lat", 
        lon="Lon",
        animation_frame="Dayofweek",
        color="cluster_DB",
        mapbox_style="carto-positron"
)

fig.show()

In [25]:
# for each db.labels_ value, create a dictionary with the corresponding cluster centroid longitude and latitude
centroid_dict = {i: [df_4[df_4["cluster_DB"] == i]["Lon"].mean(), df_4[df_4["cluster_DB"] == i]["Lat"].mean()] for i in df_4["cluster_DB"].unique()}
# create two columns latitude and longitude for the corresponding cluster centroid
df_4.loc[:, "cluster_DB_Lat"] = df_4["cluster_DB"].map(lambda x: centroid_dict[x][1])
df_4.loc[:, "cluster_DB_Lon"] = df_4["cluster_DB"].map(lambda x: centroid_dict[x][0])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Map of clusters centroids per Day of week
fig = px.scatter_mapbox(df_4[df_4["cluster_DB"] != -1], lat="cluster_DB_Lat", lon="cluster_DB_Lon", color="cluster_DB", zoom=8, mapbox_style="carto-positron", animation_frame="Dayofweek", category_orders={"Hours": range(24)}, height=500, width=800)
fig.show()

In [26]:
# Map of clusters centroids per Hour

fig = px.scatter_mapbox(df_4[df_4["cluster_DB"] != -1], lat="cluster_DB_Lat", lon="cluster_DB_Lon", color="cluster_DB", zoom=8, mapbox_style="carto-positron", animation_frame="Hours", category_orders={"Hours": range(24)}, height=500, width=800)
fig.show()

In [None]:
# Map of clusters centroids per Hour

#fig = px.scatter_mapbox(df_4[df_4["cluster_DB"] != -1], lat="cluster_DB_Lat", lon="cluster_DB_Lon", color="cluster_DB", zoom=8, mapbox_style="carto-positron", animation_frame="Hours", category_orders={"Hours": range(24)}, height=500, width=800)
#fig.show()

In [None]:
# Créer une copie de la dataframe
#df_clustered = df_4.copy()

# Pour chaque cluster, calculer les coordonnées du centroïde
#centroids = df_clustered.groupby('cluster_DB').agg({'Lat': 'mean', 'Lon': 'mean'})

# Créer un dictionnaire de centroïdes
#centroids_dict = {i: [centroids.loc[i, 'Lat'], centroids.loc[i, 'Lon']] for i in centroids.index}

# Ajouter les colonnes cluster_DB_Lat et cluster_DB_Lon à la dataframe
#df_clustered['cluster_DB_Lat'] = df_clustered['cluster_DB'].map(lambda x: centroids_dict[x][0])
#df_clustered['cluster_DB_Lon'] = df_clustered['cluster_DB'].map(lambda x: centroids_dict[x][1])