Project of unsupervised machine learning for the Jedha certinfication, about Uber pickups.

Author : Youenn PATAT

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/5/58/Uber_logo_2018.svg/1024px-Uber_logo_2018.svg.png" alt="UBER LOGO" width="50%" />

# 1) Checking the data

In [74]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import  silhouette_score, davies_bouldin_score

In [75]:
df = pd.read_csv("uber-trip-data/uber-raw-data-aug14.csv")
print(df.shape)
df.head()

(136970, 4)


Unnamed: 0,Date/Time,Lat,Lon,Base
0,8/1/2014 0:03:00,40.7366,-73.9906,B02512
1,8/1/2014 0:09:00,40.726,-73.9918,B02512
2,8/1/2014 0:12:00,40.7209,-74.0507,B02512
3,8/1/2014 0:12:00,40.7387,-73.9856,B02512
4,8/1/2014 0:12:00,40.7323,-74.0077,B02512


In [76]:
print("Number of rows :", df.shape[0])

print("Some statistics :")
display(df.describe(include="all"))

print("% of missing values :")
display(100*df.isnull().sum()/df.shape[0])

Number of rows : 136970
Some statistics :


Unnamed: 0,Date/Time,Lat,Lon,Base
count,136970,136970.0,136970.0,136969
unique,29589,,,2
top,8/12/2014 18:32:00,,,B02598
freq,30,,,105497
mean,,40.738651,-73.972619,
std,,0.047322,0.063382,
min,,39.6569,-74.7737,
25%,,40.7205,-73.9967,
50%,,40.7422,-73.9835,
75%,,40.7611,-73.9649,


% of missing values :


Unnamed: 0,0
Date/Time,0.0
Lat,0.0
Lon,0.0
Base,0.00073


# 2) Preprocessing

In [77]:
data_aug = df

In [78]:
data_aug["Date/Time"] = pd.to_datetime(data_aug["Date/Time"], format="%m/%d/%Y %H:%M:%S")

data_aug["Years"] = data_aug["Date/Time"].dt.year
data_aug["Months"] = data_aug["Date/Time"].dt.month
data_aug["Day"] = data_aug["Date/Time"].dt.day
data_aug["DayOfWeek"] = data_aug["Date/Time"].dt.dayofweek
data_aug["Hours"] = data_aug["Date/Time"].dt.hour
data_aug["Minutes"] = data_aug["Date/Time"].dt.minute
data_aug = data_aug.drop("Date/Time", axis=1)

data_aug.head()

Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes
0,40.7366,-73.9906,B02512,2014,8,1,4,0,3
1,40.726,-73.9918,B02512,2014,8,1,4,0,9
2,40.7209,-74.0507,B02512,2014,8,1,4,0,12
3,40.7387,-73.9856,B02512,2014,8,1,4,0,12
4,40.7323,-74.0077,B02512,2014,8,1,4,0,12


In [79]:
data_aug["Day"].value_counts()

Unnamed: 0_level_0,count
Day,Unnamed: 1_level_1
1,10734
7,10575
6,9901
8,9806
12,9326
13,9283
2,9270
5,8791
9,8310
4,7969


In [80]:
data_aug["Hours"].value_counts()

Unnamed: 0_level_0,count
Hours,Unnamed: 1_level_1
17,10115
18,9828
16,9362
19,8638
20,8382
15,8254
21,8217
14,7126
22,7074
13,6015


In [81]:
#Selection of data to see with only 1 day at a specific hour
data_aug_selected = data_aug[ (data_aug["Years"] == 2014) & (data_aug["Months"] == 8) & (data_aug["Day"] == 7) & (data_aug["Hours"] == 17)]
print(data_aug_selected.shape[0])
data_aug_selected.head()

806


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes
7714,40.7793,-73.9554,B02512,2014,8,7,3,17,0
7715,40.7634,-73.9806,B02512,2014,8,7,3,17,0
7716,40.7624,-73.9855,B02512,2014,8,7,3,17,2
7717,40.7427,-73.9867,B02512,2014,8,7,3,17,3
7718,40.7548,-73.9886,B02512,2014,8,7,3,17,3


In [82]:
numeric_features_no_change = [0, 1]
numeric_transformer_no_change = Pipeline(steps=[
   ('passthrough', FunctionTransformer(lambda x: x))
])

#Here we only needed one preprocessing, that doesn't change the geo num values
#The values due to dates and time are not needed in the model
#The Base code creates to much categorical column in preprocessing to handle it with clustering model

preprocessor = ColumnTransformer(
    transformers=[
        ('num_no_change', numeric_transformer_no_change, numeric_features_no_change),
    ])

# Preprocessings on dataset
print("Preprocessing sur le train set...")
print(data_aug_selected.head())
X = preprocessor.fit_transform(data_aug_selected) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
          Lat      Lon    Base  Years  Months  Day  DayOfWeek  Hours  Minutes
7714  40.7793 -73.9554  B02512   2014       8    7          3     17        0
7715  40.7634 -73.9806  B02512   2014       8    7          3     17        0
7716  40.7624 -73.9855  B02512   2014       8    7          3     17        2
7717  40.7427 -73.9867  B02512   2014       8    7          3     17        3
7718  40.7548 -73.9886  B02512   2014       8    7          3     17        3
...Terminé.
[[ 40.7793 -73.9554]
 [ 40.7634 -73.9806]
 [ 40.7624 -73.9855]
 [ 40.7427 -73.9867]
 [ 40.7548 -73.9886]]



# 3) KMeans for 1 day at specific hour

In [84]:
wcss =  []
sil = []
for i in range (2,11):
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    sil.append(silhouette_score(X, kmeans.predict(X)))

print(wcss)
print(sil)

[1.6719765986107098, 1.2571042385232445, 1.1678999458099117, 0.7929704438278673, 0.7372641996919206, 0.5101546656213702, 0.37701231882683195, 0.3104939263180952, 0.24706012214481934]
[0.7551344819372698, 0.46076576935672875, 0.6857499870785713, 0.4930204670432985, 0.44267084399208206, 0.5119693524820425, 0.48076865341071845, 0.4130569720080754, 0.43552700671719335]


In [85]:
fig = px.line(x = range(2,11), y = wcss, height=600, width=800)
fig.show()


In [86]:
fig = px.bar(x = range(2,11), y = sil, height=600, width=800)
fig.show()

In [87]:
kmeans = KMeans(n_clusters= 4)
kmeans.fit(X)

In [89]:
data_aug_selected.loc[:,'Cluster_KMeans'] = kmeans.predict(X)
data_aug_selected.head()

Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans
7714,40.7793,-73.9554,B02512,2014,8,7,3,17,0,3
7715,40.7634,-73.9806,B02512,2014,8,7,3,17,0,3
7716,40.7624,-73.9855,B02512,2014,8,7,3,17,2,3
7717,40.7427,-73.9867,B02512,2014,8,7,3,17,3,3
7718,40.7548,-73.9886,B02512,2014,8,7,3,17,3,3


In [90]:
px.scatter_mapbox(
    data_aug_selected,
    lat="Lat",
    lon="Lon",
    color="Cluster_KMeans",
    mapbox_style="carto-positron",
    zoom=10,
    height=600,
    width=800
)

# 4) DBSCAN for 1 day at specific hour

In [91]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(X)
distances, indices = nbrs.kneighbors(X)
distances = np.sort(distances, axis=0)
distances = distances[:,1]
px.line(distances, height=600, width=800)

In [155]:
davies_bouldin =  []
sil = []
for i in np.arange(0.001,0.1, 0.001):
    db = DBSCAN(eps=i, min_samples=4, metric="euclidean")
    db.fit(X)
    davies_bouldin.append(davies_bouldin_score(X, db.labels_))
    sil.append(silhouette_score(X, db.labels_))

print(davies_bouldin)
print(sil)

[2.3404241863540776, 1.6558306248694097, 1.8008343803967208, 1.6817583470681063, 1.286515783459205, 1.572166160631851, 2.2665151424984575, 2.524637897915719, 2.5981942019524764, 2.4317709178384592, 2.3040451879440407, 1.6501952234356583, 1.9111690552518554, 1.8411848888541364, 1.8411848888541364, 1.7990994530783269, 1.9230564192907031, 1.9230564192907031, 1.8817542913836018, 2.244616485055002, 2.4456513171745553, 2.473564681135005, 2.607804246737779, 2.5661013061320497, 2.5948585883467614, 2.8464043560867216, 2.8464043560867216, 2.8464043560867216, 2.8464043560867216, 2.701788025911451, 2.701788025911451, 2.701788025911451, 2.701788025911451, 2.0424273805875717, 2.0424273805875717, 2.2052847557096835, 1.66430852979407, 1.3604490101397204, 1.3687615336342323, 1.3687615336342323, 1.3687615336342323, 0.7058070124561059, 0.7058070124561059, 0.7058070124561059, 0.7058070124561059, 0.7058070124561059, 0.7058070124561059, 0.7058070124561059, 0.5810674771615706, 0.5810674771615706, 0.581067477

In [156]:

fig = px.line(x = np.arange(0.001,0.1, 0.001), y = davies_bouldin, height=600, width=800)
fig.show()


In [157]:
fig = px.bar(x = np.arange(0.001,0.1, 0.001), y = sil, height=600, width=800)
fig.show()

In [158]:
db = DBSCAN(eps=0.05, min_samples=4, metric="euclidean")
# Due to the 2 previous graphs, we choose eps = 0.05 to have a best silouhette score and davies bouldin score
# min_sample = 4 because it is better to choose (2*nb of features) here lat and lon
db.fit(X)

In [160]:
data_aug_selected["Cluster_DBSCAN"] = db.labels_
data_aug_selected.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans,Cluster_DBSCAN
7714,40.7793,-73.9554,B02512,2014,8,7,3,17,0,3,0
7715,40.7634,-73.9806,B02512,2014,8,7,3,17,0,3,0
7716,40.7624,-73.9855,B02512,2014,8,7,3,17,2,3,0
7717,40.7427,-73.9867,B02512,2014,8,7,3,17,3,3,0
7718,40.7548,-73.9886,B02512,2014,8,7,3,17,3,3,0


In [161]:
# silouhette score, nearest 1 is better
silhouette = silhouette_score(X, db.labels_) if len(set(db.labels_)) > 1 else -1
print(f"Silhouette Score : {silhouette}")

# Davies Bouldin score, nearest 0 is better
davies_bouldin = davies_bouldin_score(X, db.labels_)
print(f"Davies-Bouldin Score : {davies_bouldin}")

Silhouette Score : 0.7509334492298034
Davies-Bouldin Score : 0.5810674771615706


In [164]:
px.scatter_mapbox(
    data_aug_selected[data_aug_selected.Cluster_DBSCAN != -1],
    lat="Lat",
    lon="Lon",
    color="Cluster_DBSCAN",
    mapbox_style="carto-positron",
    zoom=9,
    height=600,
    width=800
)

# 5) Generalization for the DayOfWeek

In [102]:
# dictionnary creation to stock the dataframe corresponding to each day of week
dict_df_per_day = {}

for day in data_aug["DayOfWeek"].unique():
    dict_df_per_day[f"data_aug_day_{day}"] = data_aug[(data_aug["DayOfWeek"] == day)]


In [103]:
numeric_features_no_change = [0, 1]
numeric_transformer_no_change = Pipeline(steps=[
   ('passthrough', FunctionTransformer(lambda x: x))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num_no_change', numeric_transformer_no_change, numeric_features_no_change),
    ])


X_dict = {}

for i in range(0, 7):
    # Preprocessings sur le dataset
    print(f"Preprocessing pour le DayOfWeek {i}...")
    print(dict_df_per_day[f"data_aug_day_{i}"].head())
    X_dict[f"X_{i}"] = preprocessor.fit_transform(dict_df_per_day[f"data_aug_day_{i}"]) # fit_transform !!
    print('...Terminé.')

Preprocessing pour le DayOfWeek 0...
          Lat      Lon    Base  Years  Months  Day  DayOfWeek  Hours  Minutes
3318  40.7277 -73.9981  B02512   2014       8    4          0      0        3
3319  40.7415 -73.9994  B02512   2014       8    4          0      0        3
3320  40.7554 -73.9870  B02512   2014       8    4          0      0       10
3321  40.8297 -73.9479  B02512   2014       8    4          0      0       10
3322  40.7203 -73.9980  B02512   2014       8    4          0      0       14
...Terminé.
Preprocessing pour le DayOfWeek 1...
          Lat      Lon    Base  Years  Months  Day  DayOfWeek  Hours  Minutes
4463  40.7640 -73.9731  B02512   2014       8    5          1      0        9
4464  40.9872 -74.1832  B02512   2014       8    5          1      0       12
4465  40.7271 -73.9802  B02512   2014       8    5          1      0       14
4466  40.6407 -73.9189  B02512   2014       8    5          1      0       18
4467  40.7244 -73.9906  B02512   2014       8    5      

## With KMeans

In [106]:
for i in range(7):

    kmeans = KMeans(n_clusters= 4)
    kmeans.fit(X_dict[f"X_{i}"])

    dict_df_per_day[f"data_aug_day_{i}"].loc[:,'Cluster_KMeans'] = kmeans.predict(X_dict[f"X_{i}"])
    display(dict_df_per_day[f"data_aug_day_{i}"].head())

Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans
3318,40.7277,-73.9981,B02512,2014,8,4,0,0,3,3
3319,40.7415,-73.9994,B02512,2014,8,4,0,0,3,3
3320,40.7554,-73.987,B02512,2014,8,4,0,0,10,3
3321,40.8297,-73.9479,B02512,2014,8,4,0,0,10,1
3322,40.7203,-73.998,B02512,2014,8,4,0,0,14,3


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans
4463,40.764,-73.9731,B02512,2014,8,5,1,0,9,3
4464,40.9872,-74.1832,B02512,2014,8,5,1,0,12,3
4465,40.7271,-73.9802,B02512,2014,8,5,1,0,14,0
4466,40.6407,-73.9189,B02512,2014,8,5,1,0,18,0
4467,40.7244,-73.9906,B02512,2014,8,5,1,0,32,0


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans
5630,40.7408,-74.0054,B02512,2014,8,6,2,0,15,0
5631,40.6948,-74.1778,B02512,2014,8,6,2,0,19,3
5632,41.0384,-73.7588,B02512,2014,8,6,2,0,20,1
5633,40.7618,-73.9746,B02512,2014,8,6,2,0,22,2
5634,40.7166,-73.9037,B02512,2014,8,6,2,0,37,2


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans
6944,40.7455,-73.9895,B02512,2014,8,7,3,0,3,1
6945,40.7499,-74.0025,B02512,2014,8,7,3,0,9,1
6946,40.7417,-74.0037,B02512,2014,8,7,3,0,10,0
6947,40.6416,-73.7879,B02512,2014,8,7,3,0,16,3
6948,40.7008,-73.9405,B02512,2014,8,7,3,0,16,0


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans
0,40.7366,-73.9906,B02512,2014,8,1,4,0,3,0
1,40.726,-73.9918,B02512,2014,8,1,4,0,9,0
2,40.7209,-74.0507,B02512,2014,8,1,4,0,12,3
3,40.7387,-73.9856,B02512,2014,8,1,4,0,12,0
4,40.7323,-74.0077,B02512,2014,8,1,4,0,12,0


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans
1342,40.7402,-74.0058,B02512,2014,8,2,5,0,1,0
1343,40.7439,-73.9836,B02512,2014,8,2,5,0,4,0
1344,40.7422,-74.0042,B02512,2014,8,2,5,0,4,0
1345,40.7459,-73.9919,B02512,2014,8,2,5,0,5,0
1346,40.6955,-74.1781,B02512,2014,8,2,5,0,9,0


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans
2404,40.7488,-74.0273,B02512,2014,8,3,6,0,3,0
2405,40.7714,-73.9821,B02512,2014,8,3,6,0,4,0
2406,40.7567,-73.9887,B02512,2014,8,3,6,0,6,0
2407,40.7176,-73.9576,B02512,2014,8,3,6,0,7,0
2408,40.7198,-73.9953,B02512,2014,8,3,6,0,8,0


In [165]:
def plot_map_kmeans(i):
    fig = px.scatter_mapbox(
        dict_df_per_day[f"data_aug_day_{i}"],
        lat="Lat",
        lon="Lon",
        color="Cluster_KMeans",
        mapbox_style="carto-positron",
        zoom=9,
        height=600,
        width=800,
        title=f"Clustering map of hot-zones for the DayOfWeek {i}"
    )

    fig.show()


In [166]:
plot_map_kmeans(0)

In [167]:
plot_map_kmeans(1)

In [168]:
plot_map_kmeans(2)

In [169]:
plot_map_kmeans(3)

In [170]:
plot_map_kmeans(4)

In [171]:
plot_map_kmeans(5)

In [172]:
plot_map_kmeans(6)

## With DBSCAN

In [173]:
for i in range(7):

    db = DBSCAN(eps=0.05, min_samples=4, metric="euclidean")
    db.fit(X_dict[f"X_{i}"])

    dict_df_per_day[f"data_aug_day_{i}"].loc[:,'Cluster_DBSCAN'] = db.labels_
    display(data_aug_selected.head())

Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans,Cluster_DBSCAN
7714,40.7793,-73.9554,B02512,2014,8,7,3,17,0,3,0
7715,40.7634,-73.9806,B02512,2014,8,7,3,17,0,3,0
7716,40.7624,-73.9855,B02512,2014,8,7,3,17,2,3,0
7717,40.7427,-73.9867,B02512,2014,8,7,3,17,3,3,0
7718,40.7548,-73.9886,B02512,2014,8,7,3,17,3,3,0


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans,Cluster_DBSCAN
7714,40.7793,-73.9554,B02512,2014,8,7,3,17,0,3,0
7715,40.7634,-73.9806,B02512,2014,8,7,3,17,0,3,0
7716,40.7624,-73.9855,B02512,2014,8,7,3,17,2,3,0
7717,40.7427,-73.9867,B02512,2014,8,7,3,17,3,3,0
7718,40.7548,-73.9886,B02512,2014,8,7,3,17,3,3,0


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans,Cluster_DBSCAN
7714,40.7793,-73.9554,B02512,2014,8,7,3,17,0,3,0
7715,40.7634,-73.9806,B02512,2014,8,7,3,17,0,3,0
7716,40.7624,-73.9855,B02512,2014,8,7,3,17,2,3,0
7717,40.7427,-73.9867,B02512,2014,8,7,3,17,3,3,0
7718,40.7548,-73.9886,B02512,2014,8,7,3,17,3,3,0


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans,Cluster_DBSCAN
7714,40.7793,-73.9554,B02512,2014,8,7,3,17,0,3,0
7715,40.7634,-73.9806,B02512,2014,8,7,3,17,0,3,0
7716,40.7624,-73.9855,B02512,2014,8,7,3,17,2,3,0
7717,40.7427,-73.9867,B02512,2014,8,7,3,17,3,3,0
7718,40.7548,-73.9886,B02512,2014,8,7,3,17,3,3,0


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans,Cluster_DBSCAN
7714,40.7793,-73.9554,B02512,2014,8,7,3,17,0,3,0
7715,40.7634,-73.9806,B02512,2014,8,7,3,17,0,3,0
7716,40.7624,-73.9855,B02512,2014,8,7,3,17,2,3,0
7717,40.7427,-73.9867,B02512,2014,8,7,3,17,3,3,0
7718,40.7548,-73.9886,B02512,2014,8,7,3,17,3,3,0


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans,Cluster_DBSCAN
7714,40.7793,-73.9554,B02512,2014,8,7,3,17,0,3,0
7715,40.7634,-73.9806,B02512,2014,8,7,3,17,0,3,0
7716,40.7624,-73.9855,B02512,2014,8,7,3,17,2,3,0
7717,40.7427,-73.9867,B02512,2014,8,7,3,17,3,3,0
7718,40.7548,-73.9886,B02512,2014,8,7,3,17,3,3,0


Unnamed: 0,Lat,Lon,Base,Years,Months,Day,DayOfWeek,Hours,Minutes,Cluster_KMeans,Cluster_DBSCAN
7714,40.7793,-73.9554,B02512,2014,8,7,3,17,0,3,0
7715,40.7634,-73.9806,B02512,2014,8,7,3,17,0,3,0
7716,40.7624,-73.9855,B02512,2014,8,7,3,17,2,3,0
7717,40.7427,-73.9867,B02512,2014,8,7,3,17,3,3,0
7718,40.7548,-73.9886,B02512,2014,8,7,3,17,3,3,0


In [176]:
def plot_map_dbscan(i):
    fig = px.scatter_mapbox(
        dict_df_per_day[f"data_aug_day_{i}"][dict_df_per_day[f"data_aug_day_{i}"].Cluster_DBSCAN != -1],
        lat="Lat",
        lon="Lon",
        color="Cluster_DBSCAN",
        mapbox_style="carto-positron",
        zoom=8,
        height=600,
        width=800,
        title=f"Clustering map of hot-zones for the DayOfWeek {i}"
    )
    fig.show()


In [177]:
plot_map_dbscan(0)

In [178]:
plot_map_dbscan(1)

In [179]:
plot_map_dbscan(2)

In [180]:
plot_map_dbscan(3)

In [181]:
plot_map_dbscan(4)

In [182]:
plot_map_dbscan(5)

In [183]:
plot_map_dbscan(6)

# 6) Conclusion

The 2 methods give clusters. But the Kmeans methode gives more distinguish cluster, the more there are points in one cluster the more it is a hot zone for the day of week. The DBSCAN method does one big cluster in the middle (due to the method based on the density of point) and few little clusters further.

In my opinion, for this project, I think the KMeans method is better to clustering zones to know where there is an high needing in uber cars.