In [78]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from plotly.subplots import make_subplots

# Read the CSV into `data`, then echo its first rows followed by its shape
# tuple as plain text.
data = pd.read_csv('../data/echantillon.csv')
for preview in (data.head(), data.shape):
    print(preview)

   Unnamed: 0            Date/Time      Lat      Lon    Base        date  \
0           0  2014-04-01 00:11:00  40.7690 -73.9549  B02512  2014-04-01   
1           1  2014-04-01 00:17:00  40.7267 -74.0345  B02512  2014-04-01   
2           2  2014-04-01 00:21:00  40.7316 -73.9873  B02512  2014-04-01   
3           3  2014-04-01 00:28:00  40.7588 -73.9776  B02512  2014-04-01   
4           4  2014-04-01 00:33:00  40.7594 -73.9722  B02512  2014-04-01   

       time  annee  mois  jour  jour_semaine  heure  minutes  secondes  
0  00:11:00   2014     4     1             1      0       11         0  
1  00:17:00   2014     4     1             1      0       17         0  
2  00:21:00   2014     4     1             1      0       21         0  
3  00:28:00   2014     4     1             1      0       28         0  
4  00:33:00   2014     4     1             1      0       33         0  
(564516, 14)


In [79]:
# Restrict the dataset to a single slot (jour_semaine == 1, heure == 12) and
# cluster its Lat/Lon coordinates with KMeans on standardized values.
print('Reduction du dataset sur une période :')
data_reduc = data[(data["jour_semaine"] == 1) & (data["heure"] == 12)].copy()
print(data_reduc.shape)

# Keep the raw coordinates under their own name so that `lat_lon` only ever
# holds the standardized array (the elbow-search cell fits on `lat_lon`).
coords = data_reduc[['Lat','Lon']]
sc = StandardScaler()
lat_lon = sc.fit_transform(coords)

# n_init is passed explicitly, matching the elbow-search cell: scikit-learn
# 1.2-1.3 emit a FutureWarning when it is omitted, and both cells now build
# their models with the same initialisation policy.
# NOTE(review): 15 clusters is hard-coded here while the notebook later
# concludes "12 à 13" - confirm which value is intended.
kmeans = KMeans(n_clusters=15, random_state=0, n_init='auto')
cluster = kmeans.fit_predict(lat_lon)
print(len(cluster))
# One label per row of data_reduc, in row order.
data_reduc["kmeans_cluster"] = cluster
print(data_reduc.head())

Reduction du dataset sur une période :
(2819, 14)
2819
     Unnamed: 0            Date/Time      Lat      Lon    Base        date  \
344         344  2014-04-01 12:02:00  40.7852 -74.0220  B02512  2014-04-01   
345         345  2014-04-01 12:02:00  40.7852 -74.0220  B02512  2014-04-01   
346         346  2014-04-01 12:04:00  40.6878 -74.1817  B02512  2014-04-01   
347         347  2014-04-01 12:07:00  40.6864 -73.9747  B02512  2014-04-01   
348         348  2014-04-01 12:10:00  40.8289 -73.9451  B02512  2014-04-01   

         time  annee  mois  jour  jour_semaine  heure  minutes  secondes  \
344  12:02:00   2014     4     1             1     12        2         0   
345  12:02:00   2014     4     1             1     12        2         0   
346  12:04:00   2014     4     1             1     12        4         0   
347  12:07:00   2014     4     1             1     12        7         0   
348  12:10:00   2014     4     1             1     12       10         0   

     kmeans_cluster

In [80]:
# Cluster ids are categorical labels, not quantities: cast them to strings for
# the plot so Plotly Express draws a discrete legend instead of a continuous
# colour bar. `data_reduc` itself is left untouched (assign returns a copy).
cluster_labels = data_reduc["kmeans_cluster"].astype(str)
fig = px.scatter_map(
    data_reduc.assign(kmeans_cluster=cluster_labels),
    lat="Lat",
    lon="Lon",
    color="kmeans_cluster",
    # Legend entries in numeric order rather than order of first appearance.
    category_orders={"kmeans_cluster": sorted(cluster_labels.unique(), key=int)},
    # 26-colour palette: the default qualitative sequence has only 10 colours
    # and would repeat for the 15 clusters fitted earlier.
    color_discrete_sequence=px.colors.qualitative.Alphabet,
    width=850,
)
fig.show()

In [81]:
print("optimisation du nombre de cluster")
# Based on the JEDHA course template.
# Elbow search: fit one KMeans per candidate cluster count on `lat_lon` and
# record its inertia (reported below as WCSS).
k = list(range(1, 20))
wcss = []
for n_clusters in k:
    kmeans_test = KMeans(n_clusters=n_clusters, random_state=0, n_init='auto').fit(lat_lon)
    wcss.append(kmeans_test.inertia_)
    print("WCSS for K={} --> {}".format(n_clusters, kmeans_test.inertia_))

optimisation du nombre de cluster
WCSS for K=1 --> 5638.000000000001
WCSS for K=2 --> 4443.311817175586
WCSS for K=3 --> 3002.9804407774454
WCSS for K=4 --> 2313.123908599363
WCSS for K=5 --> 1745.8744288096686
WCSS for K=6 --> 1434.85393657962
WCSS for K=7 --> 1300.7004136823925
WCSS for K=8 --> 1072.4218538367206
WCSS for K=9 --> 983.9160860299153
WCSS for K=10 --> 836.2814272638458
WCSS for K=11 --> 700.9265585668861
WCSS for K=12 --> 634.6178493489996
WCSS for K=13 --> 568.1687908892231
WCSS for K=14 --> 542.2607734762923
WCSS for K=15 --> 453.3491833908372
WCSS for K=16 --> 413.3268145994058
WCSS for K=17 --> 385.3248658893364
WCSS for K=18 --> 368.20920172057674
WCSS for K=19 --> 345.51086750639854


In [82]:
# Author's conclusion on the number of clusters, presumably read off the WCSS
# values printed by the elbow search.
# NOTE(review): the KMeans model fitted earlier in this notebook uses
# n_clusters=15, not the 12-13 stated here - confirm which value is intended.
print('12 à 13 clusters possibles')

12 à 13 clusters possibles


On essaye de voir les Hot Zones en fonction du jour de la semaine

In [None]:
# For each weekday, take the heure == 12 rows, assign them to the clusters of
# the KMeans model fitted earlier in the notebook, and draw one map per day.
# `sc` and `kmeans` are only used for transform/predict here (no refit), so a
# given cluster id refers to the same centroid on every map.
days = dict()
for i in data['jour_semaine'].value_counts().index.sort_values():
    datajour = data[(data["jour_semaine"] == i) & (data["heure"] == 12)].copy()
    lat_lon_jour = sc.transform(datajour[['Lat','Lon']])
    cluster = kmeans.predict(lat_lon_jour)
    datajour['cluster'] = cluster
    days[f"jour_semaine{i}"] = datajour

# One fixed colour per cluster id, shared by all maps. The 26-colour palette
# avoids the repeats the default 10-colour sequence would give.
palette = px.colors.qualitative.Alphabet
cluster_colors = {str(c): palette[c % len(palette)] for c in range(kmeans.n_clusters)}

# NOTE(review): this subplot grid is never used below (each map is shown as its
# own figure) and its row count is hard-coded - remove it or wire it up.
subplot = make_subplots(rows=7,cols=1)
# NOTE(review): titles number the maps 1..N in sorted order, whereas the dict
# keys carry the raw jour_semaine value - confirm the intended numbering.
for index, (day_key, day_frame) in enumerate(days.items(), start=1):
    # Cluster ids are labels, not quantities: plot them as strings so the
    # legend is discrete. The frames stored in `days` keep their integer ids.
    points = day_frame[['cluster','Lat','Lon']].astype({'cluster': str})
    # Named `fig_jour` rather than `map` so the builtin map() is not shadowed
    # for the rest of the notebook.
    fig_jour = px.scatter_map(
        points,
        lat='Lat',
        lon='Lon',
        color='cluster',
        color_discrete_map=cluster_colors,
        category_orders={'cluster': list(cluster_colors)},
        width=850,
        zoom=10,
        title=f"jour de semaine n°{index}",
    )
    fig_jour.show()