In [None]:
import pandas as pd
import plotly.express as px
from sklearn.cluster import MiniBatchKMeans
from haversine import haversine, Unit
import plotly.graph_objects as go

# Load the sample; the CSV is semicolon-delimited.
df = pd.read_csv("../data/amostra_total.csv", sep=';')

# Coordinate matrix passed to the clusterer: one (LATITUDE, LONGITUDE) row per record.
coords = df[['LATITUDE', 'LONGITUDE']].to_numpy()

# MiniBatchKMeans settings. The model is asked for n_clusters * total_days
# clusters in total (presumably a per-day cluster count times the number of
# days — confirm with the author).
n_clusters = 150
batch_size = 1000
total_days = 22

kmeans = MiniBatchKMeans(
    n_clusters=n_clusters * total_days,
    batch_size=batch_size,
    random_state=42,  # fixed seed for the estimator's random number generation
)

# Fit the model and tag every row of the frame with its assigned cluster label.
labels = kmeans.fit_predict(coords)
df['Cluster'] = labels

# Cluster centres of the fitted model; each row is (latitude, longitude),
# in the same column order as `coords`.
centroids = kmeans.cluster_centers_

# Row positions of every cluster's members, gathered in one pass over `labels`
# instead of building a fresh boolean mask for each centroid.
member_positions = pd.Series(labels).groupby(labels).indices

# Calculate the number of points and the radius (maximum distance) for each centroid
cluster_info = []
for i, centroid in enumerate(centroids):
    positions = member_positions.get(i)
    # A centroid may end up with no assigned points, in which case it has no
    # entry in the mapping; use an empty slice so the row is still produced.
    cluster_points = coords[positions] if positions is not None else coords[:0]
    num_points = len(cluster_points)
    # default=0.0 stops max() from raising ValueError on an empty cluster.
    # Such clusters are reported with NUM_POINTS == 0 and RADIUS == 0.0, and
    # therefore still count towards any averages taken over df_clusters.
    max_radius = max(
        (haversine(centroid, point, unit=Unit.KILOMETERS) for point in cluster_points),
        default=0.0,
    )
    cluster_info.append([centroid[0], centroid[1], num_points, max_radius])

# Create a DataFrame with the cluster information (one row per centroid)
df_clusters = pd.DataFrame(cluster_info, columns=['LATITUDE', 'LONGITUDE', 'NUM_POINTS', 'RADIUS'])

# Interactive map of the cluster centroids: marker size follows RADIUS and
# marker colour follows NUM_POINTS. The hover box hides the raw coordinates
# and shows only the two cluster statistics.
fig_clusters = px.scatter_mapbox(
    df_clusters,
    lat='LATITUDE',
    lon='LONGITUDE',
    size='RADIUS',
    color='NUM_POINTS',
    mapbox_style="carto-positron",
    zoom=10,
    title="Cluster Centroids on the Map",
    hover_data={
        'LATITUDE': False,
        'LONGITUDE': False,
        'NUM_POINTS': True,
        'RADIUS': True,
    },
).update_traces(marker={'sizemin': 4})  # sizemin: lower bound on the drawn marker size

fig_clusters.show()

# Summary statistics over all clusters: mean radius and mean number of points.
average_radius = df_clusters['RADIUS'].mean()
average_num_points = df_clusters['NUM_POINTS'].mean()

# Show both numbers as indicators placed side by side on a 1x2 grid.
fig_stats = go.Figure(data=[
    go.Indicator(
        mode="number",
        value=average_radius,
        title={"text": "Average Radius of Centroids (km)"},
        domain={'row': 0, 'column': 0},
    ),
    go.Indicator(
        mode="number",
        value=average_num_points,
        title={"text": "Average Number of Points per Cluster"},
        domain={'row': 0, 'column': 1},
    ),
]).update_layout(
    grid={'rows': 1, 'columns': 2, 'pattern': "independent"},
    title="Cluster Statistics",
)

fig_stats.show()


In [None]:
# Histogram of the NUM_POINTS column of the per-cluster summary.
fig_points = px.histogram(
    df_clusters,
    x='NUM_POINTS',
    nbins=50,
    title='Distribution of Points per Cluster',
).update_layout(xaxis_title='Number of Points', yaxis_title='Frequency')
fig_points.show()

# Cluster size against cluster radius, one marker per row of df_clusters.
fig_scatter = px.scatter(
    df_clusters,
    x='NUM_POINTS',
    y='RADIUS',
    title='Points per Cluster vs. Radius per Cluster',
    labels={'NUM_POINTS': 'Number of Points', 'RADIUS': 'Radius (km)'},
).update_layout(xaxis_title='Number of Points', yaxis_title='Radius (km)')
fig_scatter.show()