In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.neighbors import NearestNeighbors

In [None]:
# Load the sample and keep only the columns used by the clustering steps
data = pd.read_csv("../data_source/amostra_total.csv", sep=';').loc[
    :, ["INDICE", "LATITUDE", "LONGITUDE", "LOGRADOURO", "NUMERO"]
]

In [None]:
# Parameters
# Number of first-level clusters requested from MiniBatchKMeans.
first_n_clusters = 42
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
subclusters_per_leiturista = 22
# Lower / upper size bounds handed to adjust_clusters_locally.
min_points_per_cluster, max_points_per_cluster = 350, 450

In [None]:
# Initial clustering: one label per row, stored in the LEITURISTA column
kmeans = MiniBatchKMeans(random_state=8081, n_clusters=first_n_clusters)
data['LEITURISTA'] = kmeans.fit(data[['LATITUDE', 'LONGITUDE']]).labels_

In [None]:
# Sub-clustering with local adjustment
def adjust_clusters_locally(data, min_points, max_points):
    """Split oversized LEITURISTA groups into subclusters and merge undersized parts.

    For every distinct value found in ``data['LEITURISTA']`` (in sorted order):

    * a group with at most ``max_points`` rows becomes one subcluster;
    * a larger group is split with ``MiniBatchKMeans`` into
      ``ceil(len(group) / max_points)`` parts. Parts with at least
      ``min_points`` rows each receive their own subcluster id. Every row of a
      smaller part takes the subcluster id of the nearest row (Euclidean
      distance on LATITUDE/LONGITUDE) that belongs to an accepted part of the
      same group. When no part of the group reaches ``min_points`` there is
      nothing to merge into, so each part keeps its own id.

    Merging can push an accepted subcluster above ``max_points``; no further
    rebalancing is attempted.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'LEITURISTA', 'LATITUDE' and 'LONGITUDE'.
        A 'SUBCLUSTER' column is created (filled with -1) when missing.
    min_points : int
        Minimum number of rows for a split part to stand as its own subcluster.
    max_points : int
        Maximum number of rows a LEITURISTA group may have before it is split.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, modified in place, with 'SUBCLUSTER' filled
        with consecutive integer ids starting at 0.
    """
    coord_cols = ['LATITUDE', 'LONGITUDE']
    if 'SUBCLUSTER' not in data.columns:
        data['SUBCLUSTER'] = -1

    subcluster_id = 0
    # Iterate over the labels actually present instead of a notebook-level global.
    for leiturista in np.sort(data['LEITURISTA'].unique()):
        group = data[data['LEITURISTA'] == leiturista]

        if len(group) <= max_points:
            data.loc[group.index, 'SUBCLUSTER'] = subcluster_id
            subcluster_id += 1
            continue

        n_parts = int(np.ceil(len(group) / max_points))
        splitter = MiniBatchKMeans(n_clusters=n_parts, random_state=8081)
        labels = np.asarray(splitter.fit_predict(group[coord_cols]))

        part_labels, part_sizes = np.unique(labels, return_counts=True)

        # Global subcluster id for each row of `group`; -1 means "not decided yet".
        assigned = np.full(len(group), -1, dtype=np.int64)

        # Accept the large-enough parts first so that small parts always have
        # candidates to merge into, whatever their label order.
        for label in part_labels[part_sizes >= min_points]:
            assigned[labels == label] = subcluster_id
            subcluster_id += 1

        pending = assigned == -1
        if pending.any():
            if (~pending).any():
                coords = group[coord_cols].to_numpy()
                anchor_ids = assigned[~pending]
                nearest, _ = pairwise_distances_argmin_min(
                    coords[pending], coords[~pending]
                )
                assigned[pending] = anchor_ids[nearest]
            else:
                # No accepted part in this group: keep every part as its own subcluster.
                for label in part_labels:
                    assigned[labels == label] = subcluster_id
                    subcluster_id += 1

        data.loc[group.index, 'SUBCLUSTER'] = assigned

    return data

In [None]:
# Adjust clusters locally, starting from a frame whose SUBCLUSTER is reset to -1
data = adjust_clusters_locally(
    data=data.assign(SUBCLUSTER=-1),
    min_points=min_points_per_cluster,
    max_points=max_points_per_cluster,
)

In [None]:
# Map a point count to its size band
def categorize_points(count):
    """Return the label of the size band that contains ``count``.

    Bands are checked in order and both bounds are inclusive. ``None`` is
    returned when ``count`` lies in no band (for example a negative value, or
    a non-integer value that falls between two bands).
    """
    bands = (
        ('Muito Abaixo (menos de 100)', 0, 99),
        ('Abaixo (100-349)', 100, 349),
        ('Dentro da Média (350-450)', 350, 450),
        ('Acima (451-800)', 451, 800),
        ('Muito Acima (mais de 800)', 801, float('inf')),
    )
    return next(
        (label for label, lower, upper in bands if lower <= count <= upper),
        None,
    )

In [None]:
# Compute the statistics: one record per subcluster with its size and size band
subcluster_counts = data['SUBCLUSTER'].value_counts()
stats = [
    {'SUBCLUSTER': label, 'COUNT': size, 'CATEGORY': categorize_points(size)}
    for label, size in subcluster_counts.items()
]

stats_df = pd.DataFrame(stats)

In [None]:
# Number of subclusters in each size band
category_counts = {
    label: int(total)
    for label, total in stats_df['CATEGORY'].value_counts().items()
}

In [None]:
# Add the size bands that did not occur, with a count of zero
intervals = {
    'Muito Abaixo (menos de 100)': (0, 99),
    'Abaixo (100-349)': (100, 349),
    'Dentro da Média (350-450)': (350, 450),
    'Acima (451-800)': (451, 800),
    'Muito Acima (mais de 800)': (801, float('inf'))
}

for category in intervals:
    # setdefault leaves existing counts untouched and inserts 0 otherwise.
    category_counts.setdefault(category, 0)

In [None]:
# Header line followed by one "<category>: <count>" line per size band
print(
    "Contagem de subclusters dentro de cada intervalo após redistribuição:",
    *(f"{category}: {count}" for category, count in category_counts.items()),
    sep="\n",
)

In [None]:
# Save the per-subcluster statistics to a CSV file (row index omitted)
stats_df.to_csv(
    path_or_buf=f'../cluster/{first_n_clusters}_clusters_stats_adjusted_with_categories.csv',
    index=False,
)

In [None]:
# Plot the adjusted clusters
def plot_clusters_sns(df, num_clusters, cluster_col, title, filename):
    """Scatter-plot the points coloured by ``cluster_col`` and save the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'LONGITUDE', 'LATITUDE' and ``cluster_col``.
    num_clusters : int
        Palette size used only when ``cluster_col`` has no non-null values.
        Otherwise the palette is sized from the data (see below).
    cluster_col : str
        Column whose values select the point colour.
    title : str
        Figure title.
    filename : str
        Path the figure is written to.

    The palette length is taken from the number of distinct labels in
    ``cluster_col``. Depending on the seaborn version, a list palette whose
    length differs from the number of hue levels is either rejected or cycled
    with a warning, so a caller-supplied count that does not match the column
    must not drive the palette.
    """
    n_levels = df[cluster_col].nunique()
    palette = sns.color_palette("hsv", n_levels if n_levels > 0 else num_clusters)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        data=df,
        x='LONGITUDE',
        y='LATITUDE',
        hue=cluster_col,
        palette=palette,
        s=50,
        legend=False,  # documented way to suppress the legend
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    fig.savefig(filename)
    # Close the figure explicitly so repeated calls do not accumulate open figures.
    plt.close(fig)

In [None]:
# The plot is coloured by SUBCLUSTER, which can have more distinct values than
# first_n_clusters once groups are split, so the palette size is taken from the
# column itself.
plot_clusters_sns(
    data,
    data['SUBCLUSTER'].nunique(),
    'SUBCLUSTER',
    f'{first_n_clusters} Clusters Ajustados',
    f'../cluster/{first_n_clusters}_subclusters_ajustados.png',
)