In [5]:
from clustering_utils import run_full_benchmark
from clustering_utils import prepare_scaled_df
from clustering_utils.reporting import (
    export_top_cluster_reports,
    export_single_cluster_report
)
from clustering_utils import generate_all_cluster_plots
from clustering_utils import generate_top_cluster_visuals
from itertools import combinations, product
import pandas as pd

df = pd.read_csv("/Users/marksonrebelomarcolino/Downloads/spg_uf_selec_censo_2023.csv", low_memory=False)
#df = pd.read_parquet("/Users/marksonrebelomarcolino/Documents/gestao_presente/2025/Fevereiro/artigo/base_final_sgp_censo_artigo.parquet")

# 1. Remover colunas com mais de 50% de valores nulos
threshold = 0.1
null_percentages = df.isnull().mean()
cols_to_drop = null_percentages[null_percentages > threshold].index.tolist()
df_cleaned = df.drop(columns=cols_to_drop)
# 1. Carregar apenas colunas numéricas do df_cleaned
numeric_df = df_cleaned.select_dtypes(include=["number"])

# 2. Remover linhas com valores nulos
df_final = numeric_df.dropna()

# Definir colunas
colunas_para_normalizar = [
    'pp_AbsPequeno', 'pp_AbsModerado', 'pp_AbsSignificativo', 'pp_AbsAlto', 'pp_AbsSevero',
    'Estrutura', 'PED', 'LibrasBraile', 'TratamentoLixo', 'PCD',
    'Acessibilidade', 'Internet', 'Transporte', 'TP_LOCALIZACAO'
]

colunas_sem_normalizar = [
    'media_freq_media_valida_pct',  
    'desvio_padrao_freq_media_valida_pct'
]

# Define parameters
benchmark_params = {
    "algorithms": ["KMeans", "MeanShift"],
    "cluster_range": range(3,5),
    "verbose": False,
    "top_n": 5,
    "return_best": False
}

df_scaled = prepare_scaled_df(
    df_final,
    cols_to_keep=colunas_sem_normalizar,
    scaler_type="minmax",
    minmax_range=(0, 5)
)

[Scaling] Columns to scale: ['nu_inep', 'AbsAlto', 'AbsModerado', 'AbsPequeno', 'AbsSevero', 'AbsSignificativo', 'qtd_alunos_escola', 'pp_AbsPequeno', 'pp_AbsModerado', 'pp_AbsSignificativo', 'pp_AbsAlto', 'pp_AbsSevero', 'indice_meses_reportados', 'cod_uf', 'CO_UF', 'CO_MUNICIPIO', 'CO_ENTIDADE', 'TP_DEPENDENCIA', 'TP_LOCALIZACAO', 'TP_LOCALIZACAO_DIFERENCIADA', 'CO_CEP', 'NU_DDD', 'NU_TELEFONE', 'TP_SITUACAO_FUNCIONAMENTO', 'IN_VINCULO_SECRETARIA_EDUCACAO', 'IN_VINCULO_SEGURANCA_PUBLICA', 'IN_VINCULO_SECRETARIA_SAUDE', 'IN_VINCULO_OUTRO_ORGAO', 'IN_PODER_PUBLICO_PARCERIA', 'TP_REGULAMENTACAO', 'TP_RESPONSAVEL_REGULAMENTACAO', 'IN_LOCAL_FUNC_PREDIO_ESCOLAR', 'TP_OCUPACAO_PREDIO_ESCOLAR', 'IN_LOCAL_FUNC_SOCIOEDUCATIVO', 'IN_LOCAL_FUNC_UNID_PRISIONAL', 'IN_LOCAL_FUNC_PRISIONAL_SOCIO', 'IN_LOCAL_FUNC_GALPAO', 'IN_LOCAL_FUNC_SALAS_OUTRA_ESC', 'IN_LOCAL_FUNC_OUTROS', 'IN_PREDIO_COMPARTILHADO', 'IN_AGUA_POTAVEL', 'IN_AGUA_REDE_PUBLICA', 'IN_AGUA_POCO_ARTESIANO', 'IN_AGUA_CACIMBA', 'IN_AGUA_

In [6]:
df_scaled["media_freq_media_valida_pct"]

0       90.16
2       90.14
3       90.31
4       90.31
5       90.31
        ...  
4239    62.67
4240    84.14
4241    83.97
4242    63.59
4243    61.85
Name: media_freq_media_valida_pct, Length: 3888, dtype: float64

In [7]:
results = run_full_benchmark(df_scaled, **benchmark_params)

In [10]:
results

Unnamed: 0,Algorithm,Params,SilhouetteScore,NumClusters,ExecutionTime
0,KMeans,{'n_clusters': 3},0.150321,3,0.37846
2,MeanShift,{},0.149077,9,52.16909
1,KMeans,{'n_clusters': 4},0.118281,4,0.391095


In [8]:
generate_top_cluster_visuals(df_scaled, results, top_n=3)

  warn(
  warn(
  warn(


In [9]:
export_top_cluster_reports(df_scaled, results, top_n=3)

 Generating report for Top 1: KMeans with {'n_clusters': 3}
✔ Cluster counts exported to cluster_reports/top_3_20250402_163506/1_KMeans_n_clusters=3/counts.csv
✔ Cluster summary exported to cluster_reports/top_3_20250402_163506/1_KMeans_n_clusters=3/summary.csv
✔ PCA components exported to cluster_reports/top_3_20250402_163506/1_KMeans_n_clusters=3/pca.csv
✔ Cluster centroids exported to cluster_reports/top_3_20250402_163506/1_KMeans_n_clusters=3/centroids.csv
 Generating report for Top 2: MeanShift with {}
✔ Cluster counts exported to cluster_reports/top_3_20250402_163506/2_MeanShift_default/counts.csv
✔ Cluster summary exported to cluster_reports/top_3_20250402_163506/2_MeanShift_default/summary.csv
✔ PCA components exported to cluster_reports/top_3_20250402_163506/2_MeanShift_default/pca.csv
✔ Cluster centroids exported to cluster_reports/top_3_20250402_163506/2_MeanShift_default/centroids.csv
 Generating report for Top 3: KMeans with {'n_clusters': 4}
✔ Cluster counts exported to c