In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/2025/Sistemas Inteligentes Avançados/Models/diabetic_data.csv', sep=',')


In [34]:
from collections import Counter
import numpy as np

# Primeiro, substituir os '?' por NaN
df.replace('?', np.nan, inplace=True)
df.drop(columns=['diag_1', 'diag_2', 'diag_3'], inplace=True, errors='ignore')

# Definir um limite de tolerância para "muitos" NaNs
# Exemplo: se mais de 30% dos valores estão faltando, dropa
limite_na = 0.5  # 30%


# Dropar colunas com muitos valores NaN
colunas_para_dropar = df.columns[df.isna().mean() > limite_na]
df.drop(columns=colunas_para_dropar, inplace=True)

# Agora, preencher os NaNs restantes
for coluna in df.columns:
    if df[coluna].isna().sum() > 0:
        if df[coluna].dtype == 'object':
            # Preencher categóricas com a moda
            moda = df[coluna].mode()[0]
            df[coluna].fillna(moda, inplace=True)
        else:
            # Preencher numéricas com a média
            media = df[coluna].astype(int).mean()
            df[coluna].fillna(media, inplace=True)

# Encontrar colunas onde um único valor domina
colunas_generalizantes = []

for coluna in df.columns:
    valor_mais_frequente = df[coluna].value_counts(normalize=True).iloc[0]
    if valor_mais_frequente > 0.95:
        colunas_generalizantes.append(coluna)

# Remover essas colunas
df.drop(columns=colunas_generalizantes, inplace=True)

print(f'Colunas generalizantes removidas: {colunas_generalizantes}')
print(f'Número de colunas restantes: {df.shape[1]}')



Colunas generalizantes removidas: []
Número de colunas restantes: 61


In [35]:
# Identificar colunas categóricas
colunas_categoricas = df.select_dtypes(include='object').columns


# Aplicar One-Hot Encoding
df = pd.get_dummies(df, columns=colunas_categoricas)
df.shape

(101766, 61)

In [36]:
from sklearn.preprocessing import MinMaxScaler #normalizar

scaler = MinMaxScaler()
dfNormalizada = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [37]:
from sklearn.cluster import KMeans
import math
from scipy.spatial.distance import cdist #Método de determinação de distancias
import numpy as np

# pegando amostra
amostra = dfNormalizada.sample(frac=0.01)
amostra.shape

(1018, 61)

In [38]:
numAmostras = len(amostra)

distortion = []
K= range(1, numAmostras)

for k in K:
    clusterModel = KMeans(n_clusters=k).fit(amostra)

    # calcular distorção
    distortion.append(
        sum(
            np.min(
                cdist(amostra, clusterModel.cluster_centers_, 'euclidean'), axis=1
            )/amostra.shape[0]
        )
    )
distortion

[np.float64(2.573955758601781),
 np.float64(2.4252399437580556),
 np.float64(2.3642445441866333),
 np.float64(2.3030191320970053),
 np.float64(2.252317185342066),
 np.float64(2.2293735584010568),
 np.float64(2.221392531676559),
 np.float64(2.1983634102864746),
 np.float64(2.1653377455150333),
 np.float64(2.1645365590073546),
 np.float64(2.1386201136631),
 np.float64(2.12862284737547),
 np.float64(2.1078994058447402),
 np.float64(2.096719165203055),
 np.float64(2.0908173766878537),
 np.float64(2.0706746550736685),
 np.float64(2.0671851013008733),
 np.float64(2.052711818846176),
 np.float64(2.0565073812144146),
 np.float64(2.0290103249685467),
 np.float64(2.022326520384029),
 np.float64(2.017542641713223),
 np.float64(2.005460565186934),
 np.float64(1.9921376980091132),
 np.float64(1.9878136150533396),
 np.float64(1.986490031261756),
 np.float64(1.977848213789325),
 np.float64(1.9687684291547876),
 np.float64(1.9652130709147884),
 np.float64(1.9537473703810215),
 np.float64(1.95068314768

In [40]:
import math
import numpy as np

x0 = K[0]
y0 = distortion[0]
xn = K[-1]
yn = distortion[-1]
distancias = []

for i in range(len(distortion)):
    x = K[i]
    y = distortion[i]
    numerador = abs((y - y0) * (xn - x0) - (x - x0) * (yn - y0))
    denominador = math.hypot(xn - x0, yn - y0)  # melhor para evitar divisão por zero

    if denominador == 0:
        distancia = 0
    else:
        distancia = numerador / denominador

    distancias.append(distancia)

# Agora encontrar o número ótimo de clusters
numOtimoClusters = K[np.argmax(distancias)]

numOtimoClusters

134

In [46]:
#Treinar

DiabeteClusterModel = KMeans(n_clusters = numOtimoClusters).fit(df)


In [51]:
# Desnormalizar

dfNormalizada['cluster'] = DiabeteClusterModel.labels_

dfDesnormalizado = pd.DataFrame(scaler.inverse_transform(dfNormalizada.drop('cluster', axis=1)), columns=dfNormalizada.drop('cluster', axis=1).columns)
dfDesnormalizado['cluster'] = dfNormalizada['cluster']
dfDesnormalizado.head()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,insulin_Steady,insulin_Up,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,readmitted_<30,readmitted_>30,readmitted_NO,cluster
0,2278392.0,8222157.0,6.0,25.0,1.0,1.0,41.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,102
1,149190.0,55629189.0,1.0,1.0,7.0,3.0,59.0,0.0,18.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,40
2,64410.0,86047875.0,1.0,1.0,7.0,2.0,11.0,5.0,13.0,2.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,129
3,500364.0,82442376.0,1.0,1.0,7.0,2.0,44.0,1.0,16.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,40
4,16680.0,42519267.0,1.0,1.0,7.0,1.0,51.0,0.0,8.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,84


In [56]:
def inferir_cluster(nova_instancia, scaler, kmeans, dfNormalizada, dfDesnormalizado):
    # 1. Converte a nova instância para DataFrame
    nova_df = pd.DataFrame([nova_instancia])

    # 2. Garante que tenha todas as colunas na mesma ordem
    nova_df = nova_df.reindex(columns=dfNormalizada.drop(columns='cluster').columns, fill_value=0)

    # 3. Normaliza a nova instância
    nova_normalizada = scaler.transform(nova_df)

    # 4. Prediz o cluster
    cluster_predito = kmeans.predict(nova_normalizada)[0]

    print(f"\nA nova instância pertence ao cluster: {cluster_predito}\n")

    # 5. Pega os dados desnormalizados correspondentes
    dados_cluster = dfDesnormalizado[dfDesnormalizado['cluster'] == cluster_predito]

    print("Média dos dados neste cluster (desnormalizados):")
    print(dados_cluster.drop(columns='cluster').mean().round(2))

    print("\nExemplo real desse cluster:")
    print(dados_cluster.drop(columns='cluster').sample(1).T)

In [57]:
#  1. Criar uma nova instância preenchida corretamente
nova_instancia = {
    coluna: 1 if 'gender_' in coluna or 'age_' in coluna or 'race_' in coluna or 'admission_' in coluna else 5
    for coluna in dfNormalizada.drop('cluster', axis=1).columns
}

# 2. Inferir o cluster da nova instância
inferir_cluster(nova_instancia, scaler, DiabeteClusterModel, dfNormalizada, dfDesnormalizado)


A nova instância pertence ao cluster: 102

Média dos dados neste cluster (desnormalizados):
encounter_id                8226717.19
patient_nbr                 3510555.25
admission_type_id                 3.26
discharge_disposition_id         10.40
admission_source_id               4.73
                               ...    
diabetesMed_No                    0.21
diabetesMed_Yes                   0.79
readmitted_<30                    0.09
readmitted_>30                    0.40
readmitted_NO                     0.51
Length: 61, dtype: float64

Exemplo real desse cluster:
                                 119
encounter_id               2087382.0
patient_nbr               15856002.0
admission_type_id                2.0
discharge_disposition_id        11.0
admission_source_id              4.0
...                              ...
diabetesMed_No                   0.0
diabetesMed_Yes                  1.0
readmitted_<30                   0.0
readmitted_>30                   0.0
readmitted_NO  

