In [1]:
from ucimlrepo import fetch_ucirepo, list_available_datasets
  
# Download dataset id 20 from the UCI ML Repository.
CENSUS_INCOME_DATASET_ID = 20
census_income = fetch_ucirepo(id=CENSUS_INCOME_DATASET_ID)

# Feature and target tables (pandas DataFrames).
X, y = census_income.data.features, census_income.data.targets

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


# Work on an independent copy of the feature frame. A plain `df = X` only
# creates a second name for the same object, so every later in-place edit of
# `df` would also modify `X` (and the frame held by `census_income`), and
# re-running this cell could not restore a clean starting point.
df = X.copy()

In [None]:
# Print the column dtypes and non-null counts of the feature frame before any preprocessing.
df.info()

In [None]:
# Columns excluded from the clustering features.
columns_to_drop = ['native-country', 'fnlwgt', 'occupation', 'workclass']

# Rebind `df` to a new frame instead of dropping in place, so the frame that
# `df` was bound to before this cell is left untouched. `errors='ignore'`
# makes the cell safe to re-run: columns that are already gone are skipped
# instead of raising KeyError. (Trade-off: a misspelled name is skipped too.)
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# 1. Handle missing values.
# Categorical (object-dtype) columns: replace missing entries with the most
# frequent value of each column.
categorical_cols = df.select_dtypes(include=['object']).columns
imputer_cat = SimpleImputer(strategy='most_frequent')
categorical_filled = imputer_cat.fit(df[categorical_cols]).transform(df[categorical_cols])
df[categorical_cols] = categorical_filled

In [None]:
# Numeric (int64 / float64) columns: replace missing entries with the column mean.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
imputer_num = SimpleImputer(strategy='mean')
numerical_filled = imputer_num.fit(df[numerical_cols]).transform(df[numerical_cols])
df[numerical_cols] = numerical_filled

In [None]:
# Replace every categorical column with integer codes, one encoder per column.
# NOTE(review): integer codes give nominal categories an arbitrary order that
# the Euclidean distances used later will treat as meaningful; consider
# one-hot encoding instead -- TODO confirm the intended encoding.
for col in categorical_cols:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])

In [None]:
# Summary statistics of the frame; in cell order this follows imputation and encoding.
df.describe()

In [None]:
# Re-check dtypes and non-null counts; in cell order this follows imputation and encoding.
df.info()

In [None]:
# Standardise every column with StandardScaler.
scaler = StandardScaler()

# Build the scaled frame in a single step and keep the original column names
# and row index. Wrapping the bare array in pd.DataFrame() without them would
# relabel the columns 0..n-1 and lose the link back to the rows of `df`.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

In [None]:
# Display the standardised feature frame.
df_scaled

In [None]:
# Disabled (kept for reference): dist_matrix = pairwise_distances(df_scaled)

In [None]:
# Cut the Ward hierarchy at the chosen linkage distance.
cut_distance = 100
model = AgglomerativeClustering(
    distance_threshold=cut_distance,
    n_clusters=None,
    metric='euclidean',
    linkage='ward',
)
model.fit(df_scaled)

# Rebuild the SciPy-style linkage matrix from the fitted model instead of
# running a second Ward linkage over the full data set. Because
# `distance_threshold` is set, the model exposes `children_` and `distances_`;
# the only missing column is the number of original samples under each merge.
n_samples = len(model.labels_)
counts = np.zeros(model.children_.shape[0])
for i, merge in enumerate(model.children_):
    current_count = 0
    for child_idx in merge:
        if child_idx < n_samples:
            current_count += 1  # leaf node: one original sample
        else:
            # internal node: reuse the size of the earlier merge it refers to
            current_count += counts[child_idx - n_samples]
    counts[i] = current_count
Z = np.column_stack([model.children_, model.distances_, counts]).astype(float)

# Plot the dendrogram (top 3 levels only) with the cut drawn as a dashed line.
plt.figure(figsize=(10, 12))
plt.title("Dendrograma")
plt.xlabel("Pontos de Dados")
plt.ylabel("Distância")
plt.axhline(y=cut_distance, color='r', linestyle='--')  # cut distance
# Colour branches using the same threshold as the cut, so the coloured
# subtrees correspond to the clusters produced by the model.
dendrogram(Z, truncate_mode="level", p=3, color_threshold=cut_distance)
plt.show()

# Number of clusters produced by the cut.
num_clusters = len(np.unique(model.labels_))
print(f"Número de clusters encontrados: {num_clusters}")

In [None]:
# Internal validation indices for the partition stored in `model.labels_`.
cluster_labels = model.labels_
silhouette_avg = silhouette_score(df_scaled, cluster_labels)
calinski_harabasz = calinski_harabasz_score(df_scaled, cluster_labels)
davies_bouldin = davies_bouldin_score(df_scaled, cluster_labels)

# Report each index with four decimal places.
print(f"Média do Silhouette Score: {silhouette_avg:.4f}")
print(f"Índice Calinski-Harabasz: {calinski_harabasz:.4f}")
print(f"Índice Davies-Bouldin: {davies_bouldin:.4f}")