In [None]:
# Required libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
# scipy exports this function as `dendrogram`; the alias keeps the old local name usable.
from scipy.cluster.hierarchy import dendrogram as cluster_dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [None]:
# Load the mall customer dataset from its CSV file.
# NOTE(review): this is an absolute path on one specific machine; adjust it to run elsewhere.
mall_customers_csv = r'C:\Users\neide\Desktop\HDD\UNIVERSIDAD\6 SEMESTRE\1- ANALISIS DE DATOS\4-ALGORITMOS DE APRENDIZAJE NO SUPERVISADO\Anexo 5 - Dataset Mall Customer Segmentation\Mall_Customers.csv'
customer_data = pd.read_csv(mall_customers_csv)

In [None]:
# Preview the first 35 rows of the loaded data
customer_data.head(35)

In [None]:
# Descriptive statistics of the loaded data, as computed by DataFrame.describe()
customer_data.describe()

In [None]:
# Inspect the loaded data.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
customer_data.info()  # general information: columns, dtypes, non-null counts
print(customer_data.describe())  # statistical summary

In [None]:
# Visual check for extreme values: one boxplot per numeric column, side by side
boxplot_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
fig, axes = plt.subplots(1, len(boxplot_columns), figsize=(12, 6))
for ax, column in zip(axes, boxplot_columns):
    sns.boxplot(x=customer_data[column], color='blue', ax=ax)
    ax.set_title(f'Valores Atípicos en {column}')
fig.tight_layout()
plt.show()

In [None]:
# Keep only the numeric features used for clustering.
# The explicit .copy() makes this an independent frame: a column is assigned onto
# customer_data later in the notebook, and without the copy that assignment would be
# a write to a slice of the loaded data (SettingWithCopyWarning).
customer_data = customer_data[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].copy()


In [None]:
# Standardize the clustering features with StandardScaler.
# The feature columns are selected explicitly (same order as the column selection
# above) so that re-running this cell after the 'Hierarchical_Cluster' label column
# has been added to customer_data still scales only these three features.
feature_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
data_scaler = StandardScaler()
customer_data_scaled = data_scaler.fit_transform(customer_data[feature_columns])


In [None]:
# Hierarchical view used to choose the number of clusters.
# Ward linkage is computed on the scaled features and drawn with scipy's
# `dendrogram`, which is the name scipy.cluster.hierarchy actually exports.
linkage_matrix = linkage(customer_data_scaled, method='ward')
plt.figure(figsize=(10, 7))
dendrogram(linkage_matrix, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title('Dendrograma - Clustering Jerárquico')
plt.xlabel('Puntos de datos')
plt.ylabel('Distancia Euclidiana')
plt.show()

In [None]:
# Number of clusters to extract from the hierarchy (adjust after inspecting the dendrogram)
num_clusters = 4

In [None]:
# Fit Ward-linkage agglomerative clustering on the scaled features and
# store each row's cluster label as a new column of customer_data
cluster_model = AgglomerativeClustering(
    n_clusters=num_clusters,
    metric='euclidean',
    linkage='ward',
)
cluster_labels = cluster_model.fit_predict(customer_data_scaled)
customer_data['Hierarchical_Cluster'] = cluster_labels

In [None]:
# Clustering evaluation metrics, computed on the scaled features against the
# labels stored in the 'Hierarchical_Cluster' column
assigned_labels = customer_data['Hierarchical_Cluster']
silhouette_cluster_model = silhouette_score(customer_data_scaled, assigned_labels)
calinski_cluster_model = calinski_harabasz_score(customer_data_scaled, assigned_labels)
davies_cluster_model = davies_bouldin_score(customer_data_scaled, assigned_labels)

In [None]:
# Plot the clusters: scaled column 1 against scaled column 2, colored by cluster label
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    x=customer_data_scaled[:, 1],
    y=customer_data_scaled[:, 2],
    hue=customer_data['Hierarchical_Cluster'],
    palette='viridis',
    s=60,
    ax=ax,
)
ax.set_title('Clustering Jerárquico - Visualización de Clusters')
ax.set_xlabel('Ingresos Anuales (Escalados)')
ax.set_ylabel('Puntaje de Gasto (Escalado)')
plt.show()

In [None]:
# Report the three evaluation metrics, each formatted to two decimals
metric_report = (
    ("Silhouette Score", silhouette_cluster_model),
    ("Calinski-Harabasz Index", calinski_cluster_model),
    ("Davies-Bouldin Index", davies_cluster_model),
)
for metric_name, metric_value in metric_report:
    print(f"{metric_name}: {metric_value:.2f}")