# Clase 10
## Modelos de aprendizaje no supervisado

### K-means


In [None]:
# Importar librerías
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

import plotly.express as px
import plotly.graph_objects as go


In [None]:
# Load the customer dataset from a CSV file and preview its first rows.
ruta_datos = 'Mall_Customers.csv'  # relative path: resolved against the kernel's working directory
df = pd.read_csv(ruta_datos)
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [None]:
# Encode the categorical Gender column as integers (Male -> 0, Female -> 1).
# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time on the already-encoded column would wipe it.
# The guard makes the cell idempotent: the mapping is applied only while the
# column still contains at least one of the raw text labels.
codigos_genero = {'Male': 0, 'Female': 1}
if df['Gender'].isin(list(codigos_genero)).any():
    df['Gender'] = df['Gender'].map(codigos_genero)


In [None]:
# Preview the first rows again to check the result of the Gender encoding.
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,0,19,15,39
1,2,0,21,15,81
2,3,1,20,16,6
3,4,1,23,16,77
4,5,1,31,17,40


In [None]:
# Data preparation: keep only the three numeric columns used for clustering.
columnas_modelo = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
x = df[columnas_modelo]

In [None]:
# Summary statistics of the selected features (count, mean, std, quartiles).
x.describe()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0
mean,38.85,60.56,50.2
std,13.969007,26.264721,25.823522
min,18.0,15.0,1.0
25%,28.75,41.5,34.75
50%,36.0,61.5,50.0
75%,49.0,78.0,73.0
max,70.0,137.0,99.0


In [None]:
# Feature scaling: standardize each column with StandardScaler.
# The scaler is fitted first and then applied to the same data.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

In [None]:
# Show the shape and only the first rows of the scaled matrix: printing the
# whole array floods the notebook output without adding information.
print(x_scaled.shape)
x_scaled[:5]

array([[-1.42456879, -1.73899919, -0.43480148],
       [-1.28103541, -1.73899919,  1.19570407],
       [-1.3528021 , -1.70082976, -1.71591298],
       [-1.13750203, -1.70082976,  1.04041783],
       [-0.56336851, -1.66266033, -0.39597992],
       [-1.20926872, -1.66266033,  1.00159627],
       [-0.27630176, -1.62449091, -1.71591298],
       [-1.13750203, -1.62449091,  1.70038436],
       [ 1.80493225, -1.58632148, -1.83237767],
       [-0.6351352 , -1.58632148,  0.84631002],
       [ 2.02023231, -1.58632148, -1.4053405 ],
       [-0.27630176, -1.58632148,  1.89449216],
       [ 1.37433211, -1.54815205, -1.36651894],
       [-1.06573534, -1.54815205,  1.04041783],
       [-0.13276838, -1.54815205, -1.44416206],
       [-1.20926872, -1.54815205,  1.11806095],
       [-0.27630176, -1.50998262, -0.59008772],
       [-1.3528021 , -1.50998262,  0.61338066],
       [ 0.94373197, -1.43364376, -0.82301709],
       [-0.27630176, -1.43364376,  1.8556706 ],
       [-0.27630176, -1.39547433, -0.590

#### Determinación del número óptimo de clusters
El número de clústeres en K-Means no viene dado por el algoritmo; se elige combinando criterios estadísticos y de negocio.

**Método del codo**
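Se ejecuta K-Means para varios valores de k (por ejemplo, de 1 a 10) y se calcula la suma de cuadrados intra‑clúster (inercia o WCSS) para cada k. Se elige el k a partir del cual la inercia deja de disminuir de forma pronunciada (el «codo» de la curva).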

**Coeficiente de silueta**
Para cada k, se calcula el coeficiente de silueta medio, que mide qué tan bien queda cada punto dentro de su clúster en comparación con los demás clústeres. Se suele escoger el k que maximiza la silueta promedio, siempre que también tenga sentido desde el punto de vista del problema.

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Método del codo
inercia = []
k_rango = range(1, 11)

for k in k_rango:
  kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
  kmeans.fit(x_scaled)
  inercia_actual = kmeans.inertia_
  inercia.append(inercia_actual)

inercia

[599.9999999999999,
 389.38618895643714,
 295.21224615554877,
 205.22514747675913,
 168.2475801755683,
 133.86842085478855,
 117.01155455889815,
 103.87329152383714,
 93.0928911004172,
 82.38515364526597]

In [None]:
# Plot inertia against the number of clusters to locate the "elbow".
etiquetas_codo = {'x': 'Número de clústers', 'y':'Inercia'}
valores_k = list(k_rango)

fig = px.line(x=valores_k, y=inercia, markers=True, title='Método del codo', labels=etiquetas_codo)
fig.show()

In [None]:
# Silhouette method: mean silhouette coefficient for k = 2..10.
# The range starts at 2 because the loop only evaluates k >= 2.
from sklearn.metrics import silhouette_score

silueta = []

for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(x_scaled)
    labels = kmeans.labels_  # cluster assignment of every sample for this k
    puntaje = silhouette_score(x_scaled, labels)
    silueta.append(puntaje)

silueta


[np.float64(0.33547192894004574),
 np.float64(0.357793388710272),
 np.float64(0.4039582785148566),
 np.float64(0.41664341513732767),
 np.float64(0.4284167762892593),
 np.float64(0.417231894954916),
 np.float64(0.4082067042807375),
 np.float64(0.41769250624076476),
 np.float64(0.40655411010117015)]

In [None]:
# Plot the mean silhouette coefficient against the number of clusters.
# x must cover the same k values (2..10) used to compute `silueta`.
# Title typo fixed ("Coeficiende" -> "Coeficiente"); markers and an explicit
# list for x are used so this chart matches the elbow plot.
fig = px.line(
    x=list(range(2, 11)),
    y=silueta,
    markers=True,
    title='Método del Coeficiente de silueta',
    labels={'x': 'Número de Clústers', 'y':'Coeficiente de silueta'}
)
fig.show()

In [None]:
# Train the final model with k = 6, the k with the highest mean silhouette
# among the values scored earlier in the notebook.
kmeans = KMeans(n_clusters=6, random_state=42, n_init=10).fit(x_scaled)
# Store each customer's cluster label next to the original columns.
df['cluster'] = kmeans.labels_

In [None]:
# Visualize the clusters on the annual-income vs. spending-score plane.
# Cluster labels are nominal identifiers, not quantities. Passed as integers,
# Plotly Express colors them on a continuous scale with a color bar; casting
# them to strings gives one discrete color and one legend entry per cluster.
# The cast is done on a temporary copy, so df['cluster'] stays numeric.
clusters_plot = df.assign(cluster=df['cluster'].astype(str))

fig = px.scatter(
    clusters_plot,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    color='cluster',
    # keep the legend in numeric order (0, 1, 2, ...) rather than order of appearance
    category_orders={'cluster': sorted(clusters_plot['cluster'].unique(), key=int)},
    title='Clusters de clientes'
)

fig.show()

In [None]:
# Cluster analysis: mean of each feature per cluster.
columnas_perfil = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
por_cluster = df.groupby('cluster')
cluster_analisis = por_cluster[columnas_perfil].mean()
cluster_analisis

Unnamed: 0_level_0,Age,Annual Income (k$),Spending Score (1-100)
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,56.333333,54.266667,49.066667
1,26.794872,57.102564,48.128205
2,41.939394,88.939394,16.969697
3,32.692308,86.538462,82.128205
4,25.0,25.26087,77.608696
5,45.52381,26.285714,19.380952


In [None]:
# Model evaluation: mean silhouette coefficient of the final clustering.
# Rule of thumb for reading the score:
#   > 0.5       well-defined clustering
#   0.25 - 0.5  acceptable clustering
#   < 0.25      poorly defined clusters
#
# The result is stored under its own name. Assigning it to `silhouette_score`
# would overwrite the imported function with a float, and re-running this cell
# would then fail with "'numpy.float64' object is not callable".
silueta_final = silhouette_score(x_scaled, df['cluster'])
silueta_final

np.float64(0.4284167762892593)