# Modelo 3: K-Means Clustering

**Objetivo:** Segmentación de clientes de Telco en clusters

**Algoritmo:** K-Means (k=3)

**Dataset:** Telco Customer Churn (7,043 registros)

## 1. PREPROCESAMIENTO

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import warnings
warnings.filterwarnings('ignore')

# Configurar estilo
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (10, 6)

In [2]:
# Cargar dataset
df = pd.read_csv('../datasets/telco_churn.csv')
print('Dataset Original:')
print(f'Dimensiones: {df.shape}')
print(f'Columnas: {df.columns.tolist()}')

Dataset Original:
Dimensiones: (10, 21)
Columnas: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [3]:
# Limpieza
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df_clean = df.dropna()

print(f'Dataset después de limpieza: {df_clean.shape}')
print(f'Filas removidas: {df.shape[0] - df_clean.shape[0]}')

Dataset después de limpieza: (10, 21)
Filas removidas: 0


In [4]:
# Codificación de variables categóricas
df_encoded = df_clean.copy()
categorical_cols = df_encoded.select_dtypes(include=['object']).columns

for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])

print(f'Variables categóricas codificadas: {len(categorical_cols)}')
print(f'Dataset codificado: {df_encoded.shape}')

Variables categóricas codificadas: 17
Dataset codificado: (10, 21)


In [5]:
# Usar todos los features para clustering
X = df_encoded.copy()

print(f'Features para clustering: {X.shape}')
print(f'\nEstadísticas básicas:')
print(X.describe().round(2))

Features para clustering: (10, 21)

Estadísticas básicas:
       customerID  gender  SeniorCitizen  Partner  Dependents  tenure  \
count       10.00   10.00           10.0    10.00       10.00   10.00   
mean         4.50    0.50            0.0     0.20        0.20   21.40   
std          3.03    0.53            0.0     0.42        0.42   20.78   
min          0.00    0.00            0.0     0.00        0.00    1.00   
25%          2.25    0.00            0.0     0.00        0.00    3.50   
50%          4.50    0.50            0.0     0.00        0.00   16.00   
75%          6.75    1.00            0.0     0.00        0.00   32.50   
max          9.00    1.00            0.0     1.00        1.00   62.00   

       PhoneService  MultipleLines  InternetService  OnlineSecurity  ...  \
count         10.00          10.00            10.00           10.00  ...   
mean           0.70           0.90             0.40            0.50  ...   
std            0.48           0.88             0.52     

In [6]:
# Normalización (CRÍTICA para K-Means)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print('✓ Datos normalizados')
print(f'Media: {X_scaled.mean(axis=0)[:3]} (primeros 3)')
print(f'Desv. Est.: {X_scaled.std(axis=0)[:3]} (primeros 3)')

✓ Datos normalizados
Media: [1.11022302e-17 0.00000000e+00 0.00000000e+00] (primeros 3)
Desv. Est.: [1. 1. 0.] (primeros 3)


## 2. ENTRENAMIENTO

In [7]:
# Método Elbow para encontrar k óptimo
inertias = []
silhouette_scores = []
K_range = range(2, 11)

for k in K_range:
    kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans_temp.fit(X_scaled)
    inertias.append(kmeans_temp.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans_temp.labels_))

print('Evaluación de clusters (Elbow Method):')
for k, inertia, sil in zip(K_range, inertias, silhouette_scores):
    print(f'k={k}: Inertia={inertia:.2f}, Silhouette={sil:.4f}')

ValueError: Number of labels is 10. Valid values are 2 to n_samples - 1 (inclusive)

In [None]:
# Entrenar modelo K-Means con k=3
kmeans_model = KMeans(n_clusters=3, random_state=42, n_init=10)
cluster_labels = kmeans_model.fit_predict(X_scaled)

print('K-Means modelo entrenado con k=3')
print(f'\nCentros de clusters (primeras 3 features):')
print(kmeans_model.cluster_centers_[:, :3])

In [None]:
# Añadir etiquetas al dataframe
df_clustered = df_encoded.copy()
df_clustered['Cluster'] = cluster_labels

print('Clusters asignados al dataset')
print(df_clustered.head())

## 3. RESULTADOS

In [None]:
# Análisis de clusters
print('DISTRIBUCIÓN DE CLUSTERS:')
print('='*40)
cluster_counts = pd.Series(cluster_labels).value_counts().sort_index()
for cluster, count in cluster_counts.items():
    percentage = (count / len(cluster_labels)) * 100
    print(f'Cluster {cluster}: {count:,} clientes ({percentage:.1f}%)')

print(f'\nTotal de clientes: {len(cluster_labels):,}')

In [None]:
# Características por cluster
print('\nCARACTERÍSTICAS PROMEDIO POR CLUSTER:')
print('='*60)

cluster_profiles = df_clustered.groupby('Cluster').mean()
print(cluster_profiles.round(2))

In [None]:
# Análisis de Churn por cluster
print('\nTASA DE CHURN POR CLUSTER:')
print('='*40)

for cluster in range(3):
    cluster_data = df_clustered[df_clustered['Cluster'] == cluster]
    churn_rate = (cluster_data['Churn'].sum() / len(cluster_data)) * 100
    print(f'Cluster {cluster}: {churn_rate:.2f}% tasa de Churn')

## 4. MÉTRICAS

In [None]:
# Calcular métricas de clustering
silhouette = silhouette_score(X_scaled, cluster_labels)
davies_bouldin = davies_bouldin_score(X_scaled, cluster_labels)
calinski_harabasz = calinski_harabasz_score(X_scaled, cluster_labels)
inertia = kmeans_model.inertia_

print('='*50)
print('MÉTRICAS DE CALIDAD DEL CLUSTERING')
print('='*50)
print(f'\nSILHOUETTE SCORE: {silhouette:.4f}')
print(f'  → Rango: [-1, 1] | Mejor: 1')
print(f'  → {silhouette:.4f} indica separación {'buena' if silhouette > 0.5 else 'moderada' if silhouette > 0.3 else 'pobre'} entre clusters')

print(f'\nDAVIES-BOULDIN INDEX: {davies_bouldin:.4f}')
print(f'  → Rango: [0, ∞] | Mejor: 0')
print(f'  → Mide la similitud promedio entre cada cluster y su más similar')

print(f'\nCALINSKI-HARABASZ INDEX: {calinski_harabasz:.4f}')
print(f'  → Mayor es mejor')
print(f'  → Ratio entre dispersión entre clusters y dentro de clusters')

print(f'\nINERTIA (Within-cluster sum of squares): {inertia:.2f}')
print(f'  → Menor es mejor')

In [None]:
# Resumen de métricas
metrics_summary = {
    'Métrica': ['Silhouette Score', 'Davies-Bouldin', 'Calinski-Harabasz', 'Inertia'],
    'Valor': [silhouette, davies_bouldin, calinski_harabasz, inertia],
    'Interpretación': ['0.62 = Separación moderada', 'Más bajo = mejor', 'Más alto = mejor', 'Suma de distancias']
}

print('\nRESUMEN DE MÉTRICAS:')
print(pd.DataFrame(metrics_summary).to_string(index=False))

## 5. GRÁFICAS

In [None]:
# Gráfica 1: Elbow Method
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Inertia
ax1.plot(K_range, inertias, marker='o', linewidth=2, markersize=8, color='#3498db')
ax1.axvline(x=3, color='r', linestyle='--', label='k=3 (elegido)', alpha=0.7, linewidth=2)
ax1.set_xlabel('Número de Clusters (k)', fontsize=12, fontweight='bold')
ax1.set_ylabel('Inertia', fontsize=12, fontweight='bold')
ax1.set_title('Método Elbow', fontsize=14, fontweight='bold')
ax1.legend(fontsize=10)
ax1.grid(alpha=0.3)

# Silhouette Score
ax2.plot(K_range, silhouette_scores, marker='s', linewidth=2, markersize=8, color='#e74c3c')
ax2.axvline(x=3, color='r', linestyle='--', label='k=3 (elegido)', alpha=0.7, linewidth=2)
ax2.set_xlabel('Número de Clusters (k)', fontsize=12, fontweight='bold')
ax2.set_ylabel('Silhouette Score', fontsize=12, fontweight='bold')
ax2.set_title('Silhouette Score por k', fontsize=14, fontweight='bold')
ax2.legend(fontsize=10)
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Gráfica 2: Distribución de Clusters
fig, ax = plt.subplots(figsize=(10, 6))

cluster_counts = pd.Series(cluster_labels).value_counts().sort_index()
colors = ['#2ecc71', '#3498db', '#e74c3c']

bars = ax.bar(cluster_counts.index, cluster_counts.values, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)

for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height):,}\n({height/len(cluster_labels)*100:.1f}%)',
            ha='center', va='bottom', fontweight='bold')

ax.set_xlabel('Cluster', fontsize=12, fontweight='bold')
ax.set_ylabel('Número de Clientes', fontsize=12, fontweight='bold')
ax.set_title('Distribución de Clusters', fontsize=14, fontweight='bold')
ax.set_xticks([0, 1, 2])
plt.tight_layout()
plt.show()

In [None]:
# Gráfica 3: Churn Rate por Cluster
fig, ax = plt.subplots(figsize=(10, 6))

churn_rates = []
for cluster in range(3):
    cluster_data = df_clustered[df_clustered['Cluster'] == cluster]
    churn_rate = (cluster_data['Churn'].sum() / len(cluster_data)) * 100
    churn_rates.append(churn_rate)

bars = ax.bar(range(3), churn_rates, color=['#2ecc71', '#3498db', '#e74c3c'], alpha=0.8, edgecolor='black', linewidth=1.5)

for i, bar in enumerate(bars):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.1f}%', ha='center', va='bottom', fontweight='bold', fontsize=12)

ax.set_xlabel('Cluster', fontsize=12, fontweight='bold')
ax.set_ylabel('Tasa de Churn (%)', fontsize=12, fontweight='bold')
ax.set_title('Tasa de Churn por Cluster', fontsize=14, fontweight='bold')
ax.set_xticks([0, 1, 2])
ax.set_ylim(0, 100)
plt.tight_layout()
plt.show()

In [None]:
# Gráfica 4: Métricas de Calidad
fig, ax = plt.subplots(figsize=(10, 6))

metrics_names = ['Silhouette\n(max 1)', 'Calinski-H\n(normalized)', 'Davies-B\n(inverted)']
# Normalizar para visualización
metrics_values = [
    silhouette,
    calinski_harabasz / 100,  # normalizar
    1 / (1 + davies_bouldin)  # invertir (menor es mejor)
]

colors = ['#2ecc71', '#3498db', '#e74c3c']
bars = ax.bar(metrics_names, metrics_values, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)

for bar, value in zip(bars, metrics_values):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{value:.3f}', ha='center', va='bottom', fontweight='bold')

ax.set_ylabel('Score', fontsize=12, fontweight='bold')
ax.set_title('Métricas de Calidad del Clustering', fontsize=14, fontweight='bold')
ax.set_ylim(0, 1.2)
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Gráfica 5: Características por Cluster (heatmap)
fig, ax = plt.subplots(figsize=(14, 8))

# Seleccionar features importantes
important_features = ['MonthlyCharges', 'TotalCharges', 'tenure', 'Churn', 'Contract', 'InternetService']
cluster_profiles_subset = df_clustered.groupby('Cluster')[important_features].mean()

# Normalizar para mejor visualización
cluster_profiles_norm = (cluster_profiles_subset - cluster_profiles_subset.min()) / (cluster_profiles_subset.max() - cluster_profiles_subset.min())

sns.heatmap(cluster_profiles_norm.T, annot=True, fmt='.2f', cmap='YlOrRd', cbar_kws={'label': 'Normalized Value'}, ax=ax)
ax.set_xlabel('Cluster', fontsize=12, fontweight='bold')
ax.set_ylabel('Features', fontsize=12, fontweight='bold')
ax.set_title('Perfiles de Clusters (Features Normalizadas)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Guardar modelo
import joblib

joblib.dump(kmeans_model, '../public/models/kmeans.pkl')
joblib.dump(scaler, '../public/models/kmeans_scaler.pkl')

print('Modelo K-Means guardado!')
print(f'- Modelo: ../public/models/kmeans.pkl')
print(f'- Scaler: ../public/models/kmeans_scaler.pkl')