# K-Means Clustering - Análisis de Utilities

Análisis de clustering usando K-Means en el dataset de Utilities.


## Importar librerías


In [2]:
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import pairwise
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import KMeans
import matplotlib.pylab as plt
import seaborn as sns
from pandas.plotting import parallel_coordinates


## Cargar y preprocesar datos


In [12]:
# Load the raw utilities table and trim stray whitespace from the column names.
utilities_df = pd.read_csv("Utilities.csv")
utilities_df.columns = utilities_df.columns.str.strip()

# Report which columns were read and preview the raw rows.
print("Columnas disponibles:", utilities_df.columns.tolist())
print("\nPrimeras filas:", utilities_df.head(), sep="\n")

# Index the rows by "Company"; when that column is missing, fall back to
# whatever column comes first in the file.
company_present = "Company" in utilities_df.columns
if not company_present:
    first_col = utilities_df.columns[0]
    print(f"\n⚠ 'Company' no encontrada. Usando '{first_col}' como índice")
    utilities_df = utilities_df.set_index(first_col)
else:
    utilities_df = utilities_df.set_index("Company")
    print("\n✓ Índice establecido correctamente con 'Company'")

# Cast every remaining column to float64 before scaling.
utilities_df = utilities_df.astype("float64")
print("✓ Columnas convertidas a float64")

# Scale each column independently with sklearn's preprocessing.scale
# (this normalizes the data itself, not distances).
utilities_df_norm = utilities_df.apply(preprocessing.scale, axis=0)
print("✓ Datos normalizados")
print("\nDatos normalizados (primeras filas):", utilities_df_norm.head(), sep="\n")
print("\nTipos de datos normalizados:", utilities_df_norm.dtypes, sep="\n")

Columnas disponibles: ['Company', 'Fixed_charge', 'RoR', 'Cost', 'Load_factor', 'Demand_growth', 'Sales', 'Nuclear', 'Fuel_Cost']

Primeras filas:
        Company  Fixed_charge   RoR  Cost  Load_factor  Demand_growth  Sales  \
0      Arizona           1.06   9.2   151         54.4            1.6   9077   
1       Boston           0.89  10.3   202         57.9            2.2   5088   
2      Central           1.43  15.4   113         53.0            3.4   9212   
3  Commonwealth          1.02  11.2   168         56.0            0.3   6423   
4            NY          1.49   8.8   192         51.2            1.0   3300   

   Nuclear  Fuel_Cost  
0      0.0      0.628  
1     25.3      1.555  
2      0.0      1.058  
3     34.3      0.700  
4     15.6      2.044  

✓ Índice establecido correctamente con 'Company'
✓ Columnas convertidas a float64
✓ Datos normalizados

Datos normalizados (primeras filas):
              Fixed_charge       RoR      Cost  Load_factor  Demand_growth  \
Company 

In [13]:
# This cell is no longer needed - all processing happens in the previous cell.
# To check the state of the data, run:
# print(utilities_df_norm.head())

## Aplicar K-Means


In [None]:
# Fit a six-cluster K-Means model on the scaled data (random_state is fixed at 0).
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans.fit(utilities_df_norm)

# Cluster membership: pair each row label with its assigned cluster id,
# then list the members of every cluster on one line.
memb = pd.Series(kmeans.labels_, index=utilities_df_norm.index)
for cluster_id, members in memb.groupby(memb):
    print(cluster_id, ": ", ", ".join(members.index))


## Centroides


In [None]:
# Cluster centers as a table: one row per cluster, columns named after the
# features of the scaled input. Display precision is left at the pandas default.
centroids = pd.DataFrame(
    data=kmeans.cluster_centers_,
    columns=utilities_df_norm.columns,
)
centroids


## Análisis de distancias dentro de los clusters


In [None]:
# Within-cluster sum of squared distances and member count for each cluster.

# kmeans.transform gives, for every row, its distance to each cluster center.
distances = kmeans.transform(utilities_df_norm)

# The smallest distance in a row is the distance to that row's closest center;
# square it to get the row's contribution to the within-cluster sum of squares.
minSquaredDistances = distances.min(axis=1) ** 2

# Combine the squared distances with the cluster labels, keyed by the row index.
df = pd.DataFrame(
    {'squaredDistance': minSquaredDistances, 'cluster': kmeans.labels_},
    index=utilities_df_norm.index,
)

# Report size and within-cluster sum of squares per cluster.
for cluster, data in df.groupby('cluster'):
    count = len(data)
    withinClustSS = data.squaredDistance.sum()
    print(f'Cluster {cluster} ({count} members): squaredDistanceSum = {withinClustSS:.3f}')


## Distancias euclidianas entre centroides


In [None]:
# Pairwise Euclidean distances between the fitted cluster centers, shown as a
# square table whose row and column positions follow the order of the centers.
centroid_distances = pairwise.pairwise_distances(
    kmeans.cluster_centers_,
    metric='euclidean',
)
pd.DataFrame(centroid_distances)


## Visualización: Perfil de centroides


In [None]:
# Profile plot of the centroids: one line per cluster across all features.

# parallel_coordinates needs a class column. Add it to a labelled copy via
# assign() so the shared `centroids` frame stays purely numeric and re-running
# this cell never changes what other cells see.
centroids_labeled = centroids.assign(cluster=[str(i) for i in centroids.index])

fig, ax = plt.subplots(figsize=(10, 6))
parallel_coordinates(
    centroids_labeled,
    class_column='cluster',
    colormap='Dark2',
    linewidth=5,
    ax=ax,
)
ax.set_title('Centroid profile by cluster')
ax.set_ylabel('Scaled feature value')
# Place the legend outside the axes so it does not cover the lines.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()


## Análisis del número óptimo de clusters


In [None]:
# Average within-cluster squared distance versus number of clusters (elbow plot).

cluster_counts = range(1, 7)

# Fit one throwaway model per k. The models are deliberately not bound to
# `kmeans`, so the fitted six-cluster model used by the earlier cells is not
# overwritten when this cell runs.
inertia = [
    KMeans(n_clusters=k, random_state=0).fit(utilities_df_norm).inertia_ / k
    for k in cluster_counts
]
inertias = pd.DataFrame({'n_clusters': cluster_counts, 'inertia': inertia})

fig, ax = plt.subplots()
inertias.plot(x='n_clusters', y='inertia', ax=ax, legend=False)
ax.set_title('Elbow plot for K-Means')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Average Within-Cluster Squared Distances')
# Anchor the y-axis at zero with a little headroom above the largest value.
ax.set_ylim(0, 1.1 * inertias['inertia'].max())
plt.show()
