# K-Means Clustering - Análisis de Utilities

Análisis de clustering usando K-Means en el dataset de Utilities.


## Importar librerías


In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import pairwise
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import KMeans
import matplotlib.pylab as plt
import seaborn as sns
from pandas.plotting import parallel_coordinates


## Cargar y preprocesar datos


In [18]:
# Read the raw Utilities table from the CSV next to the notebook.
utilities_df = pd.read_csv(filepath_or_buffer="Utilities.csv")
# Handy interactive checks on the raw frame:
#   utilities_df.dtypes / utilities_df.shape / utilities_df.head()

In [19]:
# Clean the raw frame and build the standardized copy used for clustering.
# This cell can be re-run on its own: the "Company" index is only set when it
# is not already in place, so a full notebook restart is not required.

# Strip stray whitespace from the column names
utilities_df.columns = utilities_df.columns.str.strip()
# Show the first rows
print("\nPrimeras filas:")
print(utilities_df.head())

# Index the rows by company name (skipped when a previous run already did it;
# still raises KeyError if the column is genuinely missing)
if utilities_df.index.name != "Company":
    utilities_df = utilities_df.set_index("Company")
# The company names carry trailing blanks (e.g. "Madison "); strip them so the
# labels print cleanly and can be looked up by their exact name
utilities_df = utilities_df.rename(index=str.strip)
print("\n✓ Índice establecido correctamente con 'Company'")

# Convert every column to float64 in one call
utilities_df = utilities_df.astype("float64")
print("✓ Columnas convertidas a float64")

# Standardize each column (zero mean, unit variance) so that no variable
# dominates the Euclidean distances because of its scale
utilities_df_norm = pd.DataFrame(
    preprocessing.scale(utilities_df, axis=0),
    index=utilities_df.index,
    columns=utilities_df.columns,
)
print("✓ Datos normalizados")
print("\nDatos normalizados (primeras filas):")
print(utilities_df_norm.head())
print("\nTipos de datos normalizados:")
print(utilities_df_norm.dtypes)


Primeras filas:
        Company  Fixed_charge   RoR  Cost  Load_factor  Demand_growth  Sales  \
0      Arizona           1.06   9.2   151         54.4            1.6   9077   
1       Boston           0.89  10.3   202         57.9            2.2   5088   
2      Central           1.43  15.4   113         53.0            3.4   9212   
3  Commonwealth          1.02  11.2   168         56.0            0.3   6423   
4            NY          1.49   8.8   192         51.2            1.0   3300   

   Nuclear  Fuel_Cost  
0      0.0      0.628  
1     25.3      1.555  
2      0.0      1.058  
3     34.3      0.700  
4     15.6      2.044  

✓ Índice establecido correctamente con 'Company'
✓ Columnas convertidas a float64
✓ Datos normalizados

Datos normalizados (primeras filas):
              Fixed_charge       RoR      Cost  Load_factor  Demand_growth  \
Company                                                                      
Arizona          -0.300057 -0.700750 -0.426938    -0.591310 

## Aplicar K-Means


In [20]:
# Fit k-means with 6 clusters on the standardized data.
# n_init is set explicitly: scikit-learn's default for it differs between
# releases, so leaving it implicit makes the clustering depend on the
# installed version. random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=0).fit(utilities_df_norm)
# Cluster membership: one label per row, indexed like the input frame
memb = pd.Series(kmeans.labels_, index=utilities_df_norm.index)
# Print the members of each cluster on one line
for label, members in memb.groupby(memb):
    print(label, ": ", ", ".join(members.index))


0 :  Commonwealth, Madison , Northern, Wisconsin, Virginia
1 :  Pacific 
2 :  Arizona , Central , Florida , Oklahoma, Southern, Texas
3 :  Idaho, Nevada, Puget
4 :  Kentucky
5 :  Boston , NY, Hawaiian , New England, San Diego, United


## Centroides


In [21]:
# Cluster centers as a table: one row per cluster, one column per variable
centroids = pd.DataFrame(
    data=kmeans.cluster_centers_,
    columns=utilities_df_norm.columns,
)
# Optional: pd.set_option("display.precision", 3) shortens the displayed floats
centroids


Unnamed: 0,Fixed_charge,RoR,Cost,Load_factor,Demand_growth,Sales,Nuclear,Fuel_Cost
0,-0.011599,0.33918,0.224086,-0.366466,0.170386,-0.411331,1.601868,-0.60946
1,-0.854783,-1.430526,-0.103911,1.198264,-1.096618,-0.705245,-0.676588,0.547149
2,0.393352,0.758801,-1.176527,-0.533952,-0.801203,0.444577,-0.502869,-0.416385
3,-0.614402,-0.852787,1.370418,-0.49189,1.015056,1.90021,-0.731447,-0.988493
4,1.253178,1.032466,-0.004518,0.785285,1.29953,-0.14648,-0.731447,-0.443073
5,-0.142884,-0.548714,0.322651,0.754694,0.117868,-0.909952,-0.231625,1.401169


## Análisis de distancias dentro de los clusters


In [22]:
# Within-cluster sum of squared distances and member count per cluster.
# transform() gives the distance from every row to every cluster center.
distances = kmeans.transform(utilities_df_norm)
# The smallest distance in each row is the distance to the closest center
minSquaredDistances = distances.min(axis=1) ** 2
# Pair each squared distance with the label k-means assigned to that row
df = pd.DataFrame(
    {'squaredDistance': minSquaredDistances, 'cluster': kmeans.labels_},
    index=utilities_df_norm.index,
)
# Report the member count and the summed squared distance of each cluster
for cluster, data in df.groupby("cluster"):
    count = len(data)
    withinClustSS = data.squaredDistance.sum()
    print('Cluster', cluster, '(', count, 'members)', 'squaredDistanceSum:', withinClustSS, 'within', cluster)


Cluster 0 ( 5 members) squaredDistanceSum: 10.661717000326373 within 0
Cluster 1 ( 1 members) squaredDistanceSum: 1.776356839400251e-15 within 1
Cluster 2 ( 6 members) squaredDistanceSum: 20.275555946517628 within 2
Cluster 3 ( 3 members) squaredDistanceSum: 9.987498917250377 within 3
Cluster 4 ( 1 members) squaredDistanceSum: 0.0 within 4
Cluster 5 ( 6 members) squaredDistanceSum: 32.27114263082476 within 5


## Distancias euclidianas entre centroides


In [23]:
# Pairwise Euclidean distances between the cluster centers (symmetric, zero diagonal)
centroid_distances = pairwise.pairwise_distances(
    kmeans.cluster_centers_,
    metric='euclidean',
)
pd.DataFrame(centroid_distances)


Unnamed: 0,0,1,2,3,4,5
0,0.0,3.823555,2.910944,3.841702,3.205659,3.118976
1,3.823555,0.0,3.587479,4.362407,4.21041,2.026654
2,2.910944,3.587479,0.0,3.987317,2.956953,3.458584
3,3.841702,4.362407,3.987317,0.0,3.889859,4.198671
4,3.205659,4.21041,2.956953,3.889859,0.0,3.191765
5,3.118976,2.026654,3.458584,4.198671,3.191765,0.0


## Visualización: Perfil de centroides


In [None]:
# Profile plot: one line per centroid across the standardized variables.
# parallel_coordinates needs a column naming the class of each row, so the
# cluster number is stored as a string label.
centroids['cluster'] = list(map(str, centroids.index))
profile_fig, profile_ax = plt.subplots(figsize=(10, 6))
parallel_coordinates(
    centroids,
    class_column='cluster',
    colormap='Dark2',
    linewidth=5,
    ax=profile_ax,
)
# Move the legend outside the axes so it does not cover the lines
profile_ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()


## Análisis del número óptimo de clusters


In [None]:
# Elbow-style plot: inertia divided by the number of clusters, for k = 1..6.
inertia = []
for n_clusters in range(1, 7):
    # Use a separate name for each candidate model so the fitted `kmeans`
    # object used by the other cells is not overwritten by this loop.
    # n_init is set explicitly so the curve does not depend on the default
    # of the installed scikit-learn version.
    candidate = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(utilities_df_norm)
    inertia.append(candidate.inertia_ / n_clusters)
inertias = pd.DataFrame({'n_clusters': range(1, 7), 'inertia': inertia})
ax = inertias.plot(x='n_clusters', y='inertia')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Average Within-Cluster Squared Distances')
# Start the y axis at zero and leave 10% headroom above the largest value
plt.ylim((0, 1.1 * inertias.inertia.max()))
# A single unlabeled series needs no legend
ax.legend().set_visible(False)
plt.show()
