In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [3]:
def calculate_wss_bss(csv_file_path, feature_columns=None, label_column='Cluster'):
    """Compute the within- and between-cluster sums of squares of a clustered CSV.

    The feature columns are standardized with ``StandardScaler`` before any
    distance is measured, so both scores are expressed in standardized units.

    Parameters
    ----------
    csv_file_path : str or path-like
        CSV file holding the feature columns and a cluster-label column.
    feature_columns : sequence of str, optional
        Names of the numeric columns to use as features. Defaults to the four
        columns ``sepallength``, ``sepalwidth``, ``petallength``, ``petalwidth``.
    label_column : str, optional
        Name of the column holding each row's cluster label.

    Returns
    -------
    tuple
        ``(wss, bss)``: the sum of squared distances of every point to its
        cluster centroid, and the size-weighted sum of squared distances of
        every cluster centroid to the overall centroid.

    Raises
    ------
    KeyError
        If any requested feature column or the label column is absent.
    """
    if feature_columns is None:
        feature_columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
    feature_columns = list(feature_columns)

    df = pd.read_csv(csv_file_path)

    # Fail early with a message naming the file and every absent column.
    missing = [name for name in feature_columns + [label_column] if name not in df.columns]
    if missing:
        raise KeyError(f"{csv_file_path}: missing column(s) {missing}")

    # Keep only the feature columns; any other column is ignored.
    features = df[feature_columns]

    # Standardize the features.
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    # Map each row's cluster label to an integer index 0..k-1.
    unique_labels, label_indices = np.unique(df[label_column].to_numpy(), return_inverse=True)

    # Centroid of the whole (standardized) data set.
    overall_centroid = features_scaled.mean(axis=0)

    wss = 0
    bss = 0
    # One pass per cluster: the row mask is built once and reused for the
    # centroid, the WSS contribution and the BSS contribution.
    for cluster_index in range(len(unique_labels)):
        cluster_points = features_scaled[label_indices == cluster_index]
        centroid = cluster_points.mean(axis=0)
        wss += ((cluster_points - centroid) ** 2).sum()
        bss += len(cluster_points) * np.linalg.norm(centroid - overall_centroid) ** 2

    return wss, bss

In [4]:
# Clustering result files to compare, one CSV per algorithm.
csv_files = [
    './csv_result-simpleKM.csv',
    './csv_result-hierarclust.csv',
    './csv_result-EM.csv'
]

# Evaluate WSS and BSS for every file, in the order listed above.
results = []
for csv_path in csv_files:
    results.append(calculate_wss_bss(csv_path))

# Print the scores for comparison, numbering the files from 1.
for file_number, (wss, bss) in enumerate(results, start=1):
    print(f"File {file_number}: WSS = {wss}, BSS = {bss}")

File 1: WSS = 223.73200573676345, BSS = 376.2679942632363
File 2: WSS = 223.73200573676345, BSS = 376.2679942632363
File 3: WSS = 139.52116322464994, BSS = 460.4788367753499
