In [4]:
import numpy as np

class KMeansScratch:
    """
    Minimal K-means (Lloyd's algorithm) clustering implemented with NumPy.

    After :meth:`fit`, ``centroids`` holds the (k, n) cluster centres and
    ``labels_`` holds the cluster index assigned to every training sample.
    """

    def __init__(self, k=3, max_iters=100, tol=1e-4, random_state=None):
        """
        :param k: number of clusters
        :param max_iters: maximum number of assign/update iterations
        :param tol: convergence threshold on the centroid shift
        :param random_state: seed for centroid initialisation (reproducibility)
        """
        self.k = k
        self.max_iters = max_iters
        self.tol = tol
        self.random_state = random_state

        # Attributes populated by fit()
        self.centroids = None  # cluster centres, shape (k, n)
        self.labels_ = None    # cluster index of each training sample, shape (m,)

    @staticmethod
    def _as_samples(X):
        """
        Convert X to a 2-D float array so lists and integer arrays are accepted.

        :param X: array-like of shape (m, n)
        :return: float ndarray of shape (m, n)
        :raises ValueError: if X is not 2-dimensional
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-D with shape (m, n); got %d dimension(s)" % X.ndim
            )
        return X

    def fit(self, X):
        """
        Train the K-means model.

        :param X: input data, shape (m, n) - m samples, n features
        :return: self, so calls can be chained
        :raises ValueError: if X is not 2-D or has fewer than k samples
        """
        X = self._as_samples(X)

        # 1) Initialise centroids. A private RandomState yields the same draws
        #    as np.random.seed(seed) followed by np.random.choice, but leaves
        #    the process-wide NumPy RNG untouched.
        rng = np.random.RandomState(self.random_state)
        self._init_centroids(X, rng)

        for _ in range(self.max_iters):
            # 2) Assign every sample to its nearest centroid
            labels = self._assign_clusters(X)

            # 3) Recompute the centroids from the assignment
            new_centroids = self._update_centroids(X, labels)

            # Frobenius norm of the centroid displacement
            shift = np.linalg.norm(self.centroids - new_centroids)

            # Store the update before testing convergence so the freshest
            # centroids are never discarded on the final iteration.
            self.centroids = new_centroids
            if shift < self.tol:
                break

        # Final labels, consistent with the stored centroids
        self.labels_ = self._assign_clusters(X)
        return self

    def _init_centroids(self, X, rng=None):
        """
        Pick k distinct samples at random as the initial centroids.

        :param X: data, shape (m, n)
        :param rng: optional ``np.random.RandomState``; when omitted the global
                    ``np.random`` module is used, as in the original signature
        """
        source = np.random if rng is None else rng
        m = X.shape[0]
        # Sample k row indices without replacement
        indices = source.choice(m, self.k, replace=False)
        self.centroids = X[indices]  # shape (k, n); fancy indexing copies

    def _assign_clusters(self, X):
        """
        Return, for every sample, the index of its nearest centroid.

        :param X: data, shape (m, n)
        :return: integer label array, shape (m,)
        """
        # distances has shape (m, k): rows are samples, columns are centroids
        distances = self._compute_distances(X, self.centroids)
        return np.argmin(distances, axis=1)

    def _update_centroids(self, X, labels):
        """
        Recompute each centroid as the per-feature mean of its members.

        :param X: data, shape (m, n)
        :param labels: current cluster index of every sample, shape (m,)
        :return: new centroids, shape (k, n)
        """
        new_centroids = []
        for cluster_id in range(self.k):
            cluster_points = X[labels == cluster_id]
            if len(cluster_points) == 0:
                # Empty cluster: keep its previous centroid unchanged instead
                # of taking the mean of nothing (which would produce NaN).
                new_centroid = self.centroids[cluster_id]
            else:
                new_centroid = cluster_points.mean(axis=0)
            new_centroids.append(new_centroid)
        return np.array(new_centroids)

    def _compute_distances(self, X, centroids):
        """
        Compute the Euclidean distance from every sample to every centroid.

        :param X: shape (m, n)
        :param centroids: shape (k, n)
        :return: distance matrix, shape (m, k)
        """
        m = X.shape[0]
        k = centroids.shape[0]

        # distances[i, j] = ||X[i] - centroids[j]||
        # Looping over the k centroids keeps the temporaries at (m, n) rather
        # than materialising a full (m, k, n) broadcast.
        distances = np.zeros((m, k))
        for j, centroid in enumerate(centroids):
            diff = X - centroid  # (m, n)
            distances[:, j] = np.sqrt(np.sum(diff**2, axis=1))
        return distances

    def predict(self, X):
        """
        Assign new samples to the cluster of their nearest centroid.

        :param X: data, shape (m, n)
        :return: integer label array, shape (m,)
        :raises AttributeError: if called before fit() (same exception type the
                                unfitted model raised before, with a clear message)
        :raises ValueError: if X is not 2-D
        """
        if self.centroids is None:
            raise AttributeError(
                "KMeansScratch has no centroids yet; call fit() before predict()"
            )
        return self._assign_clusters(self._as_samples(X))


In [5]:
if __name__ == "__main__":
    # Toy data set: ten 2-D points forming a low and a high group.
    X = np.array([
        [1.0, 2.0], [1.5, 1.8], [5.0, 8.0], [8.0, 8.0], [1.0, 0.6],
        [9.0, 11.0], [8.0, 9.0], [0.0, 3.0], [5.0, 4.0], [6.0, 4.0],
    ])

    # Two unseen points to classify once the model is trained.
    new_points = np.array([[0.5, 2.0], [7.0, 8.5]])

    # Fit a two-cluster model with a fixed seed so the run is repeatable.
    kmeans = KMeansScratch(k=2, max_iters=10, tol=1e-4, random_state=42)
    kmeans.fit(X)

    # Report what the model learned from the training data.
    print("Final centroids:\n", kmeans.centroids)
    print("Labels:", kmeans.labels_)

    # Report the cluster membership of the unseen points.
    predictions = kmeans.predict(new_points)
    print("New points:", new_points)
    print("Predictions:", predictions)


Final centroids:
 [[6.83333333 7.33333333]
 [0.875      1.85      ]]
Labels: [1 1 0 0 1 0 0 1 0 0]
New points: [[0.5 2. ]
 [7.  8.5]]
Predictions: [1 0]
