# In [None]:
import numpy as np
from sklearn.cluster import KMeans

class CluStream:
    """Simplified CluStream-style online clusterer.

    The model keeps a fixed set of micro-clusters, each a dict with keys
    ``'center'`` (running mean), ``'count'`` (decayed weight) and
    ``'points'`` (every point assigned so far).  Each incoming point is
    absorbed by its nearest micro-cluster; the micro-cluster centres are
    then grouped into ``k`` global clusters with K-means.

    Args:
        k: Number of global clusters produced by K-means.
        num_micro_clusters: Number of micro-clusters seeded from the first
            batch passed to :meth:`update`.
        decay_factor: Multiplier applied to every micro-cluster count
            before each point is processed (presumably in ``(0, 1]`` so
            old points lose weight -- not validated here).
    """

    def __init__(self, k, num_micro_clusters, decay_factor):
        self.k = k  # number of global clusters
        self.num_micro_clusters = num_micro_clusters  # number of micro-clusters
        self.decay_factor = decay_factor  # per-point multiplier for counts

        # Micro-clusters are created lazily from the first batch of data;
        # the global clustering stays None until K-means has been run.
        self.micro_clusters = []
        self.global_cluster = None

    def initialize_micro_clusters(self, data):
        """Seed the micro-clusters from randomly chosen rows of *data*.

        Args:
            data: Array-like of points; one point per row.

        Raises:
            ValueError: If *data* has fewer points than
                ``num_micro_clusters`` (seeds are drawn without
                replacement).
        """
        data = np.asarray(data)
        if len(data) < self.num_micro_clusters:
            raise ValueError(
                f"need at least {self.num_micro_clusters} data points to "
                f"initialize the micro-clusters, got {len(data)}"
            )

        indices = np.random.choice(len(data), self.num_micro_clusters, replace=False)
        # Copy each seed so later changes to the caller's array cannot
        # silently move a centre.
        self.micro_clusters = [
            {'center': np.array(data[i]), 'count': 0, 'points': []}
            for i in indices
        ]

    def decay_micro_clusters(self):
        """Multiply every micro-cluster count by ``decay_factor``."""
        for micro_cluster in self.micro_clusters:
            micro_cluster['count'] *= self.decay_factor

    def update_micro_clusters(self, data_point):
        """Absorb *data_point* into its nearest micro-cluster.

        The nearest micro-cluster (Euclidean distance to its centre) has
        its centre moved to the count-weighted mean of the old centre and
        the new point, its count incremented, and the point recorded.

        Raises:
            ValueError: If no micro-clusters have been initialized.
        """
        if not self.micro_clusters:
            raise ValueError("micro-clusters have not been initialized")

        # Accept plain sequences as well as numpy arrays.
        data_point = np.asarray(data_point)

        distances = [
            np.linalg.norm(data_point - micro_cluster['center'])
            for micro_cluster in self.micro_clusters
        ]
        nearest_cluster = self.micro_clusters[np.argmin(distances)]

        # Weighted running mean: the (decayed) count is the old centre's weight.
        count = nearest_cluster['count']
        nearest_cluster['center'] = (count * nearest_cluster['center'] + data_point) / (count + 1)
        nearest_cluster['count'] = count + 1
        # NOTE(review): this list grows without bound over a long stream and
        # is never read inside this class; kept because callers may rely on
        # it -- consider bounding or dropping it.
        nearest_cluster['points'].append(data_point)

    def merge_micro_clusters(self):
        """Group the micro-cluster centres into ``k`` global clusters.

        Does nothing when there are no more micro-clusters than ``k``; in
        that case ``global_cluster`` keeps its previous value.
        """
        if len(self.micro_clusters) <= self.k:
            return

        data = np.array([micro_cluster['center'] for micro_cluster in self.micro_clusters])
        kmeans = KMeans(n_clusters=self.k)
        kmeans.fit(data)
        self.global_cluster = kmeans.cluster_centers_

    def update(self, data):
        """Feed a batch of points into the model.

        The first non-empty batch also seeds the micro-clusters.  Each
        point first decays all counts and is then absorbed by its nearest
        micro-cluster.  The global clustering is refreshed once, after the
        whole batch has been processed.

        Args:
            data: Array-like of points; one point per row.  An empty batch
                is a no-op.

        Raises:
            ValueError: If the micro-clusters still need seeding and the
                batch has fewer points than ``num_micro_clusters``.
        """
        data = np.asarray(data)
        if len(data) == 0:
            return

        if not self.micro_clusters:
            self.initialize_micro_clusters(data)

        for data_point in data:
            self.decay_micro_clusters()
            self.update_micro_clusters(data_point)

        # Only the final K-means result is observable to callers, so fit
        # once per batch instead of once per point.
        self.merge_micro_clusters()

    def get_clusters(self):
        """Return the current cluster centres.

        Returns:
            The K-means centres (``cluster_centers_`` array) once a global
            clustering exists; otherwise a list of the micro-cluster
            centres.
        """
        if self.global_cluster is not None:
            return self.global_cluster
        return [micro_cluster['center'] for micro_cluster in self.micro_clusters]
