In [1]:
import torch
import torch.nn.functional as F
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Seed PyTorch's global RNG so torch-side randomness is repeatable across runs
torch.manual_seed(42)

# Draw 300 two-dimensional samples grouped into 3 blobs (labels are discarded)
# and convert them to a float32 tensor
X = torch.tensor(
    make_blobs(n_samples=300, centers=3, cluster_std=1.0, random_state=42)[0],
    dtype=torch.float32,
)

# Scatter-plot the raw samples: first column on the x-axis, second on the y-axis
plt.scatter(*X.T)
plt.title("Generated Data")
plt.show()


ModuleNotFoundError: No module named 'sklearn'

In [None]:
class GMM:
    """Gaussian mixture model with full covariances, fitted by EM.

    All parameters are plain tensors without autograd tracking: EM updates
    them in closed form, so no gradients are ever needed.

    Attributes:
        means: Tensor of shape (n_components, n_features).
        covariances: Tensor of shape (n_components, n_features, n_features).
        weights: Tensor of shape (n_components,) holding the mixing coefficients.
    """

    # log(2π): per-dimension constant term of the Gaussian log-density.
    _LOG_2PI = 1.8378770664093453

    def __init__(self, n_components, n_features, reg_covar=1e-6):
        """Initialise the mixture parameters.

        Args:
            n_components: Number of Gaussian components.
            n_features: Dimensionality of each sample.
            reg_covar: Non-negative value added to the diagonal of every
                covariance in the M-step so it stays positive definite.

        Raises:
            ValueError: If ``reg_covar`` is negative.
        """
        if reg_covar < 0:
            raise ValueError("reg_covar must be non-negative")

        self.n_components = n_components  # number of Gaussian components
        self.n_features = n_features      # feature dimension of each sample
        self.reg_covar = reg_covar        # diagonal jitter applied in the M-step

        # Random means, identity covariances, uniform mixing coefficients.
        # No requires_grad: tracking gradients would only grow an unused
        # autograd graph on every EM iteration.
        self.means = torch.randn(n_components, n_features)
        self.covariances = torch.eye(n_features).repeat(n_components, 1, 1)
        self.weights = torch.ones(n_components) / n_components

    def _log_gaussian(self, X, mean, cov):
        """Return the log-density of every row of X under N(mean, cov)."""
        # Match the parameters to X's dtype/device so mixed inputs do not fail.
        mean = mean.to(X)
        cov = cov.to(X)

        diff = X - mean
        # cov = L Lᵀ; the factor yields both Σ⁻¹·diff and log|Σ| without ever
        # forming an explicit inverse or determinant.
        chol = torch.linalg.cholesky(cov)
        solved = torch.cholesky_solve(diff.T, chol)          # Σ⁻¹ diffᵀ, (d, n)
        mahalanobis = (diff.T * solved).sum(dim=0)           # (x-μ)ᵀ Σ⁻¹ (x-μ)
        log_det = 2.0 * torch.log(torch.diagonal(chol)).sum()

        return -0.5 * (X.shape[1] * self._LOG_2PI + log_det + mahalanobis)

    def gaussian(self, X, mean, cov):
        """Evaluate the multivariate Gaussian density at every row of X.

        Args:
            X: Samples, shape (n_samples, n_features).
            mean: Component mean, shape (n_features,).
            cov: Positive-definite covariance, shape (n_features, n_features).

        Returns:
            Tensor of shape (n_samples,) with the density values.
        """
        return torch.exp(self._log_gaussian(X, mean, cov))

    def E_step(self, X):
        """E-step: posterior probability of each component for each sample.

        The computation is done in log space and normalised with a softmax,
        so samples far from every component still receive finite
        responsibilities instead of 0/0 = NaN.

        Returns:
            Tensor of shape (n_samples, n_components) whose rows sum to 1.
        """
        log_weighted = torch.stack(
            [
                torch.log(self.weights[k])
                + self._log_gaussian(X, self.means[k], self.covariances[k])
                for k in range(self.n_components)
            ],
            dim=1,
        )
        return torch.softmax(log_weighted, dim=1)

    def M_step(self, X, responsibilities):
        """M-step: re-estimate means, covariances and mixing coefficients.

        Args:
            X: Samples, shape (n_samples, n_features).
            responsibilities: Output of ``E_step``, shape
                (n_samples, n_components).
        """
        Nk = responsibilities.sum(dim=0)  # effective sample count per component
        # Guard against an empty component so the divisions below stay finite.
        safe_Nk = Nk.clamp_min(torch.finfo(Nk.dtype).eps)

        # Responsibility-weighted means
        self.means = (responsibilities.T @ X) / safe_Nk.unsqueeze(1)

        # Responsibility-weighted covariances, plus diagonal jitter so a
        # collapsed component does not produce a singular matrix.
        jitter = self.reg_covar * torch.eye(
            self.n_features, dtype=self.means.dtype, device=self.means.device
        )
        covariances = []
        for k in range(self.n_components):
            diff = X - self.means[k]
            cov = (responsibilities[:, k].unsqueeze(1) * diff).T @ diff / safe_Nk[k]
            covariances.append(cov + jitter)
        self.covariances = torch.stack(covariances)

        # Mixing coefficients
        self.weights = Nk / X.shape[0]

    def fit(self, X, n_iters=100):
        """Run ``n_iters`` EM iterations on X, printing the means every 10th one."""
        # EM is gradient-free; disable autograd even if X itself tracks gradients.
        with torch.no_grad():
            for i in range(n_iters):
                responsibilities = self.E_step(X)
                self.M_step(X, responsibilities)

                # Progress report every 10 iterations
                if (i + 1) % 10 == 0:
                    print(f"Iteration {i+1}: Means = {self.means.detach().numpy()}")

    def predict(self, X):
        """Return, for each sample, the index of its most probable component."""
        responsibilities = self.E_step(X)
        return torch.argmax(responsibilities, dim=1)


In [None]:
# Mixture configuration: number of Gaussian components and feature dimension
n_components = 3
n_features = 2

# Build the model, then run 100 EM iterations on the generated samples
gmm = GMM(n_components, n_features)
gmm.fit(X, n_iters=100)

# Assign every sample to its most probable component
labels = gmm.predict(X)

# Scatter-plot the samples, coloured by their assigned component
plt.scatter(*X.T, c=labels)
plt.title("GMM Clustering Result")
plt.show()
