# 第一题：使用sklearn的GaussianMixture完成聚类

实验内容：
1. 使用sklearn的GaussianMixture在Breast_Cancer_Wisconsin数据集上完成聚类任务
2. 对聚类结果可视化
3. 对比外部指标FMI和NMI
4. 选取数据集部分特征进行GaussianMixture聚类，然后对聚类结果进行可视化，并与全量特征的聚类结果进行对比分析

# 1. 导入模块

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence
# warnings from the libraries used below -- consider narrowing it by category.
warnings.filterwarnings('ignore')

# 2. 导入数据集

In [None]:
# Load the Breast Cancer Wisconsin table into a DataFrame.
# Forward slashes are used on purpose: the previous backslash path relied on
# the invalid escape sequences "\B" and "\d" (a Deprecation/SyntaxWarning in
# a non-raw string) and only resolved on Windows; "/" works on every platform.
data = pd.read_csv('data/Breast_Cancer_Wisconsin/data')

In [None]:
# Switch from the DataFrame to its underlying NumPy array so plain positional
# slicing can be used below.
data = data.values
# Column 1 holds the true labels (kept 2-D as an (n, 1) column); the features
# are columns 2 up to, but not including, the last one.
# NOTE(review): presumably column 0 is a sample id and the last column is an
# empty trailing column -- confirm against the CSV.
label_cols, feature_cols = slice(1, 2), slice(2, -1)
data_y, data_x = data[:, label_cols], data[:, feature_cols]

In [None]:
# Sanity check: show the (rows, columns) shape of the feature matrix.
print(data_x.shape)

# 3. 导入模型

导入高斯混合模型

In [None]:
from sklearn.mixture import GaussianMixture

# 4. 训练模型

In [None]:
# Fit a two-component Gaussian mixture on the full feature matrix.
# random_state pins the random initialisation so that re-running the notebook
# yields the same cluster assignment (and therefore the same plots/metrics).
gmm = GaussianMixture(n_components=2, random_state=0)
gmm.fit(data_x)

高斯混合模型需要使用predict函数预测类标记

In [None]:
# Ask the fitted mixture for a component (cluster) label for every sample.
y_hat = gmm.predict(data_x)

聚类结果统计

In [None]:
def getResult(data_y,y_hat):
    """Count, for every cluster, how many samples of each true class it holds.

    Parameters
    ----------
    data_y : array-like
        True class labels. Flattened to 1-D before use, so an ``(n, 1)``
        column vector is accepted as well as a flat sequence.
    y_hat : sequence
        Predicted cluster label of every sample, in the same order as
        ``data_y``.

    Returns
    -------
    dict
        ``{cluster_label: {true_label: count}}`` with the outer keys sorted
        in ascending order. Every cluster lists every true label (count 0
        when the class is absent); inner keys follow the order in which the
        labels first appear in ``data_y``.

    Raises
    ------
    ValueError
        If ``data_y`` and ``y_hat`` do not contain the same number of samples.
    """
    true_labels = np.asarray(data_y).reshape(-1)
    if len(true_labels) != len(y_hat):
        raise ValueError(
            "data_y and y_hat must have the same length, got "
            f"{len(true_labels)} and {len(y_hat)}"
        )

    # dict.fromkeys keeps first-appearance order, which makes the inner key
    # order (and hence bar/legend order downstream) reproducible; iterating a
    # set of strings is subject to hash randomisation.
    label_order = list(dict.fromkeys(true_labels))

    cluster = {}
    for cluster_label, true_label in zip(y_hat, true_labels):
        if cluster_label not in cluster:
            # Start every cluster with a zero count for each known class.
            cluster[cluster_label] = dict.fromkeys(label_order, 0)
        cluster[cluster_label][true_label] += 1

    # Order the clusters by their label.
    return dict(sorted(cluster.items(), key=lambda item: item[0]))
# Tally the true labels inside each predicted cluster and print the nested
# {cluster: {true_label: count}} dictionary.
cluster = getResult(data_y,y_hat)
print(cluster)

# 5. 聚类结果可视化

In [None]:
def draw_bar(cluster_data):
    """Draw a grouped bar chart of the true-class counts inside each cluster.

    Parameters
    ----------
    cluster_data : dict
        Mapping ``{cluster_label: {true_label: count}}``. One group of bars
        is drawn per cluster and one bar per true label inside each group.

    Returns
    -------
    None
        The chart is drawn on the current matplotlib axes. Nothing is drawn
        when there are no clusters or no class labels.
    """
    cluster_labels = list(cluster_data)
    # Class labels in first-seen order across all clusters.
    true_labels = list(dict.fromkeys(
        true_label
        for counts in cluster_data.values()
        for true_label in counts
    ))
    if not true_labels:
        # Nothing to plot (previously this path failed on an unbound loop
        # variable when computing the tick positions).
        return

    # One series per true label: its count in every cluster, 0 if missing.
    y_data = {
        true_label: [cluster_data[c].get(true_label, 0) for c in cluster_labels]
        for true_label in true_labels
    }

    n_series = len(y_data)
    # Keep the 0.35 width for one or two classes, but shrink it for more so a
    # group never grows wider than 0.8 and spills into the next cluster.
    bar_width = min(0.35, 0.8 / n_series)
    positions = np.arange(len(cluster_labels))
    for idx, (key, counts) in enumerate(y_data.items()):
        plt.bar(positions + idx * bar_width, counts, label = key, width = bar_width)

    labels = ["cluster "+str(l) for l in cluster_labels]
    # Centre each tick under its group of bars, whatever the number of classes
    # (the former fixed offset was only roughly centred for exactly two).
    plt.xticks(positions + (n_series - 1) * bar_width / 2, labels)
    plt.title("Cluster result")
    plt.legend()
# Plot the per-cluster class counts computed above as a grouped bar chart.
draw_bar(cluster)

从可视化效果可以看出，高斯混合聚类在该数据集上表现得比较好

# 6. 指标计算

我们这里选用两个外部指标，FMI和NMI。

In [None]:
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import fowlkes_mallows_score

In [None]:
# Normalised mutual information between the flattened true labels and the
# cluster labels predicted by the full-feature model.
normalized_mutual_info_score(
    labels_true=data_y.ravel(),
    labels_pred=gmm.predict(data_x),
)

In [None]:
# Fowlkes-Mallows index between the flattened true labels and the cluster
# labels predicted by the full-feature model.
fowlkes_mallows_score(
    labels_true=data_y.ravel(),
    labels_pred=gmm.predict(data_x),
)

# 7. 选取数据集部分特征进行GaussianMixture聚类，并对比聚类结果

In [None]:
# Re-read the raw table: `data` was rebound to a NumPy array earlier, and the
# column selection that follows uses DataFrame indexing (.iloc).
# Forward slashes are used on purpose: the previous backslash path relied on
# the invalid escape sequences "\B" and "\d" (a Deprecation/SyntaxWarning in
# a non-raw string) and only resolved on Windows; "/" works on every platform.
data = pd.read_csv('data/Breast_Cancer_Wisconsin/data')

In [None]:
# YOUR CODE HERE
# Select a subset of the feature columns and build the reduced dataset
selected_features = data.iloc[:, 2:12] # keep 10 feature columns (positions 2-11)

In [None]:
# YOUR CODE HERE
# Define the model, train it, and predict the cluster labels
# random_state pins the random initialisation so that re-running the notebook
# yields the same cluster assignment (and therefore the same plots/metrics).
gmm_selected = GaussianMixture(n_components=2, random_state=0)
gmm_selected.fit(selected_features)
y_hat_selected = gmm_selected.predict(selected_features)

In [None]:
# YOUR CODE HERE
# Visualise the clustering result obtained with the reduced feature set
# NOTE(review): data_y is the label column extracted from the first load of
# the file; this assumes the re-read table has the same rows in the same order.
cluster_selected = getResult(data_y, y_hat_selected)
print(cluster_selected)
draw_bar(cluster_selected)

In [None]:
# YOUR CODE HERE
# Compute the external evaluation metrics NMI and FMI for the model trained
# on the reduced feature set
print(
    "NMI for selected features: ",
    normalized_mutual_info_score(
        labels_true=data_y.ravel(),
        labels_pred=gmm_selected.predict(selected_features),
    ),
)
print(
    "FMI for selected features: ",
    fowlkes_mallows_score(
        labels_true=data_y.ravel(),
        labels_pred=gmm_selected.predict(selected_features),
    ),
)