<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [17]:
# 导入库
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import jieba.posseg as pseg


def jieba_cut(comment):
    """Tokenize a comment and keep only the adjective-tagged words.

    Args:
        comment: Text handed to ``pseg.cut`` for segmentation with
            part-of-speech tagging.

    Returns:
        list: The ``word`` of every token whose ``flag`` is ``'a'``,
        ``'ag'`` or ``'an'``, in the order the tokens were produced.
    """
    adjective_flags = ('a', 'ag', 'an')
    adjectives = []
    for token in pseg.cut(comment):
        # Only tokens carrying one of the adjective flags are kept
        if token.flag in adjective_flags:
            adjectives.append(token.word)
    return adjectives


# Load the data file: every line of the text file becomes one comment
with open('./别让自己“墙”了自己.txt', encoding='utf-8') as fn:
    comment_list = list(fn)  # one list entry per line of the file

# Punctuation and symbols handed to the vectorizer as stop words
stop_words = [
    '…', '。', '，', '？', '！', '+', ' ',
    '、', '：', '；', '（', '）', '.', '-',
]

# TF-IDF model that tokenizes each comment with jieba_cut
vectorizer = TfidfVectorizer(
    tokenizer=jieba_cut,
    stop_words=stop_words,
    use_idf=True,
)

# Fit on the comments and convert them into a TF-IDF matrix
X = vectorizer.fit_transform(comment_list)

# K-means clustering
# random_state pins the centroid initialisation so the (arbitrary) cluster ids
# stay the same from run to run; without it, an analysis that selects a
# specific label value is not reproducible. n_init=10 is spelled out because
# the library default for it differs between scikit-learn releases.
model_kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)  # build the clustering model
model_kmeans.fit(X)  # train the model on the TF-IDF matrix

# Collect the clustering results
cluster_labels = model_kmeans.labels_  # cluster label assigned to each row of X
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that lack it.
if hasattr(vectorizer, 'get_feature_names_out'):
    # The new API returns an ndarray; convert it so .append() below still works
    word_vectors = list(vectorizer.get_feature_names_out())
else:
    word_vectors = list(vectorizer.get_feature_names())
word_values = X.toarray()  # dense TF-IDF values
# Stack the TF-IDF values and the labels (as one extra column) into a new matrix
comment_matrix = np.hstack((word_values,
                            cluster_labels.reshape(-1, 1)))
word_vectors.append('cluster_labels')  # column name for the appended label column
comment_pd = pd.DataFrame(comment_matrix,
                          columns=word_vectors)  # data frame holding the TF-IDF values and the cluster label

# Analyse the clustering result
# Keep the rows whose cluster label equals 2, then drop the label column
is_cluster_2 = comment_pd['cluster_labels'] == 2
comment_cluster1 = comment_pd.loc[is_cluster_2].drop(columns='cluster_labels')
word_importance = comment_cluster1.sum(axis=0)  # per-word total over the selected rows

# Sort the totals in descending order and print the first five words
print(word_importance.sort_values(ascending=False).head(5))

大     1.668889
严重    0.466825
微软    0.423979
聪明    0.393579
高     0.000000
dtype: float64
