# 9.6 文本聚类实战

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
book_data = pd.read_csv('data/data.csv') #读取文件
print(book_data.head())
book_titles = book_data['title'].tolist()
book_content = book_data['content'].tolist()
print('书名:', book_titles[0])
print('内容:', book_content[0][:10])
from normalization import normalize_corpus
# normalize corpus
norm_book_content = normalize_corpus(book_content)

    title         tag                                              info  \
0  ﻿解忧杂货店  豆瓣图书标签: 小说         [日] 东野圭吾 / 李盈春 / 南海出版公司 / 2014-5 / 39.50元   
1   巨人的陨落  豆瓣图书标签: 小说  [英] 肯·福莱特 / 于大卫 / 江苏凤凰文艺出版社 / 2016-5-1 / 129.80元   
2   我的前半生  豆瓣图书标签: 小说                     亦舒 / 新世界出版社 / 2007-8 / 22.00元   
3    百年孤独  豆瓣图书标签: 小说   [哥伦比亚] 加西亚·马尔克斯 / 范晔 / 南海出版公司 / 2011-6 / 39.50元   
4   追风筝的人  豆瓣图书标签: 小说     [美] 卡勒德·胡赛尼 / 李继宏 / 上海人民出版社 / 2006-5 / 29.00元   

      comments                                            content  
0  (225675人评价)  现代人内心流失的东西，这家杂货店能帮你找回——\r\n僻静的街道旁有一家杂货店，只要写下烦恼...  
1   (22536人评价)  在第一次世界大战的硝烟中，每一个迈向死亡的生命都在热烈地生长——威尔士的矿工少年、刚失恋的美...  
2   (20641人评价)  一个三十几岁的美丽女人子君，在家做全职家庭主妇。却被一个平凡女人夺走丈夫，一段婚姻的失败，让...  
3  (111883人评价)  《百年孤独》是魔幻现实主义文学的代表作，描写了布恩迪亚家族七代人的传奇故事，以及加勒比海沿岸...  
4  (278905人评价)  12岁的阿富汗富家少爷阿米尔与仆人哈桑情同手足。然而，在一场风筝比赛后，发生了一件悲惨不堪的...  
书名: ﻿解忧杂货店
内容: 现代人内心流失的东西


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.521 seconds.
Prefix dict has been built succesfully.


#### 特征提取（TFIDF）

In [5]:
def build_feature_matrix(documents, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    feature_type = feature_type.lower().strip()

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer()
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix


In [6]:
vectorizer, feature_matrix = build_feature_matrix(norm_book_content,
                                                  feature_type='tfidf',
                                                  min_df=0.2, max_df=0.90,
                                                  ngram_range=(1, 2))

In [7]:
# 查看特征数量
print(feature_matrix.shape)

(5645, 21038)


In [7]:
# 获取特征名字
feature_names = vectorizer.get_feature_names()
# 打印某些特征
print(feature_names[:10])

['000', '001', '002', '004', '01', '025', '027', '037', '07', '08']


## K means  聚类

- 此处设置 k = 10

In [8]:
from sklearn.cluster import KMeans
def k_means(feature_matrix, num_clusters=10):
    km = KMeans(n_clusters=num_clusters,
                max_iter=10000)
    km.fit(feature_matrix)
    clusters = km.labels_
    return km, clusters


num_clusters = 10
km_obj, clusters = k_means(feature_matrix=feature_matrix,
                           num_clusters=num_clusters)
book_data['Cluster'] = clusters

#### 打印每个cluster的书籍

In [9]:
from collections import Counter
c = Counter(clusters)
print(c.items())


dict_items([(6, 347), (1, 865), (0, 486), (5, 177), (4, 2715), (3, 229), (2, 480), (9, 115), (7, 51), (8, 180)])


In [10]:
def get_cluster_data(clustering_obj, book_data,
                     feature_names, num_clusters,
                     topn_features=10):
    cluster_details = {}
    # 获取cluster的center
    ordered_centroids = clustering_obj.cluster_centers_.argsort()[:, ::-1]
    # 获取每个cluster的关键特征
    # 获取每个cluster的书
    for cluster_num in range(num_clusters):
        cluster_details[cluster_num] = {}
        cluster_details[cluster_num]['cluster_num'] = cluster_num
        key_features = [feature_names[index]
                        for index
                        in ordered_centroids[cluster_num, :topn_features]]
        cluster_details[cluster_num]['key_features'] = key_features

        books = book_data[book_data['Cluster'] == cluster_num]['title'].values.tolist()
        cluster_details[cluster_num]['books'] = books

    return cluster_details


def print_cluster_data(cluster_data):
    # print cluster details
    for cluster_num, cluster_details in cluster_data.items():
        print('Cluster {} details:'.format(cluster_num))
        print('-' * 20)
        print('Key features:', cluster_details['key_features'])
        print('book in this cluster:')
        print(', '.join(cluster_details['books']))
        print('=' * 40)

## 绘图
def plot_clusters(num_clusters, feature_matrix,
                  cluster_data, book_data,
                  plot_size=(16, 8)):
    # generate random color for clusters
    def generate_random_color():
        color = '#%06x' % random.randint(0, 0xFFFFFF)
        return color

    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed",
              random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]
    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data[0:500].items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(cluster_details['key_features'][:5]).strip()
    # map each unique cluster label with its coordinates and books
    cluster_plot_frame = pd.DataFrame({'x': x_pos,
                                       'y': y_pos,
                                       'label': book_data['Cluster'].values.tolist(),
                                       'title': book_data['title'].values.tolist()
                                       })
    grouped_plot_frame = cluster_plot_frame.groupby('label')
    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)
    # plot each cluster using co-ordinates and book titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        marker = markers[cluster_num] if cluster_num < len(markers) \
            else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'],
                marker=marker, linestyle='', ms=12,
                label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')
        ax.set_aspect('auto')
        ax.tick_params(axis='x', which='both', bottom='off', top='off',
                       labelbottom='off')
        ax.tick_params(axis='y', which='both', left='off', top='off',
                       labelleft='off')
    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)
    # add labels as the film titles
    for index in range(len(cluster_plot_frame)):
        ax.text(cluster_plot_frame.ix[index]['x'],
                cluster_plot_frame.ix[index]['y'],
                cluster_plot_frame.ix[index]['title'], size=8)
        # show the plot
    plt.show()


In [11]:
cluster_data = get_cluster_data(clustering_obj=km_obj,
                                book_data=book_data,
                                feature_names=feature_names,
                                num_clusters=num_clusters,
                                topn_features=5)

print_cluster_data(cluster_data)

Cluster 0 details:
--------------------
Key features: ['没有', '女人', '一个', '为什么', '开始']
book in this cluster:
我的前半生, 新名字的故事, 月亮和六便士, 杀死一只知更鸟, 斯通纳, 新名字的故事, 月亮和六便士, 杀死一只知更鸟, 斯通纳, 我的天才女友, 月亮和六便士, 杀死一只知更鸟, 江城, 爱你就像爱生命, 杀死一只知更鸟, 白夜行, 强风吹拂, 咖啡未冷前, 国境以南 太阳以西, 二十首情诗与绝望的歌, 爱德华的奇妙之旅, 爱你就像爱生命, 我的精神家园, 似水流年, 我的精神家园, 爱德华的奇妙之旅, 《噼里啪啦系列》, 月亮和六便士, 没有一条道路是重复的, 我没有自己的名字, 红玫瑰与白玫瑰, 一个陌生女人的来信, 鲁迅杂文全集, 一个陌生女人的来信, 恐惧, 茨威格小说集, 一个陌生女人的来信, 慢, 庆祝无意义, 身份, ﻿白夜行, 嫌疑人X的献身, 半落, 放学后, 1995-2005夏至未至, 坏一坏, 致我们终将逝去的青春, 微微一笑很倾城, 白夜行, 嫌疑人X的献身, 放学后, 祈祷落幕时, ﻿三体, 银河系漫游指南, 神们自己, 球状闪电, 那个不为人知的故事, 微微一笑很倾城, 他知道风从哪个方向来, 曾有一个人，爱我如生命, 海棠依舊: 知否? 知否? 應是綠肥紅瘦 卷一, ﻿白夜行, 嫌疑人X的献身, ﻿散步去, 他的国, 又一春, 她比烟花寂寞, 开到荼蘼, 人淡如菊, 朝花夕拾, 乐未央, 不羁的风, 阿修罗, ﻿白夜行, 嫌疑人X的献身, 半落, 放学后, 一朵桔梗花, 我的宝贝, 倾城, 明朝那些事儿（壹）, 那个不为人知的故事, 致我们终将逝去的青春, 海棠依舊: 知否? 知否? 應是綠肥紅瘦 卷一, ﻿1995-2005夏至未至, 夏至未至, 海棠依舊: 知否? 知否? 應是綠肥紅瘦 卷一, 步步惊心, 凤囚凰（上中）, 绾青丝Ⅰ, 渴望死亡的小丑, 尼罗河上的惨案, 罗杰疑案, 阿加莎·克里斯蒂侦探推理“波洛”系列（全32册）, 阳光下的罪恶, 无尽长夜, 我的心中每天开出一朵花, 微笑的鱼, ﻿三体, 银河系漫游指南, 神们自己, 球状闪电, 仿生人会梦见电子羊吗？, 我偏爱那些不切实际的浪漫, 愿你迷路到我

In [12]:
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity
import random
from matplotlib.font_manager import FontProperties


In [13]:
plot_clusters(num_clusters=num_clusters,
              feature_matrix=feature_matrix,
              cluster_data=cluster_data,
              book_data=book_data,
              plot_size=(16, 8))

KeyboardInterrupt: 

In [None]:
## 这些不知道是啥

In [None]:
from sklearn.cluster import AffinityPropagation
#相似性传播

def affinity_propagation(feature_matrix):
    sim = feature_matrix * feature_matrix.T
    sim = sim.todense()
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_
    return ap, clusters


# get clusters using affinity propagation
ap_obj, clusters = affinity_propagation(feature_matrix=feature_matrix)
book_data['Cluster'] = clusters

# get the total number of books per cluster
c = Counter(clusters)
print(c.items())

# get total clusters
total_clusters = len(c)
print('Total Clusters:', total_clusters)

cluster_data = get_cluster_data(clustering_obj=ap_obj,
                                book_data=book_data,
                                feature_names=feature_names,
                                num_clusters=total_clusters,
                                topn_features=5)

print_cluster_data(cluster_data)

plot_clusters(num_clusters=num_clusters,
              feature_matrix=feature_matrix,
              cluster_data=cluster_data,
              book_data=book_data,
              plot_size=(16, 8))

from scipy.cluster.hierarchy import ward, dendrogram


def ward_hierarchical_clustering(feature_matrix):
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    linkage_matrix = ward(cosine_distance)
    return linkage_matrix


def plot_hierarchical_clusters(linkage_matrix, book_data, figure_size=(8, 12)):
    # set size
    fig, ax = plt.subplots(figsize=figure_size)
    book_titles = book_data['title'].values.tolist()
    # plot dendrogram
    ax = dendrogram(linkage_matrix, orientation="left", labels=book_titles)
    plt.tick_params(axis='x',
                    which='both',
                    bottom='off',
                    top='off',
                    labelbottom='off')
    plt.tight_layout()
    plt.savefig('ward_hierachical_clusters.png', dpi=200)


# build ward's linkage matrix
linkage_matrix = ward_hierarchical_clustering(feature_matrix)
# plot the dendrogram
plot_hierarchical_clusters(linkage_matrix=linkage_matrix,
                           book_data=book_data,
                           figure_size=(8, 10))
