1.导入数据集

In [1]:
from NLP import comment_analysis as ca
from gensim import corpora, models


# Load the pre-processed news collection; the helper returns a mapping that
# exposes at least the "doc_bodies" and "tags" entries read below.
doc = ca.prepare_news_to_corpora("news540")
tags = doc["tags"]
doc_tokens = doc["doc_bodies"]

# Token <-> integer id mapping built over every document.
dictionary = corpora.Dictionary(doc_tokens)

# Bag-of-words representation of each document, in the original order.
corpus = list(map(dictionary.doc2bow, doc_tokens))

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.315 seconds.
DEBUG:jieba:Loading model cost 0.315 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


2.导入需要的模型库

In [2]:
from time import time
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
#from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn import metrics

3.定义函数

In [4]:
def dist(x,y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    Both arguments must support element-wise subtraction and multiplication
    (e.g. numpy arrays of the same shape).
    """
    diff = x - y
    squared_total = np.sum(diff * diff)
    return np.sqrt(squared_total)

def lda_corpus(corpus, n_topics =50):
    """Train an LDA model on the TF-IDF weighted corpus and apply it.

    The trained model is saved to ``result/lda_tmp`` as a side effect.

    Note: relies on the module-level ``dictionary`` for ``id2word``.

    Parameters
    ----------
    corpus : bag-of-words corpus (as produced by ``dictionary.doc2bow``).
    n_topics : number of LDA topics to fit.

    Returns
    -------
    The result of ``lda[tfidf_corpus]``, i.e. the corpus transformed by the
    trained LDA model.
    """
    tfidf = models.TfidfModel(corpus)
    tfidf_corpus = tfidf[corpus]
    lda = models.LdaMulticore(tfidf_corpus, num_topics=n_topics, passes=50, iterations=50, id2word=dictionary)
    # Use a forward slash so the model lands inside the ``result`` directory;
    # a backslash is just a filename character on POSIX systems and is
    # inconsistent with the "result/..." path the loader in this notebook uses.
    lda.save("result/lda_tmp")
    return lda[tfidf_corpus]

def load_lda_corpus(corpus, lda_corpus_doc, n_topics =50):
    """Apply a previously saved LDA model to the TF-IDF weighted corpus.

    Parameters
    ----------
    corpus : bag-of-words corpus; a TF-IDF model is fitted on it and then
        applied to it.
    lda_corpus_doc : path of the saved LDA model to load.
    n_topics : not used by this function; kept for signature compatibility.

    Returns
    -------
    The TF-IDF corpus transformed by the loaded LDA model.
    """
    weighted_corpus = models.TfidfModel(corpus)[corpus]
    trained_lda = models.LdaMulticore.load(lda_corpus_doc)
    return trained_lda[weighted_corpus]

def lda2list(corpus):
    """Return the scaled LDA representation of ``corpus``.

    The documents are transformed with the saved model
    ``result/lda_model_caicai_tfidf_50`` (call ``lda_corpus`` instead of
    ``load_lda_corpus`` here to retrain from scratch), materialised into a
    list, converted with ``ca.list2SP`` and finally passed through
    ``sklearn.preprocessing.scale`` without centering.
    """
    topic_docs = load_lda_corpus(corpus, "result/lda_model_caicai_tfidf_50")
    # Materialise the transformed corpus so the helper receives a plain list.
    per_doc_topics = list(topic_docs)
    # presumably a scipy sparse matrix, since callers use .toarray() -- TODO confirm
    matrix = ca.list2SP(per_doc_topics)
    # with_mean=False: only scale, do not centre the data.
    return scale(matrix, with_mean=False)

In [5]:
def kmeans_predict(corpus, n_clusters =10, n_pca =5, random_state=None):
    """Cluster the documents of ``corpus`` with PCA followed by KMeans.

    Parameters
    ----------
    corpus : bag-of-words corpus, forwarded to ``lda2list``.
    n_clusters : number of KMeans clusters.
    n_pca : number of principal components kept before clustering.
    random_state : optional seed forwarded to ``KMeans``. The default
        (``None``) keeps the previous unseeded behaviour, where two calls may
        label the clusters differently; pass an int for repeatable results.

    Returns
    -------
    list
        ``[clus_pred, centroids]``: the cluster label of every document and
        the cluster centres in the PCA-reduced space.
    """
    data = lda2list(corpus)
    # PCA needs a dense array, hence the .toarray() conversion.
    reduced_data = PCA(n_components=n_pca).fit_transform(data.toarray())
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10,
                    random_state=random_state)
    kmeans.fit(reduced_data)
    clus_pred = kmeans.predict(reduced_data)
    centroids = kmeans.cluster_centers_
    return [clus_pred, centroids]

def pred2dict(corpus):
    """Group document positions by their predicted cluster label.

    Runs ``kmeans_predict`` on ``corpus`` and returns a dict mapping each
    cluster label to the list of document indices assigned to it, in
    ascending index order.
    """
    labels = kmeans_predict(corpus)[0]
    members_by_cluster = {}
    for position, label in enumerate(labels):
        # Create the bucket on first sight of a label, then record the index.
        members_by_cluster.setdefault(label, []).append(position)
    return members_by_cluster


4.结果输出

In [13]:
def select_closest_corpus(corpus, n_pca =5, n_clusters =10):
    """Find, for every cluster, the document closest to its centroid.

    The reduction and the clustering are performed exactly once here, so the
    centroids, the cluster memberships and the reduced coordinates all come
    from the same fit. Taking centroids and memberships from two separate
    unseeded KMeans runs is unsafe: cluster ``i`` of one run does not have to
    correspond to cluster ``i`` of the other.

    Parameters
    ----------
    corpus : bag-of-words corpus, forwarded to ``lda2list``.
    n_pca : number of principal components used for both the clustering and
        the distance computation.
    n_clusters : number of KMeans clusters (default 10).

    Returns
    -------
    list
        One ``[cluster_index, closest_document_index, member_indices]`` entry
        per cluster. ``closest_document_index`` is ``None`` for a cluster
        that received no documents.
    """
    data = lda2list(corpus)
    reduced_data = PCA(n_components=n_pca).fit_transform(data.toarray())

    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    kmeans.fit(reduced_data)
    clus_pred = kmeans.predict(reduced_data)
    centroids = kmeans.cluster_centers_

    # Document indices per cluster label, built from the same prediction that
    # produced the centroids.
    k_s = {}
    for index, label in enumerate(clus_pred):
        k_s.setdefault(int(label), []).append(index)

    result = []
    for i, cent in enumerate(centroids):
        ll = k_s.get(i, [])
        if not ll:
            # No member to choose from; report the empty cluster explicitly.
            result.append([i, None, ll])
            continue
        dis = [dist(cent, reduced_data[index]) for index in ll]
        # argmin returns the first minimum, i.e. the lowest-index document
        # among equally close ones.
        index_min = ll[int(np.argmin(dis))]
        result.append([i, index_min, ll])

    return result
    
# Parenthesised form: prints the same text under Python 2 and is also valid
# Python 3 syntax (the bare print statement is Python 2 only).
print(select_closest_corpus(corpus))

[[0, 445, [1, 9, 11, 12, 14, 22, 29, 32, 36, 52, 58, 68, 69, 71, 72, 81, 101, 102, 105, 128, 132, 136, 140, 162, 173, 185, 186, 187, 193, 197, 199, 200, 208, 216, 223, 224, 225, 229, 233, 234, 240, 242, 245, 250, 263, 264, 266, 269, 270, 271, 272, 274, 275, 277, 278, 279, 281, 284, 286, 289, 294, 298, 305, 312, 314, 315, 316, 318, 326, 334, 338, 344, 346, 347, 352, 364, 369, 371, 372, 376, 382, 386, 389, 391, 404, 405, 407, 411, 419, 420, 424, 426, 428, 430, 433, 437, 442, 445, 447, 450, 453, 459, 467, 469, 478, 479, 480, 488, 490, 496, 502, 503, 509, 514, 516, 517, 521, 523, 526, 532, 534, 535, 539]], [1, 356, [16, 44, 50, 59, 67, 70, 80, 91, 93, 152, 195, 198, 214, 273, 325, 330, 332, 335, 339, 345, 354, 356, 368, 392, 406, 409, 508, 527]], [2, 65, [0, 5, 6, 10, 33, 39, 53, 55, 56, 57, 61, 62, 65, 73, 100, 108, 113, 116, 119, 120, 123, 124, 125, 126, 127, 135, 141, 144, 147, 168, 172, 178, 181, 184, 227, 237, 243, 248, 251, 260, 268, 300, 302, 327, 328, 337, 357, 358, 359, 370, 380, 

In [10]:
def write_kmeans_result_doc(corpus,result_doc, news_doc="news540"):
    """Write every news line together with its cluster label to a file.

    Parameters
    ----------
    corpus : bag-of-words corpus, forwarded to ``pred2dict``; document ``i``
        of the corpus is taken to be line ``i`` of ``news_doc``.
    result_doc : path of the output file (overwritten).
    news_doc : path of the raw news file, one document per line. Defaults to
        ``"news540"``, the file this function previously always read.

    Output format
    -------------
    For each cluster, one ``<news line>||<cluster>`` record per member,
    followed by two separator lines.
    NOTE(review): the news lines keep their trailing newline, so ``||<cluster>``
    starts on the line after the text -- confirm whether that is intended.
    """
    with open(news_doc) as f:
        news = f.readlines()

    # Cluster before opening the output, so a clustering failure does not
    # leave a truncated result file behind.
    k_s = pred2dict(corpus)

    # ``with`` guarantees the file is flushed and closed even on error.
    with open(result_doc, "w") as g:
        for k in k_s:
            for ll in k_s[k]:
                g.write("%s||%s\n" % (news[ll], k))

            g.write("----End of the %d cluster---------------------------------------------\n" % k)
            # This separator has no placeholder, so it is written verbatim;
            # applying ``% k`` to it raises TypeError for a plain int key.
            g.write("----------------------------------------------------------------------\n")
    
# Forward slash on purpose: in a normal string literal a backslash followed by
# "t" is a TAB character, so the backslash spelling did not point into result/.
write_kmeans_result_doc(corpus,"result/test")