In [86]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [81]:

corpus = [
    'Pumas are large, cat-like animals which are found in America. When reports came into London Zoo that a wild puma had been spotted forty-five miles south of London, they were not taken seriously.',
    'However, as the evidence began to accumulate, experts from the Zoo felt obliged to investigate, for the descriptions given by people who claimed to have seen the puma were extraordinarily similar.',
    'The hunt for the puma began in a small village where a woman picking blackberries saw a large cat only five yards away from her. It immediately ran away when she saw it, and experts confirmed that a puma will not attack a human being unless it is cornered.',
    'The search proved difficult, for the puma was often observed at one place in the morning and at another place twenty miles away in the evening. Wherever it went, it left behind it a trail of dead deer and small animals like rabbits.',
    'Paw prints were seen in a number of places and puma fur was found clinging to bushes. Several people complained of "cat-like noises at night and a businessman on a fishing trip saw the puma up a tree.',
    'The experts were now fully convinced that the animal was a puma, but where had it come from? As no pumas had been reported missing from any zoo in the country, this one must have been in the possession of a private collector and somehow managed to escape.',
    'The hunt went on for several weeks, but the puma was not caught. It is disturbing to think that a dangerous wild animal is still at large in the quiet countryside.'
]

In [82]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
tfidf_vec = TfidfVectorizer() 
tfidf_matrix = tfidf_vec.fit_transform(corpus)

In [83]:
print(tfidf_vec.get_feature_names())
print(tfidf_vec.vocabulary_)

['accumulate', 'america', 'and', 'animal', 'animals', 'another', 'any', 'are', 'as', 'at', 'attack', 'away', 'been', 'began', 'behind', 'being', 'blackberries', 'bushes', 'businessman', 'but', 'by', 'came', 'cat', 'caught', 'claimed', 'clinging', 'collector', 'come', 'complained', 'confirmed', 'convinced', 'cornered', 'country', 'countryside', 'dangerous', 'dead', 'deer', 'descriptions', 'difficult', 'disturbing', 'escape', 'evening', 'evidence', 'experts', 'extraordinarily', 'felt', 'fishing', 'five', 'for', 'forty', 'found', 'from', 'fully', 'fur', 'given', 'had', 'have', 'her', 'however', 'human', 'hunt', 'immediately', 'in', 'into', 'investigate', 'is', 'it', 'large', 'left', 'like', 'london', 'managed', 'miles', 'missing', 'morning', 'must', 'night', 'no', 'noises', 'not', 'now', 'number', 'obliged', 'observed', 'of', 'often', 'on', 'one', 'only', 'paw', 'people', 'picking', 'place', 'places', 'possession', 'prints', 'private', 'proved', 'puma', 'pumas', 'quiet', 'rabbits', 'ran',

In [90]:
print(tfidf_matrix.toarray())

[[0.         0.18487403 0.         ... 0.         0.         0.13117365]
 [0.19643497 0.         0.         ... 0.         0.         0.13937649]
 [0.         0.         0.1023333  ... 0.16612026 0.16612026 0.        ]
 ...
 [0.         0.         0.24175029 ... 0.         0.         0.        ]
 [0.         0.         0.10032698 ... 0.         0.         0.11555641]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [91]:
corpus[1]

'However, as the evidence began to accumulate, experts from the Zoo felt obliged to investigate, for the descriptions given by people who claimed to have seen the puma were extraordinarily similar.'

In [92]:
from textrank4zh import TextRank4Keyword, TextRank4Sentence 
text = corpus[1]
tr4w = TextRank4Keyword()
tr4w.analyze(text=text, lower=True, window=2)
print( '关键词：' )
for item in tr4w.get_keywords(10, word_min_len=1):
    print("{} 出现的频率为:{:.6f}".format(item.word, item.weight)) 

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\DELL\AppData\Local\Temp\jieba.cache
Loading model cost 0.679 seconds.
Prefix dict has been built successfully.


关键词：
began 出现的频率为:0.082775
extraordinarily 出现的频率为:0.082775
accumulate 出现的频率为:0.077769
puma 出现的频率为:0.077769
experts 出现的频率为:0.074998
claimed 出现的频率为:0.074998
zoo 出现的频率为:0.073488
people 出现的频率为:0.073488
felt 出现的频率为:0.072704
descriptions 出现的频率为:0.072704


In [112]:
from sklearn.cluster import KMeans

kmean = KMeans(n_clusters=2, max_iter=100)
kmean.fit(tfidf_matrix)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [113]:
order_centroids = kmean.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(2):
    print("Cluster %d: " % i)
    for ind in order_centroids[i, :5]:
        print(' %s  ' % terms[ind])
print(kmean.cluster_centers_.shape)

Cluster 0: 
 amp  
 ac  
 apollo  
 aid  
 activity  
Cluster 1: 
 aspects  
 america  
 assistance  
 amendment  
 alternatives  
(2, 145)


In [110]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=2, max_iter=100)
lda.fit(X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=100,
                          mean_change_tol=0.001, n_components=2, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [111]:
def print_top_words(model,feature_names,n_top_words):
    # 打印每个主题下权重较高的term
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print("\n".join([feature_names[i] 
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))


n_top_words=5
tf_feature_names = vectorizer.get_feature_names()
print_top_words(lda,tf_feature_names,n_top_words)

Topic #0:
ac
amp
assistance
aspects
author
Topic #1:
aspects
america
abortion
alternatives
arabs
