In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [14]:
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1,remove=('headers', 'footers', 'quotes'))
data = dataset.data

In [37]:
import nltk
import string
from nltk.corpus import stopwords

def textPrecessing(text):
    #小写化
    text = text.lower()
    #去除特殊标点
    for c in string.punctuation:
        text = text.replace(c, ' ')
    #分词
    wordLst = nltk.word_tokenize(text)
    #去除停用词
    filtered = [w for w in wordLst if w not in stopwords.words('english')]
    #仅保留名词或特定POS   
    refiltered =nltk.pos_tag(filtered)
    filtered = [w for w, pos in refiltered if pos.startswith('NN')]

    return " ".join(filtered)

In [38]:
#文本预处理
docLst = []
for desc in data :
    docLst.append(textPrecessing(desc).encode('utf-8'))

In [39]:
from sklearn.feature_extraction.text import CountVectorizer

#构建词频向量
vectorizer = CountVectorizer(max_features=2500,stop_words='english')
X = vectorizer.fit_transform(docLst)
print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 11314, n_features: 2500


In [40]:
from sklearn.cluster import KMeans

kmean = KMeans(n_clusters=5, max_iter=100)
kmean.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [41]:
order_centroids = kmean.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(n_clusters):
    print("Cluster %d: " % i)
    for ind in order_centroids[i, :10]:
        print(' %s  ' % terms[ind])
print(kmean.cluster_centers_.shape)

Cluster 0: 
 people  
 time  
 way  
 use  
 problem  
 edu  
 year  
 years  
 god  
 space  
Cluster 1: 
 ax  
 max  
 pl  
 bhj  
 giz  
 gk  
 qax  
 bj  
 lj  
 z5  
Cluster 2: 
 ax  
 max  
 b8f  
 a86  
 g9v  
 giz  
 bhj  
 pl  
 bxn  
 wm  
Cluster 3: 
 ax  
 max  
 sl  
 g9v  
 giz  
 ql  
 okz  
 chz  
 fyn  
 pl  
Cluster 4: 
 ax  
 max  
 b8f  
 bhj  
 bh  
 a86  
 g9v  
 air  
 ah  
 q45  
Cluster 5: 
 ax  
 max  
 pl  
 b8f  
 bhj  
 giz  
 a86  
 g9v  
 p2  
 r8f  
Cluster 6: 
 ax  
 max  
 g9v  
 a86  
 b8f  
 bhj  
 pl  
 mg9v  
 giz  
 gk  
Cluster 7: 
 ax  
 max  
 a86  
 b8f  
 pl  
 bhj  
 qax  
 qq  
 giz  
 i4  
Cluster 8: 
 ax  
 max  
 pl  
 b8f  
 g9v  
 wm  
 bxn  
 bhj  
 giz  
 qax  
Cluster 9: 
 cx  
 scx  
 sc  
 s6  
 ck  
 gc  
 gcx  
 hz  
 chz  
 lk  
Cluster 10: 
 ax  
 max  
 a86  
 b8f  
 pl  
 giz  
 ql  
 bhj  
 g9v  
 cx  
Cluster 11: 
 w7  
 cx  
 t7  
 a7  
 ck  
 w1  
 hz  
 lk  
 chz  
 mv  
Cluster 12: 
 ax  
 g9v  
 b8f  
 a86  
 max  
 p

In [42]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=20, max_iter=100)
lda.fit(X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=100,
                          mean_change_tol=0.001, n_components=20, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [48]:
def print_top_words(model,feature_names,n_top_words):
    # 打印每个主题下权重较高的term
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print("\n".join([feature_names[i] 
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))


n_top_words=10
tf_feature_names = vectorizer.get_feature_names()
print_top_words(lda,tf_feature_names,n_top_words)

Topic #0:
power
time
use
water
ground
way
wire
unit
work
line
Topic #1:
file
files
image
window
program
version
use
code
server
pub
Topic #2:
president
output
program
entry
file
mr
stephanopoulos
jobs
line
rules
Topic #3:
game
team
year
games
season
players
league
play
player
hockey
Topic #4:
people
time
way
day
things
thing
days
years
home
man
Topic #5:
evidence
point
question
time
fact
argument
example
people
book
case
Topic #6:
war
jews
armenians
people
government
israel
world
history
turks
population
Topic #7:
drive
disk
drives
controller
scsi
bus
card
bios
tape
data
Topic #8:
thanks
mail
price
sale
offer
list
hi
advance
post
information
Topic #9:
db
bike
vs
van
pts
mov
dod
la
cs
bh
Topic #10:
cx
chz
ah
w7
lk
hz
mv
scx
t7
ck
Topic #11:
gun
state
law
government
states
crime
rights
control
guns
police
Topic #12:
chip
encryption
government
keys
security
key
privacy
information
use
law
Topic #13:
software
windows
pc
card
problem
memory
apple
use
mac
bit
Topic #14:
god
jesus
people
life

In [49]:
help(KMeans)

Help on class KMeans in module sklearn.cluster._kmeans:

class KMeans(sklearn.base.TransformerMixin, sklearn.base.ClusterMixin, sklearn.base.BaseEstimator)
 |  KMeans(n_clusters=8, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm='auto')
 |  
 |  K-Means clustering.
 |  
 |  Read more in the :ref:`User Guide <k_means>`.
 |  
 |  Parameters
 |  ----------
 |  
 |  n_clusters : int, default=8
 |      The number of clusters to form as well as the number of
 |      centroids to generate.
 |  
 |  init : {'k-means++', 'random'} or ndarray of shape             (n_clusters, n_features), default='k-means++'
 |      Method for initialization, defaults to 'k-means++':
 |  
 |      'k-means++' : selects initial cluster centers for k-mean
 |      clustering in a smart way to speed up convergence. See section
 |      Notes in k_init for more details.
 |  
 |      'random': choose k observations (row

In [50]:
help(LatentDirichletAllocation)

Help on class LatentDirichletAllocation in module sklearn.decomposition._lda:

class LatentDirichletAllocation(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
 |  LatentDirichletAllocation(n_components=10, doc_topic_prior=None, topic_word_prior=None, learning_method='batch', learning_decay=0.7, learning_offset=10.0, max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1000000.0, perp_tol=0.1, mean_change_tol=0.001, max_doc_update_iter=100, n_jobs=None, verbose=0, random_state=None)
 |  
 |  Latent Dirichlet Allocation with online variational Bayes algorithm
 |  
 |  .. versionadded:: 0.17
 |  
 |  Read more in the :ref:`User Guide <LatentDirichletAllocation>`.
 |  
 |  Parameters
 |  ----------
 |  n_components : int, optional (default=10)
 |      Number of topics.
 |  
 |  doc_topic_prior : float, optional (default=None)
 |      Prior of document topic distribution `theta`. If the value is None,
 |      defaults to `1 / n_components`.
 |      In [1]_, this is calle