In [22]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import numpy as np

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

In [23]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Uncomment the following to do the analysis on all the categories
# categories = None

print("Loading 20 newsgroups dataset for categories:")
categories

Loading 20 newsgroups dataset for categories:


['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

In [24]:
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

labels = dataset.target
true_k = np.unique(labels).shape[0]

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))

3387 documents
4 categories


In [25]:
t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                                 min_df=2, stop_words='english',
                                 use_idf=True)
X = vectorizer.fit_transform(dataset.data)
print(X)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)

  (0, 2839)	0.04500071507017083
  (0, 1414)	0.07540974760886272
  (0, 8358)	0.04976131971209157
  (0, 1108)	0.0841691125267977
  (0, 4745)	0.0841691125267977
  (0, 2670)	0.08189317880141146
  (0, 2244)	0.07995160596271345
  (0, 6946)	0.05452908892155513
  (0, 9698)	0.06016650695202247
  (0, 3540)	0.08913144120477544
  (0, 2324)	0.05804798267620172
  (0, 9211)	0.043848362254850295
  (0, 3401)	0.054009661453737326
  (0, 3001)	0.028323774603401228
  (0, 6174)	0.06071840775718862
  (0, 2190)	0.0903937008980028
  (0, 8443)	0.07047414689713123
  (0, 6960)	0.060439582663268904
  (0, 6175)	0.06805953438858334
  (0, 2829)	0.08913144120477544
  (0, 7709)	0.0805677032230983
  (0, 1084)	0.05302819749691229
  (0, 1760)	0.05851683527589977
  (0, 9881)	0.03326725261069873
  (0, 7923)	0.04852708902991879
  :	:
  (3386, 6705)	0.04636574358198852
  (3386, 208)	0.073055490407179
  (3386, 8947)	0.058413220065501995
  (3386, 6133)	0.05509573409226583
  (3386, 5417)	0.12261715332194234
  (3386, 4683)	0.1508

In [26]:
print("Performing dimensionality reduction using LSA")
t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(10)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

print()

Performing dimensionality reduction using LSA
done in 0.116989s
Explained variance of the SVD step: 5%



In [29]:
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=False)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

print("Top terms per cluster:")

if True:
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
else:
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()


Clustering sparse data with KMeans(max_iter=100, n_clusters=4, n_init=1, verbose=False)
done in 0.016s

Homogeneity: 0.608
Completeness: 0.618
V-measure: 0.613
Adjusted Rand-Index: 0.632
Silhouette Coefficient: 0.313

Top terms per cluster:
Cluster 0: graphics image com file files university ac thanks images 3d
Cluster 1: god jesus people com don bible believe just think say
Cluster 2: space nasa gov alaska access shuttle com digex henry just
Cluster 3: com sandvik keith morality sgi objective caltech livesey kent moral
