In [57]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from time import time
from sklearn.datasets import load_files

# Read the corpus from disk and report its size and the wall-clock load time.
print("loading documents ...")
t = time()
docs = load_files('datasets/clustering/data')
n_docs, n_categories = len(docs.data), len(docs.target_names)
print("summary: {0} documents in {1} categories.".format(n_docs, n_categories))
elapsed = time() - t
print("done in {0} seconds".format(elapsed))

loading documents ...
summary: 3949 documents in 4 categories.
done in 0.0692269802094 seconds


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Upper bound on the vocabulary size kept by the vectorizer.
max_features = 20000
print("vectorizing documents ...")
t = time()
# max_df=0.4 ignores terms found in more than 40% of the documents and
# min_df=2 ignores terms found in fewer than two documents; the raw files
# are decoded as latin-1.
vectorizer = TfidfVectorizer(max_df=0.4,
                             min_df=2,
                             max_features=max_features,
                             encoding='latin-1')
# docs.data is already an iterable of raw documents, so it is passed as-is
# instead of being wrapped in a pass-through generator.
X = vectorizer.fit_transform(docs.data)
n_samples, n_features = X.shape
print("n_samples: {0}, n_features: {1}".format(n_samples, n_features))
# X[0] is the sparse TF-IDF row of the first document; getnnz() counts the
# distinct vocabulary terms it contains.
print("number of non-zero features in sample [{0}]: {1}".format(
    docs.filenames[0], X[0].getnnz()))
print("done in {0} seconds".format(time() - t))

vectorizing documents ...
n_samples: 3949, n_features: 20000
number of non-zero features in sample [datasets/clustering/data/sci.electronics/11902-54322]: 56
done in 1.22917580605 seconds


In [9]:
from sklearn.cluster import KMeans

print("clustering documents ...")
t = time()
# Number of clusters to look for.
n_clusters = 4
# random_state pins the centroid initialisation so that cluster ids, top
# terms and every metric computed from this model are reproducible after a
# kernel restart. n_init=3 runs three initialisations and keeps the one
# with the lowest inertia; verbose=1 logs the inertia of every iteration.
kmean = KMeans(n_clusters=n_clusters,
               max_iter=100,
               tol=0.01,
               verbose=1,
               n_init=3,
               random_state=42)
kmean.fit(X)
# inertia_ is the sum of squared distances of samples to their closest
# centroid; it is reported here as the clustering cost.
print("kmean: k={}, cost={}".format(n_clusters, int(kmean.inertia_)))
print("done in {0} seconds".format(time() - t))

clustering documents ...
Initialization complete
Iteration  0, inertia 7548.338
Iteration  1, inertia 3845.294
Iteration  2, inertia 3835.689
Iteration  3, inertia 3832.981
Iteration  4, inertia 3831.389
Iteration  5, inertia 3830.634
Iteration  6, inertia 3829.925
Iteration  7, inertia 3828.363
Iteration  8, inertia 3825.850
Iteration  9, inertia 3823.059
Iteration 10, inertia 3821.834
Iteration 11, inertia 3821.499
Iteration 12, inertia 3821.272
Iteration 13, inertia 3821.124
Iteration 14, inertia 3821.076
Iteration 15, inertia 3821.041
Iteration 16, inertia 3821.025
Iteration 17, inertia 3821.018
Iteration 18, inertia 3821.016
Converged at iteration 18
Initialization complete
Iteration  0, inertia 7484.695
Iteration  1, inertia 3842.812
Iteration  2, inertia 3834.243
Iteration  3, inertia 3832.220
Iteration  4, inertia 3831.228
Iteration  5, inertia 3830.460
Iteration  6, inertia 3829.684
Iteration  7, inertia 3829.119
Iteration  8, inertia 3828.739
Iteration  9, inertia 3828.421
It

In [15]:
# Sanity check: the fitted model holds one cluster label per input document.
kmean.labels_.shape[0]

3949

In [50]:
# Cluster ids assigned to documents 1000 through 1009.
kmean.labels_[1000:][:10]

array([1, 1, 1, 0, 3, 0, 3, 1, 0, 0], dtype=int32)

In [51]:
# File paths of documents 1000 through 1009; the parent directory of each
# path names the category the document was loaded from.
docs.filenames[1000:][:10]

array(['datasets/clustering/data/sci.crypt/10888-15289',
       'datasets/clustering/data/sci.crypt/11490-15880',
       'datasets/clustering/data/sci.crypt/11270-15346',
       'datasets/clustering/data/sci.electronics/12383-53525',
       'datasets/clustering/data/sci.space/13826-60862',
       'datasets/clustering/data/sci.electronics/11631-54106',
       'datasets/clustering/data/sci.space/14235-61437',
       'datasets/clustering/data/sci.crypt/11508-15928',
       'datasets/clustering/data/sci.space/13593-60824',
       'datasets/clustering/data/sci.electronics/12304-52801'], 
      dtype='|S52')

In [52]:
from __future__ import print_function

print("Top terms per cluster:")

# For every centroid, the vocabulary column indices ordered from the
# heaviest to the lightest weight.
order_centroids = kmean.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides so
# the cell runs on both old and new releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# One line per cluster listing its ten highest-weighted terms.
for i in range(n_clusters):
    top_terms = ''.join(' %s' % terms[ind] for ind in order_centroids[i, :10])
    print("Cluster %d:%s" % (i, top_terms))

Top terms per cluster:
Cluster 0: my any me by know your some do so has
Cluster 1: key clipper chip encryption government keys will escrow we by
Cluster 2: pat digex access hst prb net mission shuttle steve servicing
Cluster 3: space henry nasa toronto moon zoo spencer launch gov alaska


In [53]:
# Toy example of the trick used for the top terms: argsort each row in
# ascending order, then reverse the columns to get descending order.
a = np.array([
    [20, 10, 30, 40],
    [100, 300, 200, 400],
    [1, 5, 3, 2],
])
np.argsort(a, axis=1)[:, ::-1]

array([[3, 2, 0, 1],
       [3, 1, 2, 0],
       [1, 2, 3, 0]])

In [54]:
# Same idea in one dimension: indices of the values from largest to smallest.
a = np.array([10, 30, 20, 40])
np.argsort(a)[::-1]

array([3, 1, 2, 0])

In [86]:
from sklearn import metrics

# Baseline: two independent random labelings of six points drawn from {1, 2, 3}.
label_true = np.random.randint(1, 4, 6)
label_pred = np.random.randint(1, 4, 6)
random_ari = metrics.adjusted_rand_score(label_true, label_pred)
print("Adjusted Rand-Index for random sample: %.3f" % random_ari)

# Same grouping of points, only the label ids are permuted.
label_true = [1, 1, 3, 3, 2, 2]
label_pred = [3, 3, 2, 2, 1, 1]
same_structure_ari = metrics.adjusted_rand_score(label_true, label_pred)
print("Adjusted Rand-Index for same structure sample: %.3f" % same_structure_ari)

Adjusted Rand-Index for random sample: -0.023
Adjusted Rand-Index for same structure sample: 1.000


In [109]:
from sklearn import metrics

# Hand-built cases: (message template, true classes, predicted clusters).
homogeneity_cases = [
    ("Homogeneity score for same structure sample: %.3f",
     [1, 1, 2, 2], [2, 2, 1, 1]),
    ("Homogeneity score for each cluster come from only one class: %.3f",
     [1, 1, 2, 2], [0, 1, 2, 3]),
    ("Homogeneity score for each cluster come from two class: %.3f",
     [1, 1, 2, 2], [1, 2, 1, 2]),
]
for message, label_true, label_pred in homogeneity_cases:
    print(message % metrics.homogeneity_score(label_true, label_pred))

# Baseline: two independent random labelings of six points drawn from {1, 2, 3}.
label_true = np.random.randint(1, 4, 6)
label_pred = np.random.randint(1, 4, 6)
random_homogeneity = metrics.homogeneity_score(label_true, label_pred)
print("Homogeneity score for random sample: %.3f" % random_homogeneity)

Homogeneity score for same structure sample: 1.000
Homogeneity score for each cluster come from only one class: 1.000
Homogeneity score for each cluster come from two class: 0.000
Homogeneity score for random sample: 0.633


In [116]:
from sklearn import metrics

# Hand-built cases: (message template, true classes, predicted clusters).
completeness_cases = [
    ("Completeness score for same structure sample: %.3f",
     [1, 1, 2, 2], [2, 2, 1, 1]),
    ("Completeness score for each class assign to only one cluster: %.3f",
     [0, 1, 2, 3], [1, 1, 2, 2]),
    ("Completeness score for each class assign to two class: %.3f",
     [1, 1, 2, 2], [1, 2, 1, 2]),
]
for message, label_true, label_pred in completeness_cases:
    print(message % metrics.completeness_score(label_true, label_pred))

# Baseline: two independent random labelings of six points drawn from {1, 2, 3}.
label_true = np.random.randint(1, 4, 6)
label_pred = np.random.randint(1, 4, 6)
random_completeness = metrics.completeness_score(label_true, label_pred)
print("Completeness score for random sample: %.3f" % random_completeness)

Completeness score for same structure sample: 1.000
Completeness score for each class assign to only one cluster: 1.000
Completeness score for each class assign to two class: 0.000
Completeness score for random sample: 0.159


In [119]:
from sklearn import metrics

# Identical partitions whose label ids are merely permuted.
label_true = [1, 1, 2, 2]
label_pred = [2, 2, 1, 1]
same_structure_v = metrics.v_measure_score(label_true, label_pred)
print("V-measure score for same structure sample: %.3f" % same_structure_v)

# Four singleton classes merged pairwise into two clusters, scored once in
# the given argument order and once with the two labelings swapped.
label_true = [0, 1, 2, 3]
label_pred = [1, 1, 2, 2]
forward_v = metrics.v_measure_score(label_true, label_pred)
print("V-measure score for each class assign to only one cluster: %.3f" % forward_v)
swapped_v = metrics.v_measure_score(label_pred, label_true)
print("V-measure score for each class assign to only one cluster: %.3f" % swapped_v)

# Every class is split evenly across both clusters.
label_true = [1, 1, 2, 2]
label_pred = [1, 2, 1, 2]
split_v = metrics.v_measure_score(label_true, label_pred)
print("V-measure score for each class assign to two class: %.3f" % split_v)

V-measure score for same structure sample: 1.000
V-measure score for each class assign to only one cluster: 0.667
V-measure score for each class assign to only one cluster: 0.667
V-measure score for each class assign to two class: 0.000


In [121]:
from sklearn import metrics

# Ground-truth category index of every document, used as the reference
# labeling for the supervised clustering metrics below.
labels = docs.target
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, kmean.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, kmean.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, kmean.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, kmean.labels_))
# The silhouette coefficient is estimated on a random subsample of 1000
# documents; random_state fixes that subsample so the reported value is
# the same on every run.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, kmean.labels_, sample_size=1000,
                                 random_state=42))

Homogeneity: 0.351
Completeness: 0.505
V-measure: 0.414
Adjusted Rand-Index: 0.228
Silhouette Coefficient: 0.004
