In [4]:
import numpy as np
from collections import defaultdict
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups # importing dataset

### Evaluating Function

In [1]:
# evaluations = []
# evaluations_std = []


# def fit_and_evaluate(km, X, name=None, n_runs=5):
#     name = km.__class__.__name__ if name is None else name

#     train_times = []
#     scores = defaultdict(list)
#     for seed in range(n_runs):
#         km.set_params(random_state=seed)
#         t0 = time()
#         km.fit(X)
#         train_times.append(time() - t0)
#         scores["Homogeneity"].append(metrics.homogeneity_score(labels, km.labels_))
#         scores["Completeness"].append(metrics.completeness_score(labels, km.labels_))
#         scores["V-measure"].append(metrics.v_measure_score(labels, km.labels_))
#         scores["Adjusted Rand-Index"].append(
#             metrics.adjusted_rand_score(labels, km.labels_)
#         )
#         scores["Silhouette Coefficient"].append(
#             metrics.silhouette_score(X, km.labels_, sample_size=2000)
#         )
#     train_times = np.asarray(train_times)

#     print(f"clustering done in {train_times.mean():.2f} ± {train_times.std():.2f} s ")
#     evaluation = {
#         "estimator": name,
#         "train_time": train_times.mean(),
#     }
#     evaluation_std = {
#         "estimator": name,
#         "train_time": train_times.std(),
#     }
#     for score_name, score_values in scores.items():
#         mean_score, std_score = np.mean(score_values), np.std(score_values)
#         print(f"{score_name}: {mean_score:.3f} ± {std_score:.3f}")
#         evaluation[score_name] = mean_score
#         evaluation_std[score_name] = std_score
#     evaluations.append(evaluation)
#     evaluations_std.append(evaluation_std)

### Code

In [7]:
# Load the complete 20 newsgroups corpus (subset="all") with default settings so
# the full list of available categories can be inspected before filtering.
# NOTE: the name `dataset` is rebound to a filtered four-category corpus in a later cell.
dataset = fetch_20newsgroups(subset="all")

In [8]:
# Show every newsgroup (category) name available in the loaded corpus.
print(dataset.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [9]:
# Newsgroups (topics) kept for the clustering experiment.
categories = [
    "comp.os.ms-windows.misc",
    "comp.sys.mac.hardware",
    "talk.politics.guns",
    "sci.space",
]

# Reload only the selected topics. Headers, footers and quoted replies are
# stripped so that models cannot latch onto message metadata instead of the
# actual text. A fixed random_state keeps the shuffled order reproducible.
dataset = fetch_20newsgroups(
    subset="all",
    categories=categories,
    remove=("headers", "footers", "quotes"),
    shuffle=True,
    random_state=42,
)

# Ground-truth labels, the distinct label values and how many documents carry each.
labels = dataset.target
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = len(unique_labels)  # number of distinct categories actually present

# Report corpus size and number of categories.
print(f"{len(dataset.data)} documents - {true_k} categories")

3845 documents - 4 categories


In [11]:
# TF-IDF feature extraction with vocabulary pruning.
vectorizer = TfidfVectorizer(
    stop_words="english",  # drop scikit-learn's built-in list of common English stop words
    min_df=5,  # ignore terms that occur in fewer than 5 documents
    max_df=0.5,  # ignore terms that occur in more than 50% of the documents
)

# Time only the fit_transform step (vocabulary learning + matrix construction).
t0 = time()  # start timestamp
X_tfidf = vectorizer.fit_transform(dataset.data)

print(f"vectorization done in {time() - t0:.3f} s")
print(f"n_samples: {X_tfidf.shape[0]}, n_features: {X_tfidf.shape[1]}")

vectorization done in 0.381 s
n_samples: 3845, n_features: 8397


`TfidfVectorizer` uses an in-memory vocabulary to map the most frequent words to feature indices.

`n_features` is the number of unique terms left in the vocabulary after the `max_df`, `min_df` and `stop_words` parameters have pruned it.

In [12]:
# Density of the TF-IDF matrix: stored (non-zero) entries (`nnz`) divided by the
# total number of cells (rows * columns), printed to three decimal places.
print(f"{X_tfidf.nnz / np.prod(X_tfidf.shape):.3f}")

0.006


The above quantifies the sparsity of the `X_tfidf` matrix as the number of non-zero entries divided by the total number of elements: only about 0.6% of the entries are non-zero.