In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt

# Corpus: three newsgroups, fetched with headers, footers and quoted replies
# stripped so that only the message bodies are vectorized.
categories = ['sci.space', 'rec.autos', 'comp.graphics']
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Step 1: TF-IDF document-term matrix. English stop words are dropped, and
# max_df=0.5 discards terms that occur in more than half of the documents.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5)
X = vectorizer.fit_transform(newsgroups.data)

# Step 2: K-Means with three clusters, matching the number of categories
# loaded above. fit() returns the estimator itself, so it can be chained.
true_k = 3
kmeans = KMeans(n_clusters=true_k, random_state=42).fit(X)

# Step 3: for every centroid, list the ten highest-weighted vocabulary terms.
print("Top terms per cluster:")
# argsort is ascending, so each row is reversed to put the largest weights first.
order_centroids = kmeans.cluster_centers_.argsort(axis=1)[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i, ranked in enumerate(order_centroids):
    top_terms = terms[ranked[:10]].tolist()
    print(f"Cluster {i}: {', '.join(top_terms)}")

# Optional: assign an unseen document to a cluster. It is projected with the
# already-fitted vectorizer (transform, not fit_transform) so that it shares
# the vocabulary and IDF weights the model was trained on.
new_doc = ["The new graphics card improves rendering speed significantly."]
new_vec = vectorizer.transform(new_doc)
predicted = kmeans.predict(new_vec)
print(f"\nNew document assigned to cluster: {predicted[0]}")


Top terms per cluster:
Cluster 0: thanks, graphics, image, know, file, does, files, program, format, like
Cluster 1: car, cars, just, like, engine, good, think, don, new, know
Cluster 2: space, nasa, launch, shuttle, orbit, moon, just, think, people, earth

New document assigned to cluster: 0
