In [None]:
# !pip install scikit-learn

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import homogeneity_score, silhouette_score

# Single seed shared by every stochastic step so a re-run reproduces the same
# clusters, scores and plot.
SEED = 42

# Step 1: Load the dataset (all documents from the 'all' subset).
newsgroups = fetch_20newsgroups(subset='all')

# Every document is used. For a quicker run, slice `texts` and `true_labels`
# with the same indices so they stay aligned.
texts = newsgroups.data
true_labels = newsgroups.target

# Step 2: Preprocessing.
# No extra cleaning is applied: headers, footers and quoted replies stay in the
# text. To strip them, pass remove=('headers', 'footers', 'quotes') to
# fetch_20newsgroups above.

# Step 3: Feature extraction.
# Sparse TF-IDF matrix; English stop words and terms present in more than half
# of the documents are dropped.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5)
X = vectorizer.fit_transform(texts)

# Step 4: Dimensionality reduction (used only for the 2-D scatter plot).
# TruncatedSVD operates directly on the sparse matrix, so X never has to be
# densified with X.toarray() -- a dense documents x vocabulary array can exhaust
# memory on the full corpus. Unlike PCA, TruncatedSVD does not centre the data.
svd = TruncatedSVD(n_components=2, random_state=SEED)
X_reduced = svd.fit_transform(X)

# Step 5: Apply the clustering algorithm on the full TF-IDF features.
num_clusters = 20  # one cluster per newsgroup
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps results comparable across versions.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=SEED)
kmeans.fit(X)

# Step 6: Evaluate and visualize clusters.
labels = kmeans.labels_

# Homogeneity compares clusters against the true newsgroup labels.
homogeneity = homogeneity_score(true_labels, labels)
# The silhouette is estimated on a random sample of 1000 documents; the seed
# makes that sample (and therefore the reported score) repeatable.
silhouette = silhouette_score(X, labels, sample_size=1000, random_state=SEED)

print(f"Homogeneity Score: {homogeneity}")
print(f"Silhouette Score: {silhouette}")

# Visualization: documents in the 2-D SVD projection, coloured by cluster id.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=labels, cmap='rainbow', alpha=0.5)
ax.set_title('K-Means Clustering of 20 Newsgroups')
ax.set_xlabel('SVD Component 1')
ax.set_ylabel('SVD Component 2')
plt.show()

In [None]:
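# NOTE: disabled draft cell. X_train_tfidf and X_test_tfidf are not defined in
# the visible cells of this notebook, so a train/test TF-IDF split must be
# created before this code can be re-enabled.
# from sklearn.cluster import KMeans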
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt

# # Define the number of clusters
# num_clusters = 4

# # Apply K-Means clustering
# kmeans = KMeans(n_clusters=num_clusters, random_state=42)
# kmeans.fit(X_train_tfidf)

# # Predict cluster labels for the test data
# y_kmeans = kmeans.predict(X_test_tfidf)

# # Visualizing the clusters using PCA (for 2D plotting)
# pca = PCA(n_components=2, random_state=42)
# X_test_pca = pca.fit_transform(X_test_tfidf.toarray())

# # Plot the clusters
# plt.figure(figsize=(10, 8))
# plt.scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_kmeans, cmap='rainbow')
# plt.title('K-Means Clustering of 20 Newsgroups Dataset')
# plt.xlabel('PCA Component 1')
# plt.ylabel('PCA Component 2')
# plt.show()
