In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [2]:
train_dir = './20news-bydate/20news-bydate-train'
test_dir = './20news-bydate/20news-bydate-test'

In [3]:
# read files and return data frames
def read_files(directory):
    data = []
    target = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            with open(os.path.join(root, file), 'r', encoding='latin1') as f:
                text = f.read()
                data.append(text)
                target.append(os.path.basename(root))
    return pd.DataFrame({'text': data, 'target': target})

In [4]:
train_data = read_files(train_dir)
test_data = read_files(test_dir)

In [5]:
# combine training and testing data
data = pd.concat([train_data, test_data])

In [6]:
# vectorize text data
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(data['text'])

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
num_clusters = 20
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)

In [None]:
# add cluster labels to the dataframe
data['cluster'] = kmeans.labels_

In [None]:
plt.figure(figsize=(10, 7))
plt.hist(data['cluster'], bins=num_clusters, edgecolor='k')
plt.title('Distribution of Clusters')
plt.xlabel('Cluster')
plt.ylabel('Number of Documents')
plt.show()

In [None]:
from sklearn.decomposition import PCA

# reduce the dimensionality of the data using PCA
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(X.toarray())

In [None]:
plt.figure(figsize=(10, 7))
scatter = plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=data['cluster'], cmap='viridis', alpha=0.5)
plt.colorbar(scatter)
plt.title('K-means Clustering of 20news-bydate Data')
plt.xlabel('Component 1') # axis along which the data are the most spread out
plt.ylabel('Component 2') # capturing the second most variance in the data
plt.show()

In [None]:
# top terms in each clusters
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

In [None]:
# plot top terms for each cluster
for i in range(num_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    top_weights = kmeans.cluster_centers_[i, order_centroids[i, :10]]

    plt.figure(figsize=(10, 5))
    plt.barh(top_terms, top_weights, color='b', align='center')
    plt.xlabel('Weight')
    plt.title(f'Top Terms for Cluster {i}')
    plt.gca().invert_yaxis()  # Invert y-axis to have the top term at the top
    plt.show()

In [None]:
for i in range(num_clusters):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f" {terms[ind]}")
    print()