In [24]:
from sklearn.datasets import fetch_20newsgroups

def load_newsgroup_data(categories=None, subset='train'):

    newsgroups = fetch_20newsgroups(categories=categories, subset=subset)
    return newsgroups

# Example usage
categories = ['comp.graphics', 'talk.politics.guns', 'alt.atheism', 'sci.med', 'sci.space']
data = load_newsgroup_data(categories)

In [25]:
import nltk
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import silhouette_score
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()                                # Lowercase all letters
    text = re.sub(r'\d+', '', text)                    # Remove digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)            # Remove punctuation
    tokens = text.split()                              # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

In [28]:
newsgroups_data = pd.DataFrame({'text': data.data, 'target': data.target})
newsgroups_data['text'] = newsgroups_data['text'].apply(preprocess_text)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf_sparse = vectorizer.fit_transform(newsgroups_data['text'])

print(f"Dimensions of TF-IDF matrix: {X_tfidf_sparse.shape}")
print(f"Feature names: {vectorizer.get_feature_names_out()[:10]}")


Dimensions of TF-IDF matrix: (2797, 41771)
Feature names: ['aa' 'aaa' 'aaaa' 'aaah' 'aaai' 'aaaimit' 'aacalcomsocalcom'
 'aadscrsiemenscom' 'aaffff' 'aafffff']


### **NMF using scikit-learn library**

In [30]:
from sklearn.decomposition import NMF

X_tfidf = X_tfidf_sparse.toarray()

n_topics = 5


nmf_model = NMF(n_components=n_topics, random_state=42)
W = nmf_model.fit_transform(X_tfidf)
H = nmf_model.components_


cluster_labels = np.argmax(W, axis=1)


newsgroups_data['cluster'] = cluster_labels


print(newsgroups_data[['target', 'cluster']].head(20))


    target  cluster
0        4        3
1        1        2
2        3        2
3        3        2
4        4        3
5        3        2
6        2        4
7        4        3
8        4        3
9        2        3
10       1        2
11       3        2
12       1        2
13       3        2
14       0        0
15       2        2
16       2        2
17       2        4
18       0        0
19       4        3


In [31]:
k = 5
nmf = NMF(n_components=k, random_state=42)

nmf_features = nmf.fit_transform(X_tfidf)

nmf_labels = nmf_features.argmax(axis=1)

In [32]:
#ARI
ari = adjusted_rand_score(newsgroups_data.target, nmf_labels)
print(f"Adjusted Rand Index (ARI) for NMF: {ari:.4f}")

#AMI
ami = adjusted_mutual_info_score(newsgroups_data.target, nmf_labels)
print(f"Adjusted Mutual Information (AMI) for NMF: {ami:.4f}")

#SILHOUTTE
sil_score = silhouette_score(X_tfidf, nmf_labels)
print(f"Silhouette Score for NMF: {sil_score:.4f}")

Adjusted Rand Index (ARI) for NMF: 0.4081
Adjusted Mutual Information (AMI) for NMF: 0.5746
Silhouette Score for NMF: 0.0073


## **K-means using scikit-learn library**

In [33]:
from sklearn.cluster import KMeans

n_clusters = 5


kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans_labels = kmeans.fit_predict(X_tfidf)


newsgroups_data['kmeans_cluster'] = kmeans_labels


print(newsgroups_data[['target', 'kmeans_cluster']].head(20))


    target  kmeans_cluster
0        4               3
1        1               2
2        3               1
3        3               1
4        4               3
5        3               1
6        2               0
7        4               3
8        4               3
9        2               1
10       1               2
11       3               1
12       1               2
13       3               1
14       0               0
15       2               1
16       2               1
17       2               0
18       0               0
19       4               3


In [34]:
k = 5
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_tfidf)


In [35]:
#ARI
ari = adjusted_rand_score(newsgroups_data.target, kmeans_labels)
print(f"Adjusted Rand Index (ARI): {ari:.4f}")

#AMI
ami = adjusted_mutual_info_score(newsgroups_data.target, kmeans_labels)
print(f"Adjusted Mutual Information (AMI): {ami:.4f}")

#Silhoutte
sil_score = silhouette_score(X_tfidf, kmeans_labels)
print(f"Silhouette Score: {sil_score:.4f}")

Adjusted Rand Index (ARI): 0.4695
Adjusted Mutual Information (AMI): 0.6222
Silhouette Score: 0.0075


## **K-means implementation**

In [37]:
from scipy.sparse import issparse

if issparse(X_tfidf):
    X = X_tfidf.toarray()
else:
    X = X_tfidf

In [38]:
def initialize_centroids(X, k):
    np.random.seed(42)
    indices = np.random.choice(X.shape[0], k, replace=False)
    return X[indices]

def assign_clusters(X, centroids):
    distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
    return np.argmin(distances, axis=1)

def update_centroids(X, labels, k):
    centroids = np.zeros((k, X.shape[1]))
    for i in range(k):
        points = X[labels == i]
        if len(points) > 0:
            centroids[i] = np.mean(points, axis=0)
    return centroids

def kmeans(X, k, max_iters=100):
    centroids = initialize_centroids(X, k)
    for _ in range(max_iters):
        labels = assign_clusters(X, centroids)
        new_centroids = update_centroids(X, labels, k)
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return labels, centroids

In [39]:
k = 5
labels, centroids = kmeans(X, k)

In [40]:
#ARI
print("ARI (manual KMeans):", adjusted_rand_score(data.target, labels))

#AMI
ami = adjusted_mutual_info_score(newsgroups_data['target'], labels)
print(f"Adjusted Mutual Information (AMI) for manual KMeans: {ami:.4f}")

#Silhouette
sil_score = silhouette_score(X, labels)
print(f"Silhouette Score for manual KMeans: {sil_score:.4f}")


ARI (manual KMeans): 0.4151998403454855
Adjusted Mutual Information (AMI) for manual KMeans: 0.4955
Silhouette Score for manual KMeans: 0.0049
