In [1]:

from sklearn.datasets import fetch_20newsgroups

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]


groups = fetch_20newsgroups(subset='all', categories=categories)


In [2]:


labels = groups.target
label_names = groups.target_names


In [3]:
label_names

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']

In [4]:

from nltk.corpus import names
all_names = set(names.words())




from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

data_cleaned = []

for doc in groups.data:
    doc = doc.lower()
    doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if word.isalpha() and word not in all_names)
    data_cleaned.append(doc_cleaned)


In [5]:

from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2)


In [6]:

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vector = TfidfVectorizer(stop_words='english', max_features=None, max_df=0.5, min_df=2)

data = tfidf_vector.fit_transform(data_cleaned)



In [7]:

from sklearn.cluster import KMeans

k = 4
kmeans = KMeans(n_clusters=k, random_state=42)

kmeans.fit(data)

clusters = kmeans.labels_



from collections import Counter
print(Counter(clusters))


  super()._check_params_vs_input(X, default_n_init=10)


Counter({0: 1373, 3: 799, 1: 629, 2: 586})


In [8]:

import numpy as np
cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)}

terms = tfidf_vector.get_feature_names_out()
centroids = kmeans.cluster_centers_
for cluster, index_list in cluster_label.items():
    counter = Counter(cluster_label[cluster])
    print('cluster_{}: {} samples'.format(cluster, len(index_list)))
    for label_index, count in sorted(counter.items(), key=lambda x: x[1], reverse=True):
        print('{}: {} samples'.format(label_names[label_index], count))
    print('Top 10 terms:')
    for ind in centroids[cluster].argsort()[-10:]:
        print(' %s' % terms[ind], end="")
    print()

cluster_0: 1373 samples
sci.space: 373 samples
comp.graphics: 366 samples
alt.atheism: 320 samples
talk.religion.misc: 314 samples
Top 10 terms:
 world computer new like know just ha university article wa
cluster_1: 629 samples
comp.graphics: 604 samples
sci.space: 23 samples
talk.religion.misc: 1 samples
alt.atheism: 1 samples
Top 10 terms:
 looking computer bit university need format program file graphic image
cluster_2: 586 samples
sci.space: 583 samples
alt.atheism: 1 samples
talk.religion.misc: 1 samples
comp.graphics: 1 samples
Top 10 terms:
 zoology just moon hst nasa mission launch wa shuttle space
cluster_3: 799 samples
alt.atheism: 477 samples
talk.religion.misc: 312 samples
sci.space: 8 samples
comp.graphics: 2 samples
Top 10 terms:
 moral morality jesus think article christian people say wa god
