In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [2]:
# Toy corpus for the topic-modelling demo: six one-sentence documents, each
# touching a different theme (science/tech, AI, finance, film, sport, energy).
document = [
    "Science and technology drive innovation in healthcare and industry.",
    "New advancements in AI are transforming various sectors of business.",
    "Financial markets are influenced by geopolitical events and economic indicators.",
    "The latest blockbuster movie has broken box office records worldwide.",
    "Sports events bring communities together and promote healthy lifestyles.",
    "Breakthroughs in renewable energy technologies are crucial for sustainability.",
]

In [3]:
# --- Vectorise -------------------------------------------------------------
# Turn the corpus into a TF-IDF matrix. English stop words are removed, terms
# appearing in more than 95% of the documents are dropped (max_df), and a term
# only needs to appear in a single document to be kept (min_df).
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=1,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(document)

# --- Factorise -------------------------------------------------------------
# Fit an NMF model with a fixed seed so repeated runs give the same topics.
num_topics = 3
num_top_words = 5  # how many of the heaviest terms to show per topic
nmf_model = NMF(n_components=num_topics, random_state=42).fit(tfidf_matrix)

# --- Describe each topic ---------------------------------------------------
# Each row of components_ holds one topic's weight for every vocabulary term;
# feature_names maps a column index back to the term itself.
feature_names = tfidf_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf_model.components_):
    print(f"Topic {topic_idx + 1}:")
    # argsort is ascending, so reverse it and keep the leading entries to get
    # the highest-weighted terms first.
    top_words_idx = topic.argsort()[::-1][:num_top_words]
    top_words = [feature_names[word_idx] for word_idx in top_words_idx]
    print(top_words)

# --- Assign documents to topics --------------------------------------------
# Project the documents onto the fitted topics and label each document with
# the topic that carries its largest weight.
topic_distribution = nmf_model.transform(tfidf_matrix)
clusters = topic_distribution.argmax(axis=1)
for i, cluster in enumerate(clusters):
    print(f"Document {i + 1} belongs to cluster {cluster + 1}")

Topic 1:
['events', 'bring', 'lifestyles', 'communities', 'sports']
Topic 2:
['worldwide', 'records', 'blockbuster', 'box', 'movie']
Topic 3:
['advancements', 'new', 'transforming', 'ai', 'business']
Document 1 belongs to cluster 3
Document 2 belongs to cluster 3
Document 3 belongs to cluster 1
Document 4 belongs to cluster 2
Document 5 belongs to cluster 1
Document 6 belongs to cluster 2
