In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Toy corpus: two short passages, one about hockey and one about cricket.
doc = [
    "Hockey, the thrilling game of speed and skill, unites fans with its electrifying goals and fierce competition. From icy rinks to roaring arenas, it’s a sport of pure adrenaline!",
    "Cricket, the gentleman's game, blends strategy and skill in every swing of the bat. From thrilling chases to stunning wickets, it unites fans worldwide.",
]

# How many topics the factorization should extract.
n_topics = 2

# --- TF-IDF ---------------------------------------------------------------
# Vectorize the corpus with the built-in English stop-word list applied.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(doc)

# Vocabulary terms learned by the vectorizer; used later to turn term
# indices of H back into readable words.
feature_names = vectorizer.get_feature_names_out()

# --- NMF ------------------------------------------------------------------
# random_state is pinned so repeated runs give the same factorization;
# max_iter is raised to 1000 to give the solver room to converge.
nmf = NMF(max_iter=1000, random_state=42, n_components=n_topics)

W = nmf.fit_transform(X)  # document-topic matrix
H = nmf.components_       # topic-term matrix

In [4]:
# Print a heading for every topic followed by its five heaviest terms.
for idx, topic in enumerate(H):
    print(f"Topic {idx + 1}:")
    # argsort orders indices by ascending weight; reversing first and then
    # truncating yields the five largest, heaviest term first.
    top_words_idx = topic.argsort()[::-1][:5]
    # feature_names is the array returned by get_feature_names_out(), so it
    # can be indexed with the whole index array in one step.
    top_words = feature_names[top_words_idx].tolist()
    print(", ".join(top_words))

Topic 1:
worldwide, wickets, stunning, swing, strategy
Topic 2:
speed, roaring, rinks, sport, electrifying
