In [37]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.corpus import stopwords

In [None]:
# Make sure the NLTK stop-word corpus is available locally.
# The download is guarded so that re-running this cell (or running it offline
# once the corpus is installed) does not trigger another network request.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    # Corpus not found in any NLTK data directory: fetch it once.
    nltk.download("stopwords")

In [39]:
# Toy corpus: four short, unrelated sentences (one document per entry).
documents = [
    "USA is an advanced military country.",
    "Today is sunny.",
    "The earth is moving.",
    "Barcelona has won the match.",
]

In [40]:
# Turn the corpus into a document-term matrix of TF-IDF weights.
# stop_words="english" asks the vectorizer to drop English stop words before
# the vocabulary is built.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=documents)

In [41]:
# Sanity check: report the dimensions of the TF-IDF matrix
# (one row per document, one column per vocabulary term).
matrix_shape = tfidf_matrix.shape
print(f"shape of the matrix: {matrix_shape}")

shape of the matrix: (4, 11)


In [42]:
# Inspect the vocabulary learned by the vectorizer; the bare name on the last
# line makes the notebook display the array.
feature_names = tfidf_vectorizer.get_feature_names_out()
feature_names

array(['advanced', 'barcelona', 'country', 'earth', 'match', 'military',
       'moving', 'sunny', 'today', 'usa', 'won'], dtype=object)

In [43]:
# Latent Semantic Analysis: reduce the TF-IDF matrix to `num_topics` latent
# dimensions with a truncated SVD.
num_topics = 3

# random_state is fixed so repeated runs give the same decomposition.
lsa_model = TruncatedSVD(random_state=42, n_components=num_topics)
lsa_matrix = lsa_model.fit_transform(X=tfidf_matrix)

In [44]:
# Summarize every latent topic by the three vocabulary terms that carry the
# largest weights in the corresponding SVD component.
terms = tfidf_vectorizer.get_feature_names_out()

for topic_number, weights in enumerate(lsa_model.components_, start=1):
    # A stable argsort of the negated weights yields descending order while
    # keeping tied terms in vocabulary order.
    top_indices = np.argsort(-weights, kind="stable")[:3]
    top_terms = ", ".join(terms[top_indices])
    print(f"topic: {topic_number}; {top_terms}")

topic: 1; sunny, today, earth
topic: 2; match, won, barcelona
topic: 3; earth, moving, match
