# Topic Modeling using LSA (Latent Semantic Analysis)

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [2]:
# Toy corpus for the demo: each list entry is one document.
documents = [
    "I love natural language processing.",
    "Understanding language is fascinating.",
    "Processing language data is essential.",
]

In [3]:
# Build the TF-IDF weighted document-term matrix (one row per document,
# one column per vocabulary term). English stop words are excluded.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(documents)

In [4]:
# Apply Latent Semantic Analysis (LSA): factorize the TF-IDF matrix with a
# truncated SVD, keeping `num_topics` components (one component per topic).
num_topics = 2
# TruncatedSVD uses a randomized solver by default, so the seed is fixed to
# keep the topic loadings reproducible across kernel restarts.
lsa_model = TruncatedSVD(n_components=num_topics, random_state=42)
# Document-topic matrix: one row per document, one column per topic.
lsa_topic_matrix = lsa_model.fit_transform(X)

In [5]:
# Retrieve the vocabulary learned by the vectorizer and display it.
# These terms label the columns of the document-term matrix `X`.
terms = vectorizer.get_feature_names_out()
terms

array(['data', 'essential', 'fascinating', 'language', 'love', 'natural',
       'processing', 'understanding'], dtype=object)

In [6]:
# Human-readable, 1-based labels for the topics: "Topic 1", "Topic 2", ...
topic_names = [f"Topic {number}" for number in range(1, num_topics + 1)]
topic_names

['Topic 1', 'Topic 2']

In [7]:
# Label the fitted components (rows = topics, columns = terms), then
# transpose so each row is a term and each column is a topic.
topic_by_term = pd.DataFrame(
    lsa_model.components_,
    index=topic_names,
    columns=terms,
)
df_topics = topic_by_term.T
df_topics

Unnamed: 0,Topic 1,Topic 2
data,0.316202,-0.182537
essential,0.316202,-0.182537
fascinating,0.232297,0.619308
language,0.510706,0.150154
love,0.316202,-0.182537
natural,0.316202,-0.182537
processing,0.480959,-0.277649
understanding,0.232297,0.619308


In [8]:
# Show the dense TF-IDF matrix with the vocabulary terms as column labels.
dense_tfidf = X.toarray()
dtm_frame = pd.DataFrame(dense_tfidf, columns=vectorizer.get_feature_names_out())
print("Document-Term Matrix:")
print(dtm_frame)

Document-Term Matrix:
       data  essential  fascinating  language      love   natural  processing  \
0  0.000000   0.000000     0.000000  0.345205  0.584483  0.584483    0.444514   
1  0.000000   0.000000     0.652491  0.385372  0.000000  0.000000    0.000000   
2  0.584483   0.584483     0.000000  0.345205  0.000000  0.000000    0.444514   

   understanding  
0       0.000000  
1       0.652491  
2       0.000000  


In [9]:
# Print the per-term topic weights held in `df_topics` under a heading.
# A single print call with a newline separator emits the same text as
# printing the heading and the frame separately.
print("\nLSA Topic Matrix:", df_topics, sep="\n")


LSA Topic Matrix:
                Topic 1   Topic 2
data           0.316202 -0.182537
essential      0.316202 -0.182537
fascinating    0.232297  0.619308
language       0.510706  0.150154
love           0.316202 -0.182537
natural        0.316202 -0.182537
processing     0.480959 -0.277649
understanding  0.232297  0.619308


In [10]:
# Optional: show each document's coordinates in topic space
# (rows = documents, columns = topics).
df_document_topics = pd.DataFrame(data=lsa_topic_matrix, columns=topic_names)
print("\nDocument-Topic Matrix:", df_document_topics, sep="\n")


Document-Topic Matrix:
    Topic 1   Topic 2
0  0.759720 -0.284965
1  0.499955  0.866051
2  0.759720 -0.284965
