# LSA

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
from gensim.models import LsiModel
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import gensim
from pprint import pprint

In [2]:
documents = [
    "I love natural language processing.",
    "Understanding language is fascinating.",
    "Processing language data is essential."
]

In [3]:
# Remove empty documents
documents = [doc for doc in documents if doc.strip()]

In [4]:
# Create a document-term matrix using TF-IDF representation
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

In [5]:
# Convert the sparse TF-IDF matrix to a Gensim corpus
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

In [6]:
# Create a Gensim Dictionary
dictionary = Dictionary.from_corpus(corpus, id2word=dict((id, word) for word, id in vectorizer.vocabulary_.items()))

In [7]:
# Apply Latent Semantic Analysis (LSA) using Gensim
num_topics = 2
lsa_model = LsiModel(corpus, id2word=dictionary, num_topics=num_topics)

In [8]:
# Print the topics and associated words
pprint(lsa_model.print_topics(num_words=5))

[(0,
  '0.511*"language" + 0.481*"processing" + 0.316*"natural" + 0.316*"love" + '
  '0.316*"essential"'),
 (1,
  '0.619*"fascinating" + 0.619*"understanding" + -0.278*"processing" + '
  '-0.183*"natural" + -0.183*"love"')]


In [9]:
# Display document-topic assignments
for i, doc in enumerate(corpus):
    topics = lsa_model[doc]
    print(f"Document {i + 1} - Topic Assignments: {topics}")

Document 1 - Topic Assignments: [(0, 0.7597202301121123), (1, -0.28496475286176776)]
Document 2 - Topic Assignments: [(0, 0.49995544343291565), (1, 0.8660511270022092)]
Document 3 - Topic Assignments: [(0, 0.759720230112112), (1, -0.2849647528617667)]


In [10]:
# Calculate coherence score
coherence_model = CoherenceModel(model=lsa_model, texts=[doc.split() for doc in documents], dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

print(f"Coherence Score: {coherence_score}")

Coherence Score: nan


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))
