In [1]:
from gensim import corpora, models
from pprint import pprint

In [2]:
documents = [
    "Artificial intelligence and machine learning are revolutionizing the tech industry.",
    "The Internet of Things connects devices and enables smart homes.",
    "Cybersecurity is crucial in today's digital landscape to protect against online threats."
]

In [3]:
# Tokenize and preprocess the documents
tokenized_documents = [doc.lower().split() for doc in documents]

In [4]:
# Create a dictionary and corpus
dictionary = corpora.Dictionary(tokenized_documents)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

In [6]:
# Build the LDA model
lda_model = models.LdaModel(corpus, num_topics=3,id2word=dictionary, passes=10)

In [7]:
# Print the topics and associated words
pprint(lda_model.print_topics(num_words=5))

[(0,
  '0.042*"tech" + 0.042*"intelligence" + 0.042*"industry." + '
  '0.042*"artificial" + 0.042*"machine"'),
 (1,
  '0.067*"and" + 0.067*"the" + 0.067*"things" + 0.067*"smart" + '
  '0.067*"internet"'),
 (2,
  '0.034*"the" + 0.034*"devices" + 0.034*"enables" + 0.033*"of" + 0.033*"and"')]


In [8]:
# Get the topic distribution for each document
for i, doc in enumerate(corpus):
    print(f"Document {i + 1}: {lda_model[doc]}")

Document 1: [(0, 0.9367447), (1, 0.032226354), (2, 0.031028917)]
Document 2: [(0, 0.030942805), (1, 0.9383053), (2, 0.030751828)]
Document 3: [(0, 0.9478095), (1, 0.025926992), (2, 0.026263498)]


In [9]:
from gensim.models import CoherenceModel

# Calculate coherence score
coherence_model = CoherenceModel(model=lda_model, texts=tokenized_documents, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

print(f"Coherence Score: {coherence_score}")

Coherence Score: 0.18733377511418836


In [10]:
max_coherence_score = -1
best_params = {}

alpha_values = ['symmetric',0.01,0.1,0.2,0.3,'auto']
beta_values = ['auto', 0.01, 0.1,0.2,0.3]

for num_topics in range(2, 4):
    for passes in [5, 10]:
        for alpha in alpha_values:
            for beta in beta_values:
                lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes, alpha=alpha, eta=beta)
                coherence_model = CoherenceModel(model=lda_model, texts=tokenized_documents, dictionary=dictionary, coherence='c_v')
                coherence_score = coherence_model.get_coherence()

                print(f"Num Topics: {num_topics}, Passes: {passes}, Alpha: {alpha}, Beta: {beta}, Coherence Score: {coherence_score}")

                # Update max coherence score and best parameters
                if coherence_score > max_coherence_score:
                    max_coherence_score = coherence_score
                    best_params = {'num_topics': num_topics, 'passes': passes, 'alpha': alpha, 'beta': beta}

print("\nBest Parameters:")
print(f"Num Topics: {best_params['num_topics']}, Passes: {best_params['passes']}, Alpha: {best_params['alpha']}, Beta: {best_params['beta']}")
print(f"Max Coherence Score: {max_coherence_score}")


Num Topics: 2, Passes: 5, Alpha: symmetric, Beta: auto, Coherence Score: 0.05620726022381424
Num Topics: 2, Passes: 5, Alpha: symmetric, Beta: 0.01, Coherence Score: 0.05620726022381423
Num Topics: 2, Passes: 5, Alpha: symmetric, Beta: 0.1, Coherence Score: 0.2210529146739672
Num Topics: 2, Passes: 5, Alpha: symmetric, Beta: 0.2, Coherence Score: 0.2528970325593752
Num Topics: 2, Passes: 5, Alpha: symmetric, Beta: 0.3, Coherence Score: 0.25846183391514665
Num Topics: 2, Passes: 5, Alpha: 0.01, Beta: auto, Coherence Score: 0.15151569193908299
Num Topics: 2, Passes: 5, Alpha: 0.01, Beta: 0.01, Coherence Score: 0.3309037726859918
Num Topics: 2, Passes: 5, Alpha: 0.01, Beta: 0.1, Coherence Score: 0.05620726022381421
Num Topics: 2, Passes: 5, Alpha: 0.01, Beta: 0.2, Coherence Score: 0.19525716331490806
Num Topics: 2, Passes: 5, Alpha: 0.01, Beta: 0.3, Coherence Score: 0.11782497886501994
Num Topics: 2, Passes: 5, Alpha: 0.1, Beta: auto, Coherence Score: 0.294496597919012
Num Topics: 2, Pass

In [14]:
# Build the LDA model
lda_model = models.LdaModel(corpus, num_topics=2,id2word=dictionary, passes=5,alpha = 0.01,eta = 0.01)

In [15]:
# Print the topics and associated words
pprint(lda_model.print_topics(num_words=5))

[(0,
  '0.045*"digital" + 0.045*"is" + 0.045*"devices" + 0.045*"the" + '
  '0.045*"smart"'),
 (1,
  '0.098*"the" + 0.098*"intelligence" + 0.098*"and" + 0.098*"revolutionizing" '
  '+ 0.098*"learning"')]


In [16]:
# Get the topic distribution for each document
for i, doc in enumerate(corpus):
    print(f"Document {i + 1}: {lda_model[doc]}")

Document 1: [(1, 0.999002)]
Document 2: [(0, 0.999002)]
Document 3: [(0, 0.99916804)]
