In [1]:
# Import necessary libraries
import spacy
import gensim
from gensim import corpora
from pprint import pprint

# Load spaCy's English NLP model (the `en_core_web_sm` pipeline).
# `preprocess` uses it for tokenization, lemmas, and the stop-word /
# alphabetic token flags.
nlp = spacy.load('en_core_web_sm')


# Documents

In [2]:
# Input corpus: three short English sentences, one document per list entry.
# NOTE(review): the saved cell outputs further down list four documents and
# machine-learning vocabulary, so they appear to come from a different corpus;
# re-run the notebook to refresh them.
documents = [
    "The estate agent quickly marked out his territory on the dance floor.",
    "The hummingbird's wings blurred while it eagerly sipped the sugar water from the feeder.",
    "Jerry liked to look at paintings while eating garlic ice cream."
]

# Preprocess the documents and build the LDA models

In [3]:

def preprocess(text):
    """Return the lemmas of the content words in ``text``.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words, and tokens that are not purely alphabetic, are
    skipped; the lemma of every remaining token is collected in order.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Apply preprocessing to all documents
processed_documents = [preprocess(doc) for doc in documents]

# Create a dictionary and a bag-of-words corpus for LDA
dictionary = corpora.Dictionary(processed_documents)
corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

# Build LDA models with 4, 8 and 12 topics.
# LDA training is stochastic: pin the seed so that Restart & Run All
# reproduces the same topics and document-topic weights.
RANDOM_STATE = 42
lda_model1 = gensim.models.LdaModel(
    corpus, num_topics=4, id2word=dictionary, passes=10, random_state=RANDOM_STATE
)
# NOTE(review): this model trains for 20 passes while the other two use 10;
# confirm the difference is intentional.
lda_model2 = gensim.models.LdaModel(
    corpus, num_topics=8, id2word=dictionary, passes=20, random_state=RANDOM_STATE
)
lda_model3 = gensim.models.LdaModel(
    corpus, num_topics=12, id2word=dictionary, passes=10, random_state=RANDOM_STATE
)
# NOTE(review): every model requests more topics than there are documents in
# `documents`; confirm these topic counts are what the comparison needs.



# Print topics and their keywords

In [4]:
# Pretty-print the topics of the 4-topic model (lda_model1)
pprint(lda_model1.print_topics())

[(0,
  '0.192*"learning" + 0.107*"type" + 0.106*"machine" + 0.106*"supervise" + '
  '0.106*"unsupervised" + 0.106*"main" + 0.021*"technique" + '
  '0.021*"reinforcement" + 0.021*"improve" + 0.021*"algorithm"'),
 (1,
  '0.183*"learning" + 0.127*"machine" + 0.070*"involve" + '
  '0.070*"automatically" + 0.070*"experience" + 0.070*"algorithm" + '
  '0.070*"improve" + 0.070*"reinforcement" + 0.070*"technique" + 0.070*"type"'),
 (2,
  '0.053*"machine" + 0.053*"learning" + 0.053*"type" + 0.053*"technique" + '
  '0.053*"reinforcement" + 0.053*"improve" + 0.053*"algorithm" + '
  '0.053*"experience" + 0.053*"automatically" + 0.053*"involve"'),
 (3,
  '0.164*"learning" + 0.091*"network" + 0.091*"subset" + 0.091*"deep" + '
  '0.091*"base" + 0.091*"artificial" + 0.091*"neural" + 0.091*"machine" + '
  '0.018*"type" + 0.018*"technique"')]


In [5]:
# Pretty-print the topics of the 8-topic model (lda_model2)
pprint(lda_model2.print_topics())

[(0,
  '0.120*"machine" + 0.120*"algorithm" + 0.120*"experience" + 0.120*"improve" '
  '+ 0.120*"involve" + 0.120*"automatically" + 0.120*"learning" + 0.013*"type" '
  '+ 0.013*"supervise" + 0.013*"main"'),
 (1,
  '0.053*"learning" + 0.053*"machine" + 0.053*"type" + 0.053*"unsupervised" + '
  '0.053*"supervise" + 0.053*"main" + 0.053*"subset" + 0.053*"network" + '
  '0.053*"neural" + 0.053*"deep"'),
 (2,
  '0.224*"learning" + 0.116*"machine" + 0.061*"network" + 0.061*"deep" + '
  '0.061*"subset" + 0.061*"artificial" + 0.061*"base" + 0.061*"neural" + '
  '0.061*"supervise" + 0.061*"main"'),
 (3,
  '0.254*"learning" + 0.134*"type" + 0.134*"machine" + 0.134*"technique" + '
  '0.134*"reinforcement" + 0.015*"subset" + 0.015*"unsupervised" + '
  '0.015*"supervise" + 0.015*"main" + 0.015*"automatically"'),
 (4,
  '0.053*"learning" + 0.053*"machine" + 0.053*"type" + 0.053*"unsupervised" + '
  '0.053*"main" + 0.053*"supervise" + 0.053*"subset" + 0.053*"deep" + '
  '0.053*"base" + 0.053*"network

In [6]:
# Pretty-print the topics of the 12-topic model (lda_model3)
pprint(lda_model3.print_topics())

[(0,
  '0.053*"learning" + 0.053*"type" + 0.053*"machine" + 0.053*"network" + '
  '0.053*"deep" + 0.053*"supervise" + 0.053*"technique" + '
  '0.053*"reinforcement" + 0.053*"subset" + 0.053*"algorithm"'),
 (1,
  '0.053*"learning" + 0.053*"machine" + 0.053*"type" + 0.053*"neural" + '
  '0.053*"deep" + 0.053*"supervise" + 0.053*"technique" + '
  '0.053*"reinforcement" + 0.053*"subset" + 0.053*"algorithm"'),
 (2,
  '0.053*"learning" + 0.053*"machine" + 0.053*"type" + 0.053*"network" + '
  '0.053*"deep" + 0.053*"supervise" + 0.053*"technique" + '
  '0.053*"reinforcement" + 0.053*"algorithm" + 0.053*"automatically"'),
 (3,
  '0.053*"learning" + 0.053*"machine" + 0.053*"type" + 0.053*"neural" + '
  '0.053*"deep" + 0.053*"supervise" + 0.053*"technique" + '
  '0.053*"reinforcement" + 0.053*"subset" + 0.053*"algorithm"'),
 (4,
  '0.053*"learning" + 0.053*"machine" + 0.053*"type" + 0.053*"network" + '
  '0.053*"deep" + 0.053*"supervise" + 0.053*"technique" + '
  '0.053*"reinforcement" + 0.053*"s

# Assign topics to documents

In [7]:
# Report the topic distribution the 4-topic model assigns to each document
for doc_number, bow in enumerate(corpus, start=1):
    topic_mix = lda_model1.get_document_topics(bow)
    print(f"Document {doc_number} - Topic: {topic_mix}")

Document 1 - Topic: [(0, 0.031775404), (1, 0.90516657), (2, 0.031367213), (3, 0.031690866)]
Document 2 - Topic: [(0, 0.025684685), (1, 0.025802366), (2, 0.02507629), (3, 0.92343664)]
Document 3 - Topic: [(0, 0.03767094), (1, 0.88984954), (2, 0.03582462), (3, 0.036654897)]
Document 4 - Topic: [(0, 0.904013), (1, 0.03262813), (2, 0.03132827), (3, 0.032030627)]


In [8]:
# Report the topic distribution the 8-topic model assigns to each document
for doc_number, bow in enumerate(corpus, start=1):
    topic_mix = lda_model2.get_document_topics(bow)
    print(f"Document {doc_number} - Topic: {topic_mix}")

Document 1 - Topic: [(0, 0.89058924), (1, 0.015625069), (2, 0.01564343), (3, 0.015641956), (4, 0.015625069), (5, 0.015625069), (6, 0.015625069), (7, 0.015625069)]
Document 2 - Topic: [(0, 0.012504213), (1, 0.012500087), (2, 0.9124872), (3, 0.012508115), (4, 0.012500087), (5, 0.012500087), (6, 0.012500087), (7, 0.012500087)]
Document 3 - Topic: [(0, 0.017866537), (1, 0.017857201), (2, 0.017878164), (3, 0.8749693), (4, 0.017857201), (5, 0.017857201), (6, 0.017857201), (7, 0.017857201)]
Document 4 - Topic: [(0, 0.015631838), (1, 0.015625099), (2, 0.89059466), (3, 0.01564802), (4, 0.015625099), (5, 0.015625099), (6, 0.015625099), (7, 0.015625099)]


In [9]:
# Report the topic distribution the 12-topic model assigns to each document
for doc_number, bow in enumerate(corpus, start=1):
    topic_mix = lda_model3.get_document_topics(bow)
    print(f"Document {doc_number} - Topic: {topic_mix}")

Document 1 - Topic: [(0, 0.010416683), (1, 0.010416683), (2, 0.010416683), (3, 0.010416683), (4, 0.010416683), (5, 0.010416683), (6, 0.010416683), (7, 0.0104169445), (8, 0.010416893), (9, 0.010416982), (10, 0.88541573), (11, 0.010416683)]
Document 2 - Topic: [(8, 0.9083327)]
Document 3 - Topic: [(0, 0.011904774), (1, 0.011904774), (2, 0.011904774), (3, 0.011904774), (4, 0.011904774), (5, 0.011904774), (6, 0.011904774), (7, 0.011905126), (8, 0.011904987), (9, 0.86904675), (10, 0.011904932), (11, 0.011904774)]
Document 4 - Topic: [(0, 0.010416679), (1, 0.010416679), (2, 0.010416679), (3, 0.010416679), (4, 0.010416679), (5, 0.010416679), (6, 0.010416679), (7, 0.8854158), (8, 0.010416858), (9, 0.010417089), (10, 0.010416813), (11, 0.010416679)]


#                   