# ✅ Practical: Topic Modeling using LDA on Synthetic Data

In [1]:
!pip install nltk gensim pyLDAvis

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [4]:
# Import necessary libraries
import nltk
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pyLDAvis.gensim_models
import pyLDAvis

# Fetch the NLTK data packages used later in the notebook: 'stopwords' backs
# stopwords.words('english'); 'punkt' is presumably the tokenizer data behind
# word_tokenize (confirm against the installed NLTK version).
nltk.download('punkt')
# Last expression in the cell, so its return value is what the cell displays.
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 📝 Step 1: Create Synthetic Text Data

In [5]:
# Synthetic document set: ten one-sentence documents written around two
# themes (technology/AI/NLP and sports), listed in mixed order.
# A few sentences touch both themes (e.g. the "Sports teams analyze
# performance data..." one), which may blur a strict two-topic split.
documents = [
    "Artificial intelligence and machine learning are transforming industries.",
    "Deep learning models are part of machine learning.",
    "Natural language processing helps machines understand human language.",
    "Football and cricket are popular sports in many countries.",
    "Athletes train daily to excel in competitions like the Olympics.",
    "Technology companies are investing in AI and robotics.",
    "Sports teams analyze performance data using analytics tools.",
    "Language models like GPT-4 are revolutionizing communication.",
    "Basketball and baseball have large fan followings.",
    "Speech recognition is a field of natural language processing."
]


# 🧹 Step 2: Preprocess the Text

In [8]:
import nltk

# Extra tokenizer resource, fetched before word_tokenize is called below
# (presumably required by the installed NLTK release in addition to 'punkt').
nltk.download('punkt_tab')

# English stopwords held in a set; preprocess() tests membership against it.
stop_words = {*stopwords.words('english')}

def preprocess(doc):
    """Lowercase and tokenize ``doc``, keeping alphabetic non-stopword tokens.

    The text is lowercased and split with ``word_tokenize``. A token is kept
    only when ``str.isalpha()`` is true for it (so punctuation, numbers and
    hyphenated forms such as "gpt-4" are dropped) and it is not in the
    notebook-level ``stop_words`` set.

    Returns the kept tokens as a list, in their original order.
    """
    kept = []
    for token in word_tokenize(doc.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Run every document through preprocess(), preserving document order
processed_docs = list(map(preprocess, documents))

# Print the resulting token lists to inspect the preprocessing result
print(processed_docs)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...


[['artificial', 'intelligence', 'machine', 'learning', 'transforming', 'industries'], ['deep', 'learning', 'models', 'part', 'machine', 'learning'], ['natural', 'language', 'processing', 'helps', 'machines', 'understand', 'human', 'language'], ['football', 'cricket', 'popular', 'sports', 'many', 'countries'], ['athletes', 'train', 'daily', 'excel', 'competitions', 'like', 'olympics'], ['technology', 'companies', 'investing', 'ai', 'robotics'], ['sports', 'teams', 'analyze', 'performance', 'data', 'using', 'analytics', 'tools'], ['language', 'models', 'like', 'revolutionizing', 'communication'], ['basketball', 'baseball', 'large', 'fan', 'followings'], ['speech', 'recognition', 'field', 'natural', 'language', 'processing']]


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# 📚 Step 3: Prepare Data for LDA

In [9]:
# Build the token -> integer id vocabulary from the tokenized documents
dictionary = corpora.Dictionary(processed_docs)

# Encode each document as a bag-of-words list of (token_id, count) pairs
corpus = list(map(dictionary.doc2bow, processed_docs))

# Inspect the first encoded document and the full token -> id mapping
print("\nBag-of-Words Example (First Doc):", corpus[0])
print("\nWord Mapping (Dictionary):", dictionary.token2id)



Bag-of-Words Example (First Doc): [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]

Word Mapping (Dictionary): {'artificial': 0, 'industries': 1, 'intelligence': 2, 'learning': 3, 'machine': 4, 'transforming': 5, 'deep': 6, 'models': 7, 'part': 8, 'helps': 9, 'human': 10, 'language': 11, 'machines': 12, 'natural': 13, 'processing': 14, 'understand': 15, 'countries': 16, 'cricket': 17, 'football': 18, 'many': 19, 'popular': 20, 'sports': 21, 'athletes': 22, 'competitions': 23, 'daily': 24, 'excel': 25, 'like': 26, 'olympics': 27, 'train': 28, 'ai': 29, 'companies': 30, 'investing': 31, 'robotics': 32, 'technology': 33, 'analytics': 34, 'analyze': 35, 'data': 36, 'performance': 37, 'teams': 38, 'tools': 39, 'using': 40, 'communication': 41, 'revolutionizing': 42, 'baseball': 43, 'basketball': 44, 'fan': 45, 'followings': 46, 'large': 47, 'field': 48, 'recognition': 49, 'speech': 50}


# 📊 Step 4: Train LDA Model

In [10]:
# Fit an LDA topic model on the bag-of-words corpus
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    num_topics=2,       # number of topics to fit; try other values
    id2word=dictionary,
    passes=10,
    alpha='auto',
    random_state=42,    # fixed seed for reproducibility
)

# Show every topic as an (id, weighted-terms string) tuple
topics = lda_model.print_topics()
print("\nTopics Discovered:")
for topic_entry in topics:
    print(topic_entry)



Topics Discovered:
(0, '0.056*"language" + 0.056*"learning" + 0.040*"like" + 0.040*"models" + 0.040*"machine" + 0.024*"processing" + 0.024*"natural" + 0.024*"communication" + 0.024*"revolutionizing" + 0.024*"understand"')
(1, '0.049*"sports" + 0.030*"football" + 0.030*"many" + 0.030*"countries" + 0.030*"cricket" + 0.030*"popular" + 0.030*"speech" + 0.030*"field" + 0.030*"recognition" + 0.030*"natural"')


# 🔍 Step 5: Explore Topics for Each Document

In [11]:
# Print, for each document, the topic mixture inferred by the trained model
for doc_no, topic_mix in enumerate(lda_model[corpus], start=1):
    print(f"\nDocument {doc_no} Topic Distribution:")
    for topic_id, weight in topic_mix:
        print(f"  Topic {topic_id}: {weight:.4f}")



Document 1 Topic Distribution:
  Topic 0: 0.9793
  Topic 1: 0.0207

Document 2 Topic Distribution:
  Topic 0: 0.9793
  Topic 1: 0.0207

Document 3 Topic Distribution:
  Topic 0: 0.9843
  Topic 1: 0.0157

Document 4 Topic Distribution:
  Topic 0: 0.0291
  Topic 1: 0.9709

Document 5 Topic Distribution:
  Topic 0: 0.9821
  Topic 1: 0.0179

Document 6 Topic Distribution:
  Topic 0: 0.9754
  Topic 1: 0.0246

Document 7 Topic Distribution:
  Topic 0: 0.0221
  Topic 1: 0.9779

Document 8 Topic Distribution:
  Topic 0: 0.9754
  Topic 1: 0.0246

Document 9 Topic Distribution:
  Topic 0: 0.0346
  Topic 1: 0.9654

Document 10 Topic Distribution:
  Topic 0: 0.0295
  Topic 1: 0.9705


# 📈 Step 6: Visualize Topics with pyLDAvis

In [12]:
import pyLDAvis.gensim_models as gensimvis

# Turn on inline rendering of pyLDAvis output in the notebook
pyLDAvis.enable_notebook()

# Build the visualization data from the trained model, corpus and vocabulary;
# the bare `vis` on the last line makes the cell display it.
vis = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary)
vis


# 💬 Summary
LDA (Latent Dirichlet Allocation) is used to extract hidden topics from text.

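We wrote the synthetic documents around two main themes and asked LDA to recover them as two topics:

- One related to technology/AI/NLP
- One related to sports

With only ten short documents the separation is imperfect: in the run above, Document 5 (athletes/Olympics) is assigned to the technology topic (Topic 0) and Document 10 (speech recognition) to the sports topic (Topic 1).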

Visualization helps explore word-topic relationships.