# LDA DEMO

In [1]:
# Import necessary libraries
import spacy
import gensim
from gensim import corpora
from pprint import pprint

# Load spaCy's English NLP model
nlp = spacy.load('en_core_web_sm')


# Sample documents for demonstration

In [2]:
# Sample Document
old_documents = [
    "Natural language processing is a subfield of artificial intelligence.",
    "Latent Dirichlet Allocation is a generative probabilistic model.",
    "Topic modeling is used to identify topics present in a corpus of text.",
    "Gensim is a popular Python library for topic modeling and document similarity."
]

# New Sample Documents
documents = [
    "Artificial intelligence is reshaping industries and revolutionizing technology.",
    "Unsupervised learning methods, such as clustering, play a pivotal role in data analysis.",
    "The intersection of data science and business strategy is driving innovation.",
    "Natural language processing enables machines to understand and interpret human language."
]

# Preprocess the documents

In [3]:

def preprocess(text):
    # Tokenize and lemmatize using spaCy
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return tokens

# Apply preprocessing to all documents
processed_documents = [preprocess(doc) for doc in documents]

# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(processed_documents)
corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

# Build LDA model (3, 5, 7)
lda_model_3 = gensim.models.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=15)
lda_model_5 = gensim.models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)
lda_model_7 = gensim.models.LdaModel(corpus, num_topics=7, id2word=dictionary, passes=15)

# Print topics and their keywords

In [4]:
# 3 Topics
print('3 Topics')
pprint(lda_model_3.print_topics())

3 Topics
[(0,
  '0.051*"method" + 0.051*"role" + 0.051*"pivotal" + 0.051*"analysis" + '
  '0.051*"data" + 0.051*"clustering" + 0.051*"play" + 0.051*"learning" + '
  '0.051*"unsupervised" + 0.051*"datum"'),
 (1,
  '0.083*"technology" + 0.083*"intelligence" + 0.083*"industry" + '
  '0.083*"artificial" + 0.083*"revolutionize" + 0.083*"reshape" + '
  '0.021*"language" + 0.021*"strategy" + 0.021*"drive" + 0.021*"science"'),
 (2,
  '0.123*"language" + 0.070*"enable" + 0.070*"understand" + 0.070*"natural" + '
  '0.070*"machine" + 0.070*"interpret" + 0.070*"processing" + 0.070*"human" + '
  '0.018*"strategy" + 0.018*"drive"')]


In [5]:
# 5 Topics
print('5 Topics')
pprint(lda_model_5.print_topics())

5 Topics
[(0,
  '0.147*"language" + 0.080*"human" + 0.080*"processing" + 0.080*"natural" + '
  '0.080*"understand" + 0.080*"enable" + 0.080*"machine" + 0.080*"interpret" + '
  '0.013*"industry" + 0.013*"reshape"'),
 (1,
  '0.080*"play" + 0.080*"method" + 0.080*"role" + 0.080*"clustering" + '
  '0.080*"pivotal" + 0.080*"data" + 0.080*"learning" + 0.080*"analysis" + '
  '0.080*"unsupervised" + 0.013*"artificial"'),
 (2,
  '0.100*"intelligence" + 0.100*"artificial" + 0.100*"technology" + '
  '0.100*"revolutionize" + 0.100*"industry" + 0.100*"reshape" + '
  '0.017*"strategy" + 0.017*"science" + 0.017*"datum" + 0.017*"business"'),
 (3,
  '0.033*"industry" + 0.033*"reshape" + 0.033*"technology" + '
  '0.033*"revolutionize" + 0.033*"intelligence" + 0.033*"artificial" + '
  '0.033*"science" + 0.033*"business" + 0.033*"strategy" + 0.033*"datum"'),
 (4,
  '0.092*"datum" + 0.092*"intersection" + 0.092*"drive" + 0.092*"innovation" + '
  '0.092*"strategy" + 0.092*"business" + 0.092*"science" + 0.01

In [6]:
# 7 Topics
print('7 Topics')
pprint(lda_model_7.print_topics())

7 Topics
[(0,
  '0.033*"technology" + 0.033*"artificial" + 0.033*"datum" + 0.033*"reshape" + '
  '0.033*"intelligence" + 0.033*"intersection" + 0.033*"strategy" + '
  '0.033*"science" + 0.033*"industry" + 0.033*"interpret"'),
 (1,
  '0.161*"language" + 0.086*"enable" + 0.086*"natural" + 0.086*"human" + '
  '0.086*"interpret" + 0.086*"processing" + 0.086*"machine" + '
  '0.086*"understand" + 0.011*"industry" + 0.011*"intelligence"'),
 (2,
  '0.033*"reshape" + 0.033*"industry" + 0.033*"intersection" + '
  '0.033*"revolutionize" + 0.033*"technology" + 0.033*"artificial" + '
  '0.033*"datum" + 0.033*"language" + 0.033*"science" + 0.033*"innovation"'),
 (3,
  '0.033*"reshape" + 0.033*"science" + 0.033*"industry" + 0.033*"language" + '
  '0.033*"intelligence" + 0.033*"artificial" + 0.033*"drive" + '
  '0.033*"innovation" + 0.033*"datum" + 0.033*"interpret"'),
 (4,
  '0.086*"method" + 0.086*"role" + 0.086*"analysis" + 0.086*"clustering" + '
  '0.086*"play" + 0.086*"pivotal" + 0.086*"data" + 0

# Assign topics to documents

In [7]:
# Assign topics to documents - 3 Topics
for i, doc in enumerate(processed_documents):
    print(f"Document {i+1} - Topic: {lda_model_3.get_document_topics(corpus[i])}")

Document 1 - Topic: [(0, 0.047816664), (1, 0.9042901), (2, 0.047893275)]
Document 2 - Topic: [(0, 0.9326373), (1, 0.033713274), (2, 0.033649497)]
Document 3 - Topic: [(0, 0.91579074), (1, 0.042144798), (2, 0.04206447)]
Document 4 - Topic: [(0, 0.033477753), (1, 0.033574242), (2, 0.932948)]


In [8]:
# Assign topics to documents - 5 Topics
for i, doc in enumerate(processed_documents):
    print(f"Document {i+1} - Topic: {lda_model_5.get_document_topics(corpus[i])}")

Document 1 - Topic: [(0, 0.02857554), (1, 0.02857554), (2, 0.88569045), (3, 0.028582249), (4, 0.02857619)]
Document 2 - Topic: [(0, 0.020003576), (1, 0.91997826), (2, 0.0200045), (3, 0.020009542), (4, 0.02000414)]
Document 3 - Topic: [(0, 0.025003882), (1, 0.025003882), (2, 0.025004886), (3, 0.025010284), (4, 0.8999771)]
Document 4 - Topic: [(0, 0.91998106), (1, 0.020003125), (2, 0.020003935), (3, 0.02000827), (4, 0.02000362)]


In [9]:
# Assign topics to documents - 7 Topics
for i, doc in enumerate(processed_documents):
    print(f"Document {i+1} - Topic: {lda_model_7.get_document_topics(corpus[i])}")

Document 1 - Topic: [(0, 0.020408424), (1, 0.020408265), (2, 0.020408424), (3, 0.020408424), (4, 0.020408265), (5, 0.020408276), (6, 0.87754995)]
Document 2 - Topic: [(0, 0.01428595), (1, 0.0142858075), (2, 0.01428595), (3, 0.01428595), (4, 0.9142847), (5, 0.01428582), (6, 0.014285827)]
Document 3 - Topic: [(0, 0.017857391), (1, 0.017857239), (2, 0.017857391), (3, 0.017857391), (4, 0.017857239), (5, 0.8928561), (6, 0.01785726)]
Document 4 - Topic: [(0, 0.014285918), (1, 0.9142848), (2, 0.014285918), (3, 0.014285918), (4, 0.014285794), (5, 0.014285805), (6, 0.01428581)]


#                   

# Mini Exercise

Instructions:

Use the provided Python code to perform topic modeling on a set of sample documents.
Modify the sample documents or add your own to see how the results change.
Experiment with the number of topics (parameter: num_topics) in the LDA model. Observe how different numbers of topics impact the result

Make a small insight on what you have observe when you change, increase, or decrease some parameters.(Short Essay)

Varying the number of topics (num_topics) in the Latent Dirichlet Allocation (LDA) model experimentation for topic modeling yielded intriguing findings. The model appeared to be able to pick up on subtleties and finer details in the documents as the number of topics increased, which resulted in more focused themes. However, reducing the number of topics resulted in broader and more generalized categories. Finding a balance was important since too few topics oversimplified the representation and too many topics ran the danger of overfitting. This subtle tweaking of the parameters made it possible to use atailored approach in uncovering the latent structure in the text data, highlighting the significance of parameter tuning in maximizing the efficiency of topic modeling algorithms.