<a href="https://colab.research.google.com/github/MonaFaghfouri/Topic_Modeling/blob/main/Topic_Modeling_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install compatible versions of required libraries.
# numpy and pandas are pinned to exact versions; gensim and openpyxl are left
# unpinned. --upgrade/--force-reinstall replace any copies already installed
# in the runtime, and -q keeps pip's output short.
# NOTE(review): after force-reinstalling numpy in an already-running Colab
# session the runtime presumably has to be restarted before the pinned
# version is the one that gets imported — confirm.
!pip install --upgrade --force-reinstall numpy==1.23.5 pandas==1.5.3 gensim openpyxl -q

In [None]:
# 1. Upload Excel file
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload surfaces as a bare StopIteration below.
    raise ValueError("No file was uploaded.")

# 2. Read the data
import pandas as pd
import ast

# Only the first uploaded file is read.
df = pd.read_excel(next(iter(uploaded)))


# 3. Evaluate token list from string format
def _parse_token_list(cell):
    """Parse one spreadsheet cell holding a Python-literal token list.

    Returns the tokens as a list, or ``None`` when the cell is not a
    list/tuple literal (blank/NaN cell, malformed text, a bare scalar, ...).
    """
    try:
        parsed = ast.literal_eval(str(cell))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the exceptions ast.literal_eval documents for malformed input.
        return None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return None


# The second column (index 1) is expected to hold the token lists.
parsed_docs = [_parse_token_list(cell) for cell in df.iloc[:, 1]]
unparsed_rows = sum(doc is None for doc in parsed_docs)
if unparsed_rows:
    # One bad cell should not abort the whole run, but it should not be hidden either.
    print(f"Skipped {unparsed_rows} rows that could not be parsed as token lists.")

# 4. Remove empty documents
texts = [doc for doc in parsed_docs if doc]

# 5. Create dictionary and corpus
from gensim import corpora

dictionary = corpora.Dictionary(texts)
print(f"Unique tokens before filtering: {len(dictionary)}")

# Drop tokens that appear in fewer than 2 documents or in more than 90% of them.
dictionary.filter_extremes(no_below=2, no_above=0.9)
print(f"Unique tokens after filtering: {len(dictionary)}")

if len(dictionary) == 0:
    raise ValueError("Dictionary is empty after filtering. Adjust thresholds.")

# Bag-of-words representation of every document, in the same order as `texts`.
corpus = [dictionary.doc2bow(text) for text in texts]

# 6. Optimize model: topic count and num_words together
from gensim.models import LdaModel, CoherenceModel
import matplotlib.pyplot as plt

def optimize_lda_model(dictionary, corpus, texts, topic_range=(5, 16), max_words=20):
    """Grid-search the topic count and the number of words shown per topic.

    One ``LdaModel`` is trained for every topic count in
    ``range(*topic_range)``. Each trained model is then scored once per
    ``num_words`` in ``5..max_words`` (inclusive): the top ``num_words`` words
    of every topic are scored with c_v coherence, and the same word lists are
    turned into a small synthetic corpus that is passed to
    ``model.log_perplexity``.

    Args:
        dictionary: gensim dictionary used as ``id2word`` and for ``doc2bow``.
        corpus: bag-of-words corpus the models are trained on.
        texts: tokenized documents, passed to ``CoherenceModel``.
        topic_range: arguments unpacked into ``range()``; the stop value is
            exclusive, so the default evaluates 5 through 15 topics.
        max_words: largest number of top words per topic to evaluate.

    Returns:
        ``(best_model, best_combination, best_coherence, best_perplexity,
        results)`` where ``best_combination`` is ``(num_topics, num_words)``
        and ``results`` is a list of
        ``(num_topics, num_words, coherence, perplexity)`` tuples, one per
        evaluated configuration. If nothing is evaluated (empty range) the
        result is ``(None, None, -inf, inf, [])``.

    Note:
        The value reported as "perplexity" is the raw return value of
        ``LdaModel.log_perplexity``, i.e. a per-word likelihood bound
        (true perplexity is ``2 ** -bound``), so *higher* is better.
    """
    best_model = None
    best_combination = None
    best_coherence = float('-inf')
    best_perplexity = float('inf')
    results = []

    for num_topics in range(*topic_range):
        print(f"\n🔄 Testing {num_topics} topics...")
        # Training is the expensive step, so it runs once per topic count and
        # the model is reused for every num_words below.
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=30,
            iterations=600,
            random_state=42,
            alpha='auto',
            eta='auto',
            eval_every=None
        )

        for num_words in range(5, max_words + 1):
            topics = model.show_topics(num_topics=-1, num_words=num_words, formatted=False)
            topic_word_lists = [[word for word, _ in topic_words] for _, topic_words in topics]

            cm = CoherenceModel(
                topics=topic_word_lists,
                texts=texts,
                dictionary=dictionary,
                coherence='c_v'
            )
            coherence = cm.get_coherence()

            # Synthetic corpus for perplexity: one pseudo-document per topic,
            # built from the word lists already extracted above.
            synthetic_corpus = [dictionary.doc2bow(words) for words in topic_word_lists]
            perplexity = model.log_perplexity(synthetic_corpus)

            results.append((num_topics, num_words, coherence, perplexity))
            print(f"Topics: {num_topics} | Words: {num_words} → Coherence: {coherence:.4f} | Perplexity: {perplexity:.4f}")

            # Coherence decides. On an exact tie, prefer the higher likelihood
            # bound, which corresponds to the lower true perplexity. The
            # initial +inf is never the deciding value because the first
            # finite coherence always wins on the first comparison.
            improves = coherence > best_coherence
            wins_tie = coherence == best_coherence and perplexity > best_perplexity
            if improves or wins_tie:
                best_model = model
                best_coherence = coherence
                best_perplexity = perplexity
                best_combination = (num_topics, num_words)

    return best_model, best_combination, best_coherence, best_perplexity, results

# 7. Search the topic-count / words-per-topic grid
print("\n🔍 Finding best topic/word configuration...")
(best_model,
 best_combo,
 best_coh,
 best_perp,
 all_results) = optimize_lda_model(
    dictionary=dictionary,
    corpus=corpus,
    texts=texts,
    topic_range=(5, 16),
    max_words=20,
)

# 8. Report the winning configuration and its scores
print(f"\n✅ Best Model: {best_combo[0]} Topics | {best_combo[1]} Words per Topic")
print(f"   Coherence: {best_coh:.4f} | Perplexity: {best_perp:.4f}")

# 9. List the top words of every topic of the winning model
print("\n🧠 Best Topics (based on optimal settings):")
topics = best_model.show_topics(formatted=False, num_words=best_combo[1], num_topics=-1)
for topic_id, words in topics:
    # Each entry of `words` is a (word, weight) pair; only the word is shown.
    word_list = ", ".join([word for word, _ in words])
    print(f"Topic {topic_id}: {word_list}")


In [None]:
# 1. Upload files
from google.colab import files
uploaded = files.upload()

# 2. Libraries
import pandas as pd
import ast
import re
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

# 3. Identify filtered word list file and text file
# The word list is the first uploaded file whose name contains "word"
# (case-insensitive); the text file is the first remaining upload.
file_names = list(uploaded.keys())
sample_words_file = next((f for f in file_names if 'word' in f.lower()), None)
if sample_words_file is None:
    # Fail with an actionable message instead of an IndexError from `[...][0]`.
    raise ValueError(
        "❌ None of the uploaded files has 'word' in its name; "
        "cannot identify the filtered word list."
    )

text_file = next((f for f in file_names if f != sample_words_file), None)
if text_file is None:
    raise ValueError(
        "❌ Only the word list was uploaded; upload the text file together with it."
    )

# 4. Word cleaning function
def clean_word(word):
    """Normalize a token for vocabulary matching.

    Lowercases ``word``, removes every character that is neither a word
    character (``\\w``) nor whitespace, and trims leading/trailing
    whitespace. Whitespace inside the token is kept.

    Args:
        word: the raw token (a ``str``).

    Returns:
        The normalized token; may be the empty string when ``word`` consists
        only of punctuation and/or whitespace.
    """
    # Trim *after* removing punctuation: whitespace that sat next to a removed
    # character (e.g. "inflation ." or "( tax )") would otherwise survive and
    # make the token fail an exact-match lookup.
    return re.sub(r'[^\w\s]', '', word.lower()).strip()

# 5. Read and clean filtered words
# The vocabulary is taken from the first column of the word-list file.
df_words = pd.read_excel(sample_words_file)
economic_words = set(df_words.iloc[:, 0].dropna().astype(str).apply(clean_word))
# A vocabulary cell made only of punctuation cleans to "", which would then
# match every all-punctuation token; it is never a meaningful filter word.
economic_words.discard('')

# 6. Read and process text file
# The second column (index 1) is expected to hold token lists as Python literals.
df_texts = pd.read_excel(text_file)
texts_raw = df_texts.iloc[:, 1]

texts = []
skipped_rows = 0
for val in texts_raw:
    try:
        parsed = ast.literal_eval(str(val))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Exceptions ast.literal_eval documents for malformed input. A bare
        # `except:` here would also swallow KeyboardInterrupt.
        skipped_rows += 1
        continue

    if not isinstance(parsed, list):
        skipped_rows += 1
        continue

    # Clean each token once and ignore non-string entries, so a single stray
    # number in a list no longer discards the whole document.
    cleaned_tokens = (clean_word(w) for w in parsed if isinstance(w, str))
    cleaned = [token for token in cleaned_tokens if token in economic_words]
    if cleaned:
        texts.append(cleaned)

if skipped_rows:
    print(f"⚠️ Skipped {skipped_rows} rows that could not be parsed as token lists.")

print(f"📊 Number of documents containing filtered words: {len(texts)}")
if not texts:
    raise ValueError("❌ No documents contain the filtered words.")

# 7. Create dictionary and corpus
dictionary = corpora.Dictionary(texts)
print(f"🔢 Initial token count: {len(dictionary)}")
# Drop tokens that appear in fewer than 2 documents or in more than 90% of them.
dictionary.filter_extremes(no_below=2, no_above=0.9)
print(f"🔢 Token count after filtering: {len(dictionary)}")

if len(dictionary) == 0:
    raise ValueError("❌ Dictionary is empty after filtering.")

# Bag-of-words representation of every retained document.
corpus = [dictionary.doc2bow(text) for text in texts]

# 8. Optimize model based on Coherence and Perplexity
def optimize_lda_model(dictionary, corpus, texts, topic_range=(5, 16), max_words=20):
    """Pick the best (topic count, words per topic) pair for an LDA model.

    For every topic count in ``range(*topic_range)`` a single ``LdaModel`` is
    trained. That model is then evaluated for each ``num_words`` from 5 up to
    and including ``max_words``: the top ``num_words`` words of every topic
    are scored with c_v coherence, and the same word lists, converted to
    bag-of-words pseudo-documents, are passed to ``model.log_perplexity``.

    Args:
        dictionary: gensim dictionary used as ``id2word`` and for ``doc2bow``.
        corpus: bag-of-words corpus used for training.
        texts: tokenized documents, passed to ``CoherenceModel``.
        topic_range: arguments unpacked into ``range()`` (stop is exclusive).
        max_words: largest number of top words per topic to evaluate.

    Returns:
        A 5-tuple ``(best_model, best_combination, best_coherence,
        best_perplexity, results)``. ``best_combination`` is
        ``(num_topics, num_words)``; ``results`` holds one
        ``(num_topics, num_words, coherence, perplexity)`` tuple per evaluated
        configuration. With an empty range the result is
        ``(None, None, -inf, inf, [])``.

    Note:
        "Perplexity" here is the unmodified return value of
        ``LdaModel.log_perplexity``: a per-word likelihood bound for which
        higher is better (true perplexity is ``2 ** -bound``).
    """
    best_model = None
    best_combination = None
    best_coherence = float('-inf')
    best_perplexity = float('inf')
    results = []

    for num_topics in range(*topic_range):
        print(f"\n🔄 Evaluating {num_topics} topics...")
        # One training run per topic count; reused for every num_words below.
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=30,
            iterations=600,
            random_state=42,
            alpha='auto',
            eta='auto',
            eval_every=None
        )

        for num_words in range(5, max_words + 1):
            topics = model.show_topics(num_topics=-1, num_words=num_words, formatted=False)
            topic_word_lists = [[word for word, _ in topic_words] for _, topic_words in topics]

            cm = CoherenceModel(
                topics=topic_word_lists,
                texts=texts,
                dictionary=dictionary,
                coherence='c_v'
            )
            coherence = cm.get_coherence()

            # One pseudo-document per topic, reusing the word lists extracted
            # above instead of walking `topics` a second time.
            synthetic_corpus = [dictionary.doc2bow(words) for words in topic_word_lists]
            perplexity = model.log_perplexity(synthetic_corpus)

            results.append((num_topics, num_words, coherence, perplexity))
            print(f"Topics: {num_topics} | Words: {num_words} → Coherence: {coherence:.4f} | Perplexity: {perplexity:.4f}")

            if coherence > best_coherence:
                is_new_best = True
            else:
                # Exact coherence tie: the higher likelihood bound (lower true
                # perplexity) wins. The initial +inf never decides a tie,
                # because the first finite coherence is accepted by the
                # branch above.
                is_new_best = coherence == best_coherence and perplexity > best_perplexity

            if is_new_best:
                best_model = model
                best_coherence = coherence
                best_perplexity = perplexity
                best_combination = (num_topics, num_words)

    return best_model, best_combination, best_coherence, best_perplexity, results

# 9. Search the topic-count / words-per-topic grid
print("\n🔍 Finding the best topic-word configuration...")
(best_model,
 best_combo,
 best_coh,
 best_perp,
 all_results) = optimize_lda_model(
    dictionary=dictionary,
    corpus=corpus,
    texts=texts,
    topic_range=(5, 16),
    max_words=20,
)

# 10. Report the winning configuration, its scores and its topics
print(f"\n✅ Best model: {best_combo[0]} topics | {best_combo[1]} words per topic")
print(f"   📈 Coherence: {best_coh:.4f}")
print(f"   📉 Perplexity: {best_perp:.4f}")

print("\n🧠 Final topics based only on filtered words:")
topics = best_model.show_topics(formatted=False, num_words=best_combo[1], num_topics=-1)
for topic_id, words in topics:
    # Each entry of `words` is a (word, weight) pair; only the word is shown.
    word_list = ", ".join([word for word, _ in words])
    print(f"Topic {topic_id}: {word_list}")
