<a href="https://colab.research.google.com/github/MonaFaghfouri/Topic_Modeling/blob/main/Topic_Modeling_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade --force-reinstall numpy pandas gensim openpyxl

In [None]:
# 1. Upload Excel file
# Colab opens a file picker; `uploaded` is keyed by the uploaded file names.
from google.colab import files
uploaded = files.upload()

# 2. Read the data
import pandas as pd
import ast

# Only the first uploaded file is read.
df = pd.read_excel(next(iter(uploaded)))

# 3. Evaluate token list from string format
# Each cell of the second column is expected to be the string form of a
# Python list of tokens.  Cells that cannot be parsed (for example blank
# cells, which become the string "nan") or that do not hold a list/tuple are
# skipped and counted, so a single bad row does not abort the whole run.
texts = []
skipped_rows = 0
for cell in df.iloc[:, 1].astype(str):
    try:
        tokens = ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        skipped_rows += 1
        continue
    if not isinstance(tokens, (list, tuple)):
        skipped_rows += 1
        continue
    texts.append(tokens)

if skipped_rows:
    print(f"Skipped {skipped_rows} rows that could not be parsed as token lists.")

# 4. Remove empty documents
texts = [doc for doc in texts if doc]
if not texts:
    raise ValueError("No non-empty token lists were found in the second column.")

# 5. Create dictionary and corpus
from gensim import corpora, models

dictionary = corpora.Dictionary(texts)
print(f"Unique tokens before filtering: {len(dictionary)}")

# Drop tokens that appear in fewer than 2 documents or in more than 90% of them.
dictionary.filter_extremes(no_below=2, no_above=0.9)
print(f"Unique tokens after filtering: {len(dictionary)}")

if len(dictionary) == 0:
    raise ValueError("Dictionary is empty after filtering. Adjust thresholds.")

bow_corpus = [dictionary.doc2bow(text) for text in texts]

# 6. Apply TF-IDF transformation
tfidf_model = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf_model[bow_corpus]

# 7. Optimize LDA with TF-IDF corpus
from gensim.models import LdaModel, CoherenceModel
import matplotlib.pyplot as plt

def optimize_lda_tfidf(dictionary, tfidf_corpus, texts, topic_range=(5, 16), max_words=20):
    """Search topic counts and top-word counts for the most coherent LDA model.

    One ``LdaModel`` is trained on ``tfidf_corpus`` for every topic count in
    ``range(*topic_range)``.  For each trained model, every top-word count
    from 5 to ``max_words`` (inclusive) is scored with a ``c_v``
    ``CoherenceModel`` built from the topics' top words.  The combination with
    the highest coherence is kept; an exact coherence tie is broken by the
    ``log_perplexity`` value.

    The value stored as "perplexity" is what ``LdaModel.log_perplexity``
    returns, i.e. gensim's per-word likelihood bound, evaluated on a small
    synthetic corpus made of each topic's top words.  A higher bound means a
    better fit, so ties prefer the larger value.

    Besides its parameters, this function reads the module-level names
    ``LdaModel``, ``CoherenceModel`` and ``tfidf_model``.

    Args:
        dictionary: gensim dictionary passed as ``id2word`` and used to turn
            top-word lists into bag-of-words vectors.
        tfidf_corpus: corpus the LDA models are trained on.
        texts: tokenized documents passed to ``CoherenceModel``.
        topic_range: arguments unpacked into ``range`` to produce the topic
            counts to try (the stop value is exclusive).
        max_words: largest number of top words per topic to evaluate.

    Returns:
        ``(best_model, best_combination, best_coherence, best_perplexity,
        results)`` where ``best_combination`` is ``(num_topics, num_words)``
        and ``results`` lists ``(num_topics, num_words, coherence,
        perplexity)`` for every evaluated combination.  When nothing is
        selected (empty range, or no score ever compares greater than the
        running best) the first two items are ``None``.
    """
    best_model = None
    best_combination = None
    best_coherence = float('-inf')
    # Placeholder that is only returned when no combination was selected; the
    # first accepted candidate always overwrites it.
    best_perplexity = float('inf')
    results = []

    for num_topics in range(*topic_range):
        print(f"\n🔄 Testing {num_topics} topics...")
        model = LdaModel(
            corpus=tfidf_corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=30,
            iterations=600,
            random_state=42,
            alpha='auto',
            eta='auto',
            eval_every=None
        )

        for num_words in range(5, max_words + 1):
            topics = model.show_topics(num_topics=-1, num_words=num_words, formatted=False)
            topic_word_lists = [[word for word, _ in topic_terms] for _, topic_terms in topics]

            cm = CoherenceModel(
                topics=topic_word_lists,
                texts=texts,
                dictionary=dictionary,
                coherence='c_v'
            )
            coherence = cm.get_coherence()

            # Synthetic corpus for perplexity: one TF-IDF weighted document
            # per topic, built from that topic's top words.
            synthetic_corpus = [
                tfidf_model[dictionary.doc2bow(words)] for words in topic_word_lists
            ]
            perplexity = model.log_perplexity(synthetic_corpus)

            results.append((num_topics, num_words, coherence, perplexity))
            print(f"Topics: {num_topics} | Words: {num_words} → Coherence: {coherence:.4f} | Perplexity: {perplexity:.4f}")

            # Higher coherence wins.  On an exact tie the higher per-word
            # bound wins, because a higher bound corresponds to a lower
            # (better) perplexity.
            is_better = coherence > best_coherence or (
                coherence == best_coherence and perplexity > best_perplexity
            )
            if is_better:
                best_model = model
                best_coherence = coherence
                best_perplexity = perplexity
                best_combination = (num_topics, num_words)

    return best_model, best_combination, best_coherence, best_perplexity, results

# 8. Run optimization
print("\n🔍 Finding best topic/word configuration (TF-IDF)...")
best_model, best_combo, best_coh, best_perp, all_results = optimize_lda_tfidf(
    dictionary, tfidf_corpus, texts, topic_range=(5, 16), max_words=20
)

# The search returns None for the model and combination when it never
# selected a candidate (for instance if every coherence score was NaN, which
# never compares greater than the running best).  Fail with a clear message
# instead of an opaque TypeError on best_combo[0] below.
if best_model is None or best_combo is None:
    raise ValueError("No LDA configuration could be selected; no valid coherence score was produced.")

# 9. Print best configuration
best_num_topics, best_num_words = best_combo
print(f"\n✅ Best Model: {best_num_topics} Topics | {best_num_words} Words per Topic")
print(f"   Coherence: {best_coh:.4f} | Perplexity: {best_perp:.4f}")

# 10. Show best topics
print("\n🧠 Best Topics (TF-IDF):")
topics = best_model.show_topics(num_topics=-1, num_words=best_num_words, formatted=False)
for topic_id, words in topics:
    word_list = ", ".join(word for word, _ in words)
    print(f"Topic {topic_id}: {word_list}")


In [None]:
# 1. Upload files
# Two workbooks are expected: a word list and a text file.
from google.colab import files
uploaded = files.upload()

# 2. Libraries
import pandas as pd
import ast
import re
import gensim
from gensim import corpora
from gensim.models import CoherenceModel, LdaModel
import matplotlib.pyplot as plt
import numpy as np

# 3. Identify input files
# The word list is the first uploaded file whose name contains "word"
# (case-insensitive); the text file is the first other uploaded file.
file_names = list(uploaded.keys())

sample_words_file = next((f for f in file_names if 'word' in f.lower()), None)
if sample_words_file is None:
    raise ValueError(
        f"No uploaded file name contains 'word'; cannot identify the word list. Uploaded: {file_names}"
    )

text_file = next((f for f in file_names if f != sample_words_file), None)
if text_file is None:
    raise ValueError(
        "Only the word list was uploaded; upload the text file together with it."
    )

# 4. Word cleaning function
def clean_word(word):
    """Return *word* lower-cased, without punctuation, and trimmed.

    Every character that is neither a word character nor whitespace is
    removed.  Trimming happens after that removal, so whitespace that only
    becomes leading or trailing once punctuation is gone (``"price ."``) is
    stripped as well; inner whitespace is kept.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', word.lower())
    return without_punctuation.strip()

# 5. Read list of economic words
# The first column of the word-list workbook is cleaned with clean_word and
# collected into a set.  Entries that clean to an empty string (for example a
# cell holding only punctuation) are dropped; otherwise '' would be in the set
# and every token that cleans to '' would count as an economic word.
df_words = pd.read_excel(sample_words_file)
economic_words = {
    cleaned
    for cleaned in df_words.iloc[:, 0].dropna().astype(str).apply(clean_word)
    if cleaned
}

# 6. Process text file
# Each cell of the second column is expected to be the string form of a
# Python list of tokens.
df_texts = pd.read_excel(text_file)
texts_raw = df_texts.iloc[:, 1]

cleaned_docs = []
for val in texts_raw:
    # Only parsing is guarded, and only with the errors ast.literal_eval
    # raises on malformed input, so interrupts and real bugs still surface.
    try:
        parsed = ast.literal_eval(str(val))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        continue

    # Rows that are not a list of strings are skipped as a whole.
    if not isinstance(parsed, list):
        continue
    if not all(isinstance(token, str) for token in parsed):
        continue

    # Clean each token once and keep only those found in the word list.
    cleaned = [token for token in map(clean_word, parsed) if token in economic_words]
    if cleaned:
        cleaned_docs.append(cleaned)

print(f"📊 Number of documents containing economic words: {len(cleaned_docs)}")
if not cleaned_docs:
    raise ValueError("❌ No documents contain economic words.")

# 7. Create dictionary and corpus
dictionary = corpora.Dictionary(cleaned_docs)
corpus = [dictionary.doc2bow(text) for text in cleaned_docs]

# 8. Find best number of topics based on coherence score
# One LDA model is trained per topic count; the model with the strictly
# highest c_v coherence is retained together with its score and topic count.
print("🔍 Finding best number of topics based on coherence...")
best_model, best_coherence, best_topics = None, -1, 0

# Evaluate topics from 5 to 20
for num_topics in range(5, 21):
    lda = LdaModel(
        corpus=corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=10,
        random_state=42,
    )
    coherence_model = CoherenceModel(
        model=lda,
        texts=cleaned_docs,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence = coherence_model.get_coherence()

    print(f"→ Topics: {num_topics}, Coherence: {coherence:.4f}")

    # Written as `not (a > b)` rather than `a <= b` so that a NaN score is
    # skipped, exactly as a plain `if a > b:` update would skip it.
    if not coherence > best_coherence:
        continue
    best_model, best_coherence, best_topics = lda, coherence, num_topics

# 9. Display final model output
# best_model is still None when no coherence score ever compared greater than
# the starting value (for instance if every score was NaN); stop with a clear
# message instead of failing on a None attribute access.
if best_model is None:
    raise ValueError("❌ No model was selected; no valid coherence score was produced.")

lda_model = best_model
# NOTE: log_perplexity returns gensim's per-word likelihood bound (a log-scale
# value, higher is better), not the exponentiated perplexity.
perplexity_score = lda_model.log_perplexity(corpus)

print(f"\n✅ Best model found with {best_topics} topics.")
print(f"📈 Coherence Score (c_v): {best_coherence:.4f}")
print(f"📉 Perplexity: {perplexity_score:.4f}")

# 10. Print final topics
# Ask for structured (word, weight) pairs instead of parsing the words back
# out of the formatted topic string with a regular expression.
num_words = 15
print(f"\n🧠 Final Topics (Top {num_words} words per topic):")
for idx, topic in lda_model.show_topics(num_topics=best_topics, num_words=num_words, formatted=False):
    words = [word for word, _ in topic]
    # Topics with fewer than two words are not printed.
    if len(words) >= 2:
        print(f"Topic {idx}: {', '.join(words)}")
