In [1]:
import os  # For operating system dependent functionality
import time  # For timing code execution
import re  # For regular expressions

import pandas as pd  # For data manipulation and analysis
import nltk  # For natural language processing tasks
from nltk.corpus import stopwords  # For stopwords
from nltk.stem import WordNetLemmatizer  # For word lemmatization

# For Gensim models and functionalities
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaModel, TfidfModel
from gensim.corpora import Dictionary

import spacy  # For advanced natural language processing

from gensim.models.coherencemodel import CoherenceModel
import numpy as np

  from pandas.core import (


In [2]:
# Location of the Excel workbook that holds the preprocessed documents.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
directory = "D:/2_nlp"
file_name = "documents_with_bigrams.xlsx"
full_path = os.path.join(directory, file_name)

# Read the workbook into a DataFrame, timing the load.
start_time = time.time()
df_loaded = pd.read_excel(full_path)
end_time = time.time()

# df_loaded is a pandas DataFrame; later cells read its 'documents' column.
# Report the elapsed wall-clock time rounded to whole minutes.
elapsed_minutes = round((end_time - start_time) / 60)
print("The total time taken in mins is {}".format(elapsed_minutes))

The total time taken in mins is 1


In [5]:
# Five alternative seed-word sets. Each one maps a human-readable topic label
# to a list of seed tokens. prepare_documents() flattens the word lists of a
# set into a single collection, so the labels are informational only and a
# word listed under several labels (e.g. "program" below) counts once.
seed_topics_1 = {
    "Education & Learning": ["school", "student", "education", "university", "class", "teacher", "study", "college", "program", "course"],
    "Urban Development & Infrastructure": ["city", "building", "street", "construction", "project", "development", "area", "infrastructure", "transport", "planning"],
    "Health & Wellness": ["health", "hospital", "care", "medical", "life", "service", "well", "program", "condition", "treatment"],
    "Economic & Business Trends": ["market", "business", "company", "industry", "economic", "price", "sale", "trade", "investment", "growth"],
    "Environmental Awareness & Sustainability": ["environment", "water", "energy", "sustainable", "climate", "resource", "conservation", "green", "pollution", "recycling"]
}

# "social_media" is an underscore-joined token — presumably matching the
# bigram tokens present in the corpus; verify against the documents.
seed_topics_2 = {
    "Cultural & Social Life": ["community", "cultural", "society", "event", "tradition", "art", "music", "festival", "celebration", "history"],
    "Technology & Innovation": ["development", "technology", "project", "research", "innovation", "system", "design", "engineering", "digital", "software"],
    "Public Services & Administration": ["service", "public", "administration", "government", "office", "policy", "management", "official", "department", "agency"],
    "Media & Communication": ["news", "media", "communication", "information", "broadcast", "press", "publication", "report", "journalism", "social_media"],
    "Legal & Justice System": ["law", "legal", "court", "justice", "case", "judge", "policy", "rights", "attorney", "sentence"]
}

seed_topics_3 = {
    "Health & Wellness": ["hospital", "health", "care", "medical", "treatment", "doctor", "nurse", "wellness", "disease", "therapy"],
    "Education & Learning": ["school", "education", "university", "student", "teacher", "class", "learning", "course", "study", "research"],
    "Environment & Nature": ["environment", "nature", "water", "plant", "animal", "conservation", "climate", "green", "earth", "sustainability"],
    "Economy & Business": ["business", "economy", "market", "company", "industry", "trade", "finance", "investment", "growth", "development"],
    "Arts & Entertainment": ["art", "music", "film", "book", "theater", "gallery", "entertainment", "performance", "artist", "exhibition"]
}
seed_topics_4 = {
    "Technology & Innovation": ["system", "technology", "data", "research", "development", "computer", "digital", "software", "network", "innovation"],
    "Travel & Tourism": ["travel", "tourism", "hotel", "trip", "visit", "destination", "journey", "tourist", "guide", "vacation"],
    "Food & Cuisine": ["food", "restaurant", "meal", "cook", "chef", "cuisine", "dining", "taste", "dish", "eat"],
    "Public Services & Infrastructure": ["service", "public", "community", "facility", "transport", "infrastructure", "project", "support", "management", "program"],
    "Science & Exploration": ["science", "research", "study", "space", "earth", "exploration", "scientific", "experiment", "project", "discovery"]
}

seed_topics_5 = {
    "Cultural & Social Issues": ["society", "culture", "community", "issue", "social", "public", "national", "discussion", "identity", "values"],
    "Recreation & Leisure": ["park", "game", "sport", "leisure", "recreation", "play", "hobby", "outdoor", "activity", "entertainment"],
    "Law & Justice": ["law", "justice", "legal", "court", "case", "judge", "right", "policy", "regulation", "authority"],
    "Financial Markets": ["market", "finance", "economy", "investment", "stock", "bank", "trade", "financial", "currency", "capital"],
    "Human Rights & Equality": ["right", "equality", "freedom", "social", "issue", "justice", "law", "policy", "human", "community"]
}
# All seed sets, in the order the experiment loop visits them.
# Note the order is reversed: set 5 comes first and set 1 last.
total_seed_topics = [seed_topics_5, seed_topics_4, seed_topics_3, seed_topics_2, seed_topics_1]

In [6]:
def train_lda_model(documents, num_topics, no_below, no_above, total_passes, random_state, low_value):
    """
    Train an LDA model on tokenized documents after vocabulary pruning and an IDF cut-off.

    Parameters:
    - documents: List of preprocessed documents, each represented as a list of tokens.
      It is iterated twice (dictionary build, then bag-of-words conversion), so it
      must be re-iterable (e.g. a list), not a one-shot generator.
    - num_topics: The desired number of topics.
    - no_below: Filter out tokens that appear in fewer than 'no_below' documents.
    - no_above: Filter out tokens that appear in more than 'no_above' proportion of documents.
    - total_passes: Number of passes through the corpus during training.
    - random_state: Seed for reproducibility.
    - low_value: IDF cut-off. A token is kept in a document only if its IDF weight
      is strictly greater than this value.

    Returns:
    - lda_model: The trained LDA model.
    - dictionary: Gensim dictionary created from the documents (after filter_extremes).
    - tfidf_corpus: The corpus actually used for training: bag-of-words documents
      restricted to tokens that pass the IDF cut-off. Despite the name, the values
      are the raw token counts, not TF-IDF weights.
    """

    # Build the vocabulary and drop very rare / very common tokens.
    dictionary = Dictionary(documents)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    # Document-term matrix as a list of (token_id, count) bags.
    corpus = [dictionary.doc2bow(doc) for doc in documents]

    # The TF-IDF model is fitted only to obtain per-token IDF weights;
    # documents themselves are never re-weighted.
    tfidf = TfidfModel(corpus)
    idf_by_token = tfidf.idfs  # looked up once instead of per token
    tfidf_corpus = [
        [(token_id, freq) for token_id, freq in bow if idf_by_token[token_id] > low_value]
        for bow in corpus
    ]

    # Train LDA on the filtered bags.
    lda_model = LdaModel(corpus=tfidf_corpus, num_topics=num_topics, id2word=dictionary,
                         passes=total_passes, random_state=random_state)

    return lda_model, dictionary, tfidf_corpus

In [7]:
def emphasize_seed_words(document, seed_words, factor):
    """
    Repeat seed words in a document so that they carry more weight.

    Parameters:
    - document: Iterable of tokens (tokens must be hashable, e.g. strings).
    - seed_words: Collection of tokens to emphasize. Any iterable is accepted;
      it is converted to a set once so each membership test is O(1).
    - factor: Total number of times each seed-word occurrence should appear.
      Values of 1 or less leave the document unchanged.

    Returns:
    - A new list with every token of `document` in its original order, where each
      occurrence of a seed word is followed by `factor - 1` extra copies of itself.
    """
    # Normalize once: avoids a linear scan per token for list input and keeps
    # one-shot iterables (generators) from being exhausted by repeated `in` tests.
    if not isinstance(seed_words, (set, frozenset)):
        seed_words = set(seed_words)

    emphasized_document = []
    for word in document:
        emphasized_document.append(word)
        if word in seed_words:
            # A non-positive multiplier yields an empty list, so factor <= 1 adds nothing.
            emphasized_document.extend([word] * (factor - 1))
    return emphasized_document

# Document preparation that applies seed-word emphasis to every document
def prepare_documents(df, seed_topics, factor):
    """
    Tokenize the 'documents' column of `df` and emphasize the seed words in each document.

    Parameters:
    - df: DataFrame with a 'documents' column. Missing values become empty documents;
      every other value is converted to a string and split on whitespace.
    - seed_topics: Mapping of topic label -> list of seed words. Only the words are
      used; the labels are ignored.
    - factor: Emphasis factor forwarded to emphasize_seed_words.

    Returns:
    - A list of token lists, one per row of `df`, with seed words repeated.
    """
    # Ensure all entries are strings and handle missing values
    documents = df['documents'].fillna('').astype(str).str.split().tolist()

    # Build the seed vocabulary once as a set: words shared by several topics are
    # deduplicated and the per-token membership test downstream becomes O(1).
    seed_words = {word for topic_words in seed_topics.values() for word in topic_words}

    return [emphasize_seed_words(doc, seed_words, factor) for doc in documents]

# Seed-word emphasis must happen before the dictionary and corpus are built, so each run below prepares the documents first and then trains the LDA model.
# Settings shared by every run. They never change between iterations, so they
# are defined once here instead of being re-assigned inside the loop.
factor = 5  # emphasis factor handed to prepare_documents
num_topics = 5
total_passes = 3
no_below = 15  # Example: Minimum document frequency
no_above = 0.2  # Example: Maximum document frequency proportion
low_value = 0.1  # IDF cut-off used by train_lda_model

# One experiment per seed-topic set, in the order given by total_seed_topics.
for seed_topics in total_seed_topics:

    # Use the configured `factor` (previously a literal 5 was passed and the
    # variable was silently ignored).
    seeded_documents = prepare_documents(df_loaded, seed_topics, factor)

    start_time_iter = time.time()  # Start time for this iteration

    # Train the LDA model with the current set of hyperparameters
    lda_model, dictionary, tfidf_corpus = train_lda_model(
        seeded_documents,
        num_topics=num_topics,
        no_below=no_below,
        no_above=no_above,
        total_passes=total_passes,
        random_state=100,
        low_value=low_value,
    )

    # Calculate Coherence Score using c_v measure.
    # NOTE(review): coherence is computed on the emphasized texts (seed words
    # duplicated), not on the original documents — confirm this is intended.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=seeded_documents, dictionary=dictionary, coherence='c_v')
    coherence_lda_cv = coherence_model_lda.get_coherence()

    end_time_iter = time.time()  # End time for this iteration
    iter_duration = round((end_time_iter - start_time_iter) / 60)

    print(f"no_below: {no_below}, no_above: {no_above}, low_value: {low_value}, Coherence: {coherence_lda_cv}, Time: {iter_duration} minutes")

    # Print topics for the current model
    print("Topics for the current model:")
    for idx, topic in lda_model.print_topics(-1):
        print(f"Topic: {idx} \nWords: {topic}\n")
    print("-"*20)

no_below: 15, no_above: 0.2, low_value: 0.1, Coherence: 0.4840537814924571, Time: 25 minutes
Topics for the current model:
Topic: 0 
Words: 0.060*"game" + 0.039*"play" + 0.012*"sport" + 0.009*"national" + 0.008*"team" + 0.007*"two" + 0.007*"first" + 0.006*"new_york" + 0.006*"today" + 0.005*"park"

Topic: 1 
Words: 0.018*"stock" + 0.012*"market" + 0.010*"park" + 0.009*"room" + 0.007*"new" + 0.007*"national" + 0.007*"car" + 0.006*"trade" + 0.006*"lot" + 0.006*"sale"

Topic: 2 
Words: 0.011*"right" + 0.011*"one" + 0.006*"time" + 0.005*"said" + 0.004*"day" + 0.004*"man" + 0.004*"would" + 0.004*"many" + 0.004*"make" + 0.004*"two"

Topic: 3 
Words: 0.009*"home" + 0.007*"society" + 0.007*"john" + 0.006*"park" + 0.005*"church" + 0.005*"school" + 0.005*"member" + 0.005*"william" + 0.005*"street" + 0.005*"son"

Topic: 4 
Words: 0.015*"court" + 0.014*"case" + 0.012*"public" + 0.010*"law" + 0.010*"national" + 0.009*"said" + 0.007*"bank" + 0.007*"state" + 0.006*"issue" + 0.006*"judge"

------------

In [8]:
# Seed vocabulary for this experiment: topic label -> seed tokens.
seed_topics = {
    "Sports": ["game", "team", "season", "play", "club", "win", "match", "score", "player", "coach"],
    "Rentals & Real Estate": ["house", "home", "room", "property", "rent", "estate", "apartment", "building", "lease", "mortgage"],
    "Philosophy & Thought": ["time", "life", "man", "world", "philosophy", "thought", "mind", "idea", "reason", "belief"],
    "Community Gatherings/Events": ["church", "school", "event", "member", "community", "meeting", "ceremony", "celebration", "gathering", "festival"],
    "Politics/Government": ["president", "state", "government", "senate", "congress", "election", "policy", "law", "political", "diplomacy"]
}

# Settings held constant across the whole sweep.
num_topics = 5
total_passes = 3
no_below = 15  # minimum document frequency
no_above = 0.2  # maximum document-frequency proportion
low_value = 0.1  # IDF cut-off used by train_lda_model

# Sweep over the seed-word emphasis factor; one model is trained per value.
factor_list = [5,10,15]
for factor in factor_list:
    seeded_documents = prepare_documents(df_loaded, seed_topics, factor)

    # The timer covers training plus coherence scoring, not document preparation.
    start_time_iter = time.time()

    lda_model, dictionary, tfidf_corpus = train_lda_model(
        seeded_documents,
        num_topics=num_topics,
        no_below=no_below,
        no_above=no_above,
        total_passes=total_passes,
        random_state=100,
        low_value=low_value,
    )

    # c_v coherence, scored against the emphasized texts of this factor
    # (so different factors are scored against different texts).
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=seeded_documents,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_lda_cv = coherence_model_lda.get_coherence()

    end_time_iter = time.time()
    iter_duration = round((end_time_iter - start_time_iter) / 60)

    # Summary line for this factor.
    print(f"factor: {factor}, no_below: {no_below}, no_above: {no_above}, low_value: {low_value}, Coherence: {coherence_lda_cv}, Time: {iter_duration} minutes")

    # Dump every topic of the trained model.
    print("Topics for the current model:")
    for idx, topic in lda_model.print_topics(-1):
        print(f"Topic: {idx} \nWords: {topic}\n")
    print("-"*20)

factor: 5, no_below: 15, no_above: 0.2, low_value: 0.1, Coherence: 0.4784201278974032, Time: 26 minutes
Topics for the current model:
Topic: 0 
Words: 0.048*"state" + 0.017*"president" + 0.016*"government" + 0.011*"law" + 0.009*"house" + 0.008*"said" + 0.008*"time" + 0.008*"congress" + 0.006*"election" + 0.006*"would"

Topic: 1 
Words: 0.055*"room" + 0.049*"house" + 0.034*"home" + 0.009*"rent" + 0.008*"apartment" + 0.007*"new" + 0.006*"property" + 0.006*"building" + 0.006*"car" + 0.005*"lot"

Topic: 2 
Words: 0.034*"time" + 0.021*"man" + 0.012*"life" + 0.011*"world" + 0.010*"one" + 0.005*"thought" + 0.004*"said" + 0.004*"day" + 0.004*"play" + 0.004*"two"

Topic: 3 
Words: 0.050*"home" + 0.031*"game" + 0.021*"team" + 0.014*"play" + 0.013*"win" + 0.011*"season" + 0.008*"score" + 0.007*"player" + 0.007*"match" + 0.006*"coach"

Topic: 4 
Words: 0.053*"club" + 0.049*"school" + 0.048*"member" + 0.039*"meeting" + 0.039*"church" + 0.023*"president" + 0.017*"building" + 0.010*"community" + 0.00

In [9]:
# Seed vocabulary for this run: topic label -> seed tokens.
seed_topics = {
    "Sports": ["game", "team", "season", "play", "club", "win", "match", "score", "player", "coach"],
    "Rentals & Real Estate": ["house", "home", "room", "property", "rent", "estate", "apartment", "lease", "mortgage"],
    "Philosophy & Thought": ["time", "life", "man", "world", "philosophy", "thought", "mind", "idea", "reason", "belief"],
    "Community Gatherings/Events": ["church", "school", "member", "community", "meeting", "ceremony", "celebration", "gathering", "festival"],
    "Politics/Government": ["president", "state", "government", "senate", "congress", "election", "policy", "law", "political", "diplomacy"]
}

# Run configuration.
factor = 10  # seed-word emphasis factor
num_topics = 5
total_passes = 3
no_below = 15  # minimum document frequency
no_above = 0.2  # maximum document-frequency proportion
low_value = 0.1  # IDF cut-off used by train_lda_model

# Emphasize the seed words before any dictionary/corpus is built.
seeded_documents = prepare_documents(df_loaded, seed_topics, factor)

# The timer covers training plus coherence scoring, not document preparation.
start_time_iter = time.time()

lda_model, dictionary, tfidf_corpus = train_lda_model(
    seeded_documents,
    num_topics=num_topics,
    no_below=no_below,
    no_above=no_above,
    total_passes=total_passes,
    random_state=100,
    low_value=low_value,
)

# c_v coherence, scored against the emphasized texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=seeded_documents,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda_cv = coherence_model_lda.get_coherence()

end_time_iter = time.time()
iter_duration = round((end_time_iter - start_time_iter) / 60)

# Summary line for this run.
print(f"factor: {factor}, no_below: {no_below}, no_above: {no_above}, low_value: {low_value}, Coherence: {coherence_lda_cv}, Time: {iter_duration} minutes")

# Dump every topic of the trained model.
print("Topics for the current model:")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")
print("-"*20)

factor: 10, no_below: 15, no_above: 0.2, low_value: 0.1, Coherence: 0.47079236031672106, Time: 25 minutes
Topics for the current model:
Topic: 0 
Words: 0.093*"state" + 0.048*"president" + 0.029*"government" + 0.020*"law" + 0.016*"member" + 0.014*"congress" + 0.011*"election" + 0.010*"senate" + 0.008*"political" + 0.008*"policy"

Topic: 1 
Words: 0.108*"house" + 0.093*"room" + 0.020*"property" + 0.015*"rent" + 0.014*"estate" + 0.013*"apartment" + 0.010*"home" + 0.006*"new" + 0.005*"car" + 0.005*"lot"

Topic: 2 
Words: 0.072*"time" + 0.040*"man" + 0.020*"life" + 0.019*"world" + 0.009*"thought" + 0.008*"one" + 0.007*"idea" + 0.005*"mind" + 0.005*"reason" + 0.005*"said"

Topic: 3 
Words: 0.168*"home" + 0.080*"game" + 0.055*"team" + 0.051*"play" + 0.037*"win" + 0.035*"season" + 0.029*"club" + 0.021*"score" + 0.019*"player" + 0.017*"match"

Topic: 4 
Words: 0.057*"school" + 0.048*"member" + 0.046*"church" + 0.043*"club" + 0.041*"meeting" + 0.008*"community" + 0.006*"john" + 0.004*"president