In [10]:
import os  # For operating system dependent functionality
import time  # For timing code execution
import re  # For regular expressions

import pandas as pd  # For data manipulation and analysis
import nltk  # For natural language processing tasks
from nltk.corpus import stopwords  # For stopwords
from nltk.stem import WordNetLemmatizer  # For word lemmatization

# For Gensim models and functionalities
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaModel, TfidfModel
from gensim.corpora import Dictionary

import spacy  # For advanced natural language processing

from gensim.models.coherencemodel import CoherenceModel
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

## Model 1 - Guided LDA (Manual Approach) utilizing Bigrams and TFIDF

In [2]:
# Location of the pre-processed corpus. The directory can be overridden with the
# NLP_DATA_DIR environment variable so the notebook is not tied to one machine;
# the original local path stays as the default.
directory = os.environ.get("NLP_DATA_DIR", "D:/2_nlp")
file_name = "documents_with_bigrams.xlsx"
full_path = os.path.join(directory, file_name)

start_time = time.time()
# Load the Excel file into a DataFrame
df_loaded = pd.read_excel(full_path)
end_time = time.time()
# df_loaded now holds the workbook contents; later cells read its 'documents' column.
# Elapsed time is rounded to whole minutes, so a fast load prints 0.
print("The total time taken in mins is {}".format(round((end_time - start_time) / 60)))

The total time taken in mins is 0


In [5]:
def train_lda_model(documents, num_topics, no_below, no_above, total_passes, random_state, low_value):
    """
    Trains an LDA model on tokenized documents after dictionary pruning and an IDF cut-off.

    Parameters:
    - documents: List of documents, each represented as a list of tokens.
    - num_topics: The desired number of topics.
    - no_below: Filter out tokens that appear in fewer than 'no_below' documents.
    - no_above: Filter out tokens that appear in more than 'no_above' proportion of documents.
    - total_passes: Number of passes through the corpus during training.
    - random_state: Seed for reproducibility.
    - low_value: IDF cut-off; tokens whose IDF is not strictly greater than this value
      are removed from every document before training.

    Returns:
    - lda_model: The trained LDA model.
    - dictionary: Gensim dictionary created from the documents (after filter_extremes).
    - tfidf_corpus: The IDF-filtered bag-of-words corpus the model was trained on.
      Its entries are still (token_id, raw count) pairs, not TF-IDF weights.
    """

    # Create a dictionary and filter extremes
    dictionary = Dictionary(documents)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    # Create the Document-Term Matrix
    corpus = [dictionary.doc2bow(doc) for doc in documents]

    # Fit TF-IDF only to obtain per-token IDF values; the counts themselves are kept.
    tfidf = TfidfModel(corpus)

    # Resolve the cut-off once per token id instead of once per token occurrence.
    # NOTE(review): assuming gensim's default IDF (log2(N / df)), every token that
    # survives `no_above=p` has IDF >= log2(1 / p), so a small `low_value` may
    # remove nothing at all -- confirm the threshold is doing what is intended.
    kept_ids = {token_id for token_id, idf in tfidf.idfs.items() if idf > low_value}
    tfidf_corpus = [
        [(token_id, count) for token_id, count in bow if token_id in kept_ids]
        for bow in corpus
    ]

    # Initialize and train the LDA model
    lda_model = LdaModel(corpus=tfidf_corpus, num_topics=num_topics, id2word=dictionary,
                         passes=total_passes, random_state=random_state)

    return lda_model, dictionary, tfidf_corpus

def emphasize_seed_words(document, seed_words, factor):
    """
    Return a copy of `document` in which each seed-word occurrence is followed by
    `factor - 1` extra copies of itself, so seed words carry more weight.

    Tokens that are not in `seed_words` are copied through once, in order.
    """
    boosted = []
    for token in document:
        # Extra copies only for seed tokens; a factor of 1 or less adds none.
        extra_copies = [token] * (factor - 1) if token in seed_words else []
        boosted += [token] + extra_copies
    return boosted

# Update your document preparation to include seed word emphasis
def prepare_documents(df, seed_topics, factor):
    """
    Tokenize the 'documents' column of `df` and emphasize every seed word.

    Parameters:
    - df: DataFrame with a 'documents' column; missing values become empty documents.
    - seed_topics: Mapping of topic label -> list of seed words.
    - factor: Emphasis factor forwarded to emphasize_seed_words.

    Returns:
    - List of token lists, one per row of `df`, in row order.
    """
    # Ensure all entries are strings and handle missing values
    documents = df['documents'].fillna('').astype(str).str.split().tolist()
    # Collect the seed words of all topics into a set: membership is tested once per
    # token of the whole corpus, so a constant-time lookup replaces a list scan.
    seed_words = {word for topic_words in seed_topics.values() for word in topic_words}
    return [emphasize_seed_words(doc, seed_words, factor) for doc in documents]

In [6]:
# Seed vocabulary per intended topic (label -> seed words)
seed_topics = {
    "Sports": [
        "game", "team", "season", "play", "club",
        "win", "match", "score", "player", "coach",
    ],
    "Rentals & Real Estate": [
        "house", "home", "room", "property", "rent",
        "estate", "apartment", "building", "lease", "mortgage",
    ],
    "Philosophy & Thought": [
        "time", "life", "man", "world", "philosophy",
        "thought", "mind", "idea", "reason", "belief",
    ],
    "Community Gatherings/Events": [
        "church", "school", "event", "member", "community",
        "meeting", "ceremony", "celebration", "gathering", "festival",
    ],
    "Politics/Government": [
        "president", "state", "government", "senate", "congress",
        "election", "policy", "law", "political", "diplomacy",
    ],
}
factor = 10  # emphasis factor handed to prepare_documents

seeded_documents = prepare_documents(df_loaded, seed_topics, factor)

# Model size and training length
num_topics = 5
total_passes = 3

# Vocabulary pruning and IDF cut-off
no_below = 15  # minimum document frequency
no_above = 0.2  # maximum document-frequency proportion
low_value = 0.1  # cut-off passed to train_lda_model

start_time_iter = time.time()  # timing covers training plus coherence scoring

# Fit the guided model on the seed-emphasized documents
lda_model, dictionary, tfidf_corpus = train_lda_model(
    seeded_documents,
    num_topics=num_topics,
    no_below=no_below,
    no_above=no_above,
    total_passes=total_passes,
    random_state=100,
    low_value=low_value,
)

# c_v coherence of the fitted model.
# NOTE(review): the reference texts here are the seed-duplicated documents, while the
# unguided model below is scored on un-duplicated tokens -- presumably the two
# coherence values are not directly comparable; confirm before comparing them.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=seeded_documents,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda_cv = coherence_model_lda.get_coherence()

end_time_iter = time.time()
iter_duration = round((end_time_iter - start_time_iter) / 60)  # whole minutes

print(f"factor: {factor}, no_below: {no_below}, no_above: {no_above}, low_value: {low_value}, Coherence: {coherence_lda_cv}, Time: {iter_duration} minutes")

# Show every topic of the fitted model
print("Topics for the current model:")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")
print("-" * 20)

factor: 10, no_below: 15, no_above: 0.2, low_value: 0.1, Coherence: 0.4713707960728266, Time: 26 minutes
Topics for the current model:
Topic: 0 
Words: 0.089*"state" + 0.049*"president" + 0.028*"government" + 0.019*"law" + 0.018*"member" + 0.014*"congress" + 0.011*"election" + 0.009*"senate" + 0.008*"political" + 0.007*"policy"

Topic: 1 
Words: 0.106*"house" + 0.090*"room" + 0.041*"home" + 0.019*"property" + 0.015*"rent" + 0.014*"estate" + 0.013*"apartment" + 0.006*"new" + 0.004*"car" + 0.004*"lot"

Topic: 2 
Words: 0.075*"time" + 0.041*"man" + 0.021*"life" + 0.019*"world" + 0.010*"thought" + 0.008*"one" + 0.007*"idea" + 0.006*"mind" + 0.005*"reason" + 0.005*"said"

Topic: 3 
Words: 0.089*"club" + 0.081*"game" + 0.055*"team" + 0.052*"play" + 0.045*"building" + 0.037*"win" + 0.035*"season" + 0.023*"event" + 0.021*"score" + 0.019*"player"

Topic: 4 
Words: 0.098*"home" + 0.055*"school" + 0.044*"church" + 0.043*"member" + 0.038*"meeting" + 0.005*"john" + 0.004*"ceremony" + 0.004*"william

### Topic 0 : Political Governance
### Topic 1: Rentals/Real estate/living spaces
### Topic 2: Philosophical concepts
### Topic 3: Sports
### Topic 4: Community gatherings/Social life

## Model 2 - Unguided LDA utilizing Bigrams and TFIDF

In [7]:
# Tokenize exactly as prepare_documents does for the guided model: missing rows become
# empty documents (instead of the literal token "nan") and splitting on any run of
# whitespace avoids empty-string tokens from repeated spaces.
documents_with_bigrams = df_loaded['documents'].fillna('').astype(str).str.split().tolist()

In [8]:
# Hyperparameters for the unguided run
num_topics = 5
total_passes = 3
no_below = 15  # minimum document frequency
no_above = 0.2  # maximum document-frequency proportion
low_value = 0.01  # cut-off passed to train_lda_model

start_time_iter = time.time()  # timing covers training plus coherence scoring

# Fit the unguided model on the plain tokenized documents
lda_model, dictionary, tfidf_corpus = train_lda_model(
    documents_with_bigrams,
    num_topics=num_topics,
    no_below=no_below,
    no_above=no_above,
    total_passes=total_passes,
    random_state=100,
    low_value=low_value,
)

# c_v coherence of the fitted model against the same tokenized documents
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=documents_with_bigrams,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda_cv = coherence_model_lda.get_coherence()

end_time_iter = time.time()
iter_duration = round((end_time_iter - start_time_iter) / 60)  # whole minutes

print(f"no_below: {no_below}, no_above: {no_above}, low_value: {low_value}, Coherence: {coherence_lda_cv}, Time: {iter_duration} minutes")

# Show every topic of the fitted model
print("Topics for the current model:")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")
print("-" * 20)

no_below: 15, no_above: 0.2, low_value: 0.01, Coherence: 0.5292118758821858, Time: 28 minutes
Topics for the current model:
Topic: 0 
Words: 0.010*"game" + 0.007*"two" + 0.007*"one" + 0.007*"team" + 0.006*"first" + 0.006*"play" + 0.005*"new_york" + 0.005*"three" + 0.004*"season" + 0.004*"club"

Topic: 1 
Words: 0.011*"room" + 0.008*"new" + 0.008*"car" + 0.007*"lot" + 0.006*"500" + 0.006*"home" + 0.006*"phone" + 0.006*"350" + 0.006*"house" + 0.005*"ave"

Topic: 2 
Words: 0.009*"one" + 0.006*"said" + 0.005*"would" + 0.005*"time" + 0.004*"man" + 0.004*"day" + 0.004*"many" + 0.003*"say" + 0.003*"way" + 0.003*"two"

Topic: 3 
Words: 0.010*"home" + 0.007*"john" + 0.005*"church" + 0.005*"son" + 0.005*"william" + 0.005*"school" + 0.005*"street" + 0.005*"member" + 0.005*"miss" + 0.004*"club"

Topic: 4 
Words: 0.009*"state" + 0.009*"said" + 0.006*"year" + 0.005*"new" + 0.005*"would" + 0.005*"today" + 0.005*"president" + 0.004*"district" + 0.004*"may" + 0.004*"committee"

--------------------


### Topic 0: Sports
### Topic 1: Rentals/Real estate/living spaces
### Topic 2: Philosophical concepts
### Topic 3: Community gatherings/Social life
### Topic 4: Political Governance

## Model 3 - Guided LDA with CorEx

In [11]:
def perform_guided_topic_modeling(df_loaded, seed_words, num_topics=5, ngram_range=(1,1), max_features=2000, anchor_strength=3, random_state=100):
    """
    Performs guided topic modeling using CorEx on a given dataset with seed words for topics.

    Parameters:
    - df_loaded: DataFrame with a 'documents' column of text. It is only read; the
      caller's DataFrame is not modified.
    - seed_words: Anchor words handed to CorEx as `anchors`.
    - num_topics: Number of CorEx topics (passed as `n_hidden`).
    - ngram_range: n-gram range passed to CountVectorizer.
    - max_features: Vocabulary size limit passed to CountVectorizer.
    - anchor_strength: Anchor strength passed to the CorEx fit call.
    - random_state: Seed passed to CorEx.

    Returns:
    - model: The fitted CorEx model.
    - words: The vocabulary as returned by the vectorizer's get_feature_names_out().
    """
    # Build the document list from a local copy of the column: missing values become
    # empty documents and every entry is coerced to str, without writing anything
    # back into the DataFrame that was passed in.
    documents_list = df_loaded['documents'].fillna('').astype(str).tolist()

    # Create a document-term matrix; it is handed to CorEx as returned by the
    # vectorizer (no dense conversion).
    vectorizer = CountVectorizer(max_features=max_features, ngram_range=ngram_range)
    doc_word_matrix = vectorizer.fit_transform(documents_list)

    words = list(np.asarray(vectorizer.get_feature_names_out()))

    # Instantiate and fit the CorEx model
    model = ct.Corex(n_hidden=num_topics, seed=random_state)
    model.fit(doc_word_matrix, words=words, anchors=seed_words, anchor_strength=anchor_strength)

    return model, words

# Settings for the anchored CorEx run; df_loaded must already hold the 'documents' column
num_topics = 5
ngram_range = (1,1)
max_features = 20000
anchor_strength = 3
random_state = 100

# One anchor list per topic, in topic order
seed_words = [
    [
        "game", "team", "season", "play", "club",
        "win", "match", "score", "player", "coach",
    ],
    [
        "house", "home", "room", "property", "rent",
        "estate", "apartment", "building", "lease", "mortgage",
    ],
    [
        "time", "life", "man", "world", "philosophy",
        "thought", "mind", "idea", "reason", "belief",
    ],
    [
        "church", "school", "event", "member", "community",
        "meeting", "ceremony", "celebration", "gathering", "festival",
    ],
    [
        "president", "state", "government", "senate", "congress",
        "election", "policy", "law", "political", "diplomacy",
    ],
]

start_time = time.time()  # timing covers fitting and printing the topics
model, words = perform_guided_topic_modeling(
    df_loaded,
    seed_words,
    num_topics=num_topics,
    ngram_range=ngram_range,
    max_features=max_features,
    anchor_strength=anchor_strength,
    random_state=random_state,
)

topics = model.get_topics()
print("Max features parameter value is {}".format(max_features))

# Each topic entry unpacks into three values, printed space-separated on one line
for n, topic in enumerate(topics):
    print(f"Topic {n}:")
    for word, weight, correlation in topic:
        print(f"{word} {weight} {correlation}")
    print()

print()
end_time = time.time()
print("The total time taken in mins is {}".format(round((end_time - start_time) / 60)))
print("-" * 20)

Max features parameter value is 20000
Topic 0:
game 0.24599339220013333 1.0
team 0.15906783920078568 1.0
play 0.1543643886541341 1.0
season 0.09097637130320244 1.0
win 0.08268616178949217 1.0
club 0.07707404226812839 1.0
player 0.07386469422388808 1.0
score 0.06779579290373153 1.0
two 0.04583469134462471 1.0
coach 0.04485859775810235 1.0

Topic 1:
room 0.11916416528342097 1.0
house 0.06689389157650763 1.0
home 0.06389565580179099 1.0
ave 0.06253313567951498 1.0
rent 0.05792231026811728 1.0
call 0.050620644632142615 1.0
500 0.04216220984559702 1.0
bath 0.04053681365725354 1.0
350 0.03634583043422621 1.0
phone 0.03116802509117465 1.0

Topic 2:
time 0.24966413344011112 1.0
one 0.10377713339238194 1.0
man 0.10287236864570998 1.0
life 0.08020033268085972 1.0
world 0.07729381716308922 1.0
thought 0.06347823702770003 1.0
reason 0.053425403759152806 1.0
idea 0.050720793705444606 1.0
make 0.05010709963558555 1.0
many 0.05002916299891667 1.0

Topic 3:
church 0.21007647134907936 1.0
member 0.1409

### Topic 0: Sports
### Topic 1: Rentals/Real estate/living spaces
### Topic 2: Philosophical concepts
### Topic 3: Community gatherings/Social life
### Topic 4: Political Governance