In [1]:
import os  # For operating system dependent functionality
import time  # For timing code execution
import re  # For regular expressions

import pandas as pd  # For data manipulation and analysis
import nltk  # For natural language processing tasks
from nltk.corpus import stopwords  # For stopwords
from nltk.stem import WordNetLemmatizer  # For word lemmatization

# For Gensim models and functionalities
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaModel, TfidfModel
from gensim.corpora import Dictionary

import spacy  # For advanced natural language processing

from gensim.models.coherencemodel import CoherenceModel
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

  from pandas.core import (


In [2]:
# Location of the spreadsheet that stores the bigram-augmented documents
directory = "D:/2_nlp"
file_name = "documents_with_bigrams.xlsx"
full_path = os.path.join(directory, file_name)

# The timer covers both reading the workbook and rebuilding the token lists
start_time = time.time()
df_loaded = pd.read_excel(full_path)

# Every cell of the 'documents' column is coerced to a string and split on
# single spaces, producing one token list per row
documents_with_bigrams = [str(cell).split(' ') for cell in df_loaded['documents']]
end_time = time.time()

# documents_with_bigrams now holds the list-of-lists structure
elapsed_minutes = round((end_time - start_time) / 60)
print("The total time taken in mins is {}".format(elapsed_minutes))

The total time taken in mins is 1


In [3]:
# Location of the full prepared dataset (same folder, different workbook)
directory = "D:/2_nlp"
file_name = "Prepared_last_25_years_250_MB_dataset.xlsx"
full_path = os.path.join(directory, file_name)

# Time only the read of the workbook
start_time = time.time()
entire_df_loaded = pd.read_excel(full_path)
end_time = time.time()

elapsed_minutes = round((end_time - start_time) / 60)
print("The total time taken in mins is {}".format(elapsed_minutes))

The total time taken in mins is 4


In [4]:
# Replace the 'date' column with its pandas datetime conversion
date_column = 'date'
entire_df_loaded[date_column] = pd.to_datetime(entire_df_loaded[date_column])

In [5]:
def train_lda_model(documents, num_topics, no_below, no_above, total_passes, random_state, low_value):
    """
    Train a gensim LDA model on tokenized documents after pruning the vocabulary.

    The vocabulary is first pruned by document frequency (``filter_extremes``).
    Then, inside every bag-of-words, each term whose IDF weight is not strictly
    greater than ``low_value`` is dropped. The surviving entries keep their raw
    counts: the filter looks at the IDF weight only and no TF-IDF re-weighting
    is applied to the corpus handed to LDA.

    Parameters:
    - documents: Preprocessed documents, each represented as a list of tokens.
      The collection is iterated more than once, so pass a list rather than a
      one-shot generator.
    - num_topics: The desired number of topics.
    - no_below: Filter out tokens that appear in fewer than 'no_below' documents.
    - no_above: Filter out tokens that appear in more than 'no_above' proportion of documents.
    - total_passes: Number of passes through the corpus during training.
    - random_state: Seed for reproducibility.
    - low_value: IDF cut-off; a term is kept only if its IDF is greater than this value.

    Returns:
    - lda_model: The trained LDA model.
    - dictionary: Gensim dictionary created from the documents (after filtering extremes).
    - tfidf_corpus: The IDF-filtered bag-of-words corpus the model was trained on.
    """

    # Create a dictionary and filter extremes
    dictionary = Dictionary(documents)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    # Create the Document-Term Matrix
    corpus = [dictionary.doc2bow(doc) for doc in documents]

    # Drop low-IDF terms from every document. The idfs mapping is looked up
    # once outside the loop, and the loop variables avoid shadowing the
    # builtin `id`.
    tfidf = TfidfModel(corpus)
    idf_by_term = tfidf.idfs
    tfidf_corpus = [
        [(term_id, count) for term_id, count in bow if idf_by_term[term_id] > low_value]
        for bow in corpus
    ]

    # Initialize and train the LDA model
    lda_model = LdaModel(corpus=tfidf_corpus, num_topics=num_topics, id2word=dictionary,
                         passes=total_passes, random_state=random_state)

    return lda_model, dictionary, tfidf_corpus

def emphasize_seed_words(document, seed_words, factor):
    """
    Return a copy of the document in which every seed word occurs `factor` times in a row.

    Parameters:
    - document: Iterable of tokens (hashable, e.g. strings).
    - seed_words: Collection of tokens to emphasize. A set, frozenset or dict is
      used as-is; any other iterable is converted to a frozenset once, so each
      membership test is O(1) instead of a scan over the whole collection.
    - factor: Integer number of consecutive copies emitted for each occurrence
      of a seed word. Values of 1 or less leave the document unchanged.

    Returns:
    - A new list of tokens; the input document is not modified.
    """
    if isinstance(seed_words, (set, frozenset, dict)):
        seed_lookup = seed_words
    else:
        seed_lookup = frozenset(seed_words)

    emphasized_document = []
    for word in document:
        emphasized_document.append(word)
        if word in seed_lookup:
            # The word itself was appended above, so add the remaining copies
            emphasized_document.extend([word] * (factor - 1))
    return emphasized_document

# Document preparation with seed-word emphasis
def prepare_documents(df, seed_topics, factor):
    """
    Tokenize the 'documents' column and repeat every seed word `factor` times.

    Parameters:
    - df: DataFrame with a 'documents' column of whitespace-separated text.
      Missing values are treated as empty documents.
    - seed_topics: Mapping of topic name to a list of seed words.
    - factor: Number of copies of each seed-word occurrence in the output.

    Returns:
    - List of token lists, one per row of `df`.
    """
    # Ensure all entries are strings and handle missing values
    documents = df['documents'].fillna('').astype(str).str.split().tolist()

    # Flatten the seed words of all topics into one frozenset, built once, so
    # the per-token membership test is a hash lookup rather than a list scan
    seed_words = frozenset(
        word for topic_words in seed_topics.values() for word in topic_words
    )

    return [emphasize_seed_words(doc, seed_words, factor) for doc in documents]

In [6]:
# Seed vocabulary for each guided topic; every listed word is repeated
# `factor` times wherever it occurs in a document
seed_topics = {
    "Sports": [
        "game", "team", "season", "play", "club",
        "win", "match", "score", "player", "coach",
    ],
    "Rentals & Real Estate": [
        "house", "home", "room", "property", "rent",
        "estate", "apartment", "building", "lease", "mortgage",
    ],
    "Philosophy & Thought": [
        "time", "life", "man", "world", "philosophy",
        "thought", "mind", "idea", "reason", "belief",
    ],
    "Community Gatherings/Events": [
        "church", "school", "event", "member", "community",
        "meeting", "ceremony", "celebration", "gathering", "festival",
    ],
    "Politics/Government": [
        "president", "state", "government", "senate", "congress",
        "election", "policy", "law", "political", "diplomacy",
    ],
}
factor = 10

seeded_documents = prepare_documents(df_loaded, seed_topics, factor)

# Model size and number of training passes
num_topics = 5
total_passes = 3

# Vocabulary pruning settings
no_below = 15  # Minimum document frequency
no_above = 0.2  # Maximum document frequency proportion
low_value = 0.1  # IDF cut-off passed to train_lda_model

# The timer covers training and the coherence computation, not document preparation
start_time_iter = time.time()

lda_model, dictionary, tfidf_corpus = train_lda_model(
    seeded_documents,
    num_topics=num_topics,
    no_below=no_below,
    no_above=no_above,
    total_passes=total_passes,
    random_state=100,
    low_value=low_value,
)

# Coherence score using the c_v measure
# NOTE(review): the texts given to the coherence model are the seed-emphasized
# documents, so the seed-word repetition feeds into the score - confirm this is intended
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=seeded_documents,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda_cv = coherence_model_lda.get_coherence()

end_time_iter = time.time()
iter_duration = round((end_time_iter - start_time_iter) / 60)

print(f"factor: {factor}, no_below: {no_below}, no_above: {no_above}, low_value: {low_value}, Coherence: {coherence_lda_cv}, Time: {iter_duration} minutes")

# Report the words of every topic in the trained model
print("Topics for the current model:")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")
print("-"*20)

factor: 10, no_below: 15, no_above: 0.2, low_value: 0.1, Coherence: 0.4713707960728266, Time: 25 minutes
Topics for the current model:
Topic: 0 
Words: 0.089*"state" + 0.049*"president" + 0.028*"government" + 0.019*"law" + 0.018*"member" + 0.014*"congress" + 0.011*"election" + 0.009*"senate" + 0.008*"political" + 0.007*"policy"

Topic: 1 
Words: 0.106*"house" + 0.090*"room" + 0.041*"home" + 0.019*"property" + 0.015*"rent" + 0.014*"estate" + 0.013*"apartment" + 0.006*"new" + 0.004*"car" + 0.004*"lot"

Topic: 2 
Words: 0.075*"time" + 0.041*"man" + 0.021*"life" + 0.019*"world" + 0.010*"thought" + 0.008*"one" + 0.007*"idea" + 0.006*"mind" + 0.005*"reason" + 0.005*"said"

Topic: 3 
Words: 0.089*"club" + 0.081*"game" + 0.055*"team" + 0.052*"play" + 0.045*"building" + 0.037*"win" + 0.035*"season" + 0.023*"event" + 0.021*"score" + 0.019*"player"

Topic: 4 
Words: 0.098*"home" + 0.055*"school" + 0.044*"church" + 0.043*"member" + 0.038*"meeting" + 0.005*"john" + 0.004*"ceremony" + 0.004*"william

In [7]:
# Step 1: Build the bag-of-words corpus from 'prepared_text'.
# Missing values are replaced with '' before the string conversion (the same
# handling prepare_documents applies to the training documents); otherwise
# astype(str) would turn them into the literal token "nan".
prepared_texts = entire_df_loaded['prepared_text'].fillna('').astype(str).tolist()
articles_corpus = [dictionary.doc2bow(text.split()) for text in prepared_texts]

# Step 2: Get the topic distribution for each document.
# minimum_probability=0.0 keeps every topic in the result, so the max() below
# never receives an empty list, whatever the number of topics.
topic_distributions = [
    lda_model.get_document_topics(bow, minimum_probability=0.0)
    for bow in articles_corpus
]

# Step 3: Find the dominant topic (highest probability) for each document
dominant_topics = [
    max(dist, key=lambda pair: pair[1])[0] for dist in topic_distributions
]

# Step 4: Assign the dominant topics to the DataFrame
entire_df_loaded['dominant_topic'] = dominant_topics

In [9]:
# Preview the first 100 rows: the prepared text next to its assigned dominant topic
preview_columns = ['prepared_text', 'dominant_topic']
entire_df_loaded[preview_columns].head(100)

Unnamed: 0,prepared_text,dominant_topic
0,dark name lockmon jansen giont coach san franc...,3
1,faloney figure swap etcheverry deal rock conod...,3
2,home ice help chicago gain lead leoque there p...,3
3,chamberlain fire 4o la wilt chamberlain keep s...,3
4,raped home run first time faced major league p...,3
...,...,...
95,gourmet salad dressing u cup finely chopped gr...,2
96,2 3ounce package cream cheese teaspoon grated ...,1
97,new improved crystal clear food wrap thats eas...,2
98,refreshing change highway departmentpar serv i...,0


In [10]:
# Write the frame, now including 'dominant_topic', to an Excel workbook.
# The path is relative, so the file lands in the current working directory;
# the row index is not written.
entire_df_loaded.to_excel(
    "guided_lda_dominant_topic_classification.xlsx",
    index=False,
)