In [5]:
import pandas as pd
import os
import re
import time
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel
import numpy as np

In [2]:
# Folder holding the prepared dataset. Defaults to the original location but
# can be redirected without editing the notebook by setting the NLP_DATA_DIR
# environment variable before starting the kernel.
directory = os.environ.get("NLP_DATA_DIR", "D:/2_nlp")

# Name of the prepared workbook inside that folder
prepared_file_name_250_MB = "Prepared_last_25_years_250_MB_dataset.xlsx"

# Full path to the workbook
prepared_file_path_250_MB = os.path.join(directory, prepared_file_name_250_MB)

# Load the workbook into a DataFrame; later cells read its 'prepared_text' column
prepared_last_25_years_df = pd.read_excel(prepared_file_path_250_MB)

In [3]:
start_time = time.time()

# Empty spreadsheet cells are loaded as NaN. Casting them straight to str
# would yield the literal text "nan", which is three characters long and would
# therefore survive the length filter below and leak into the vocabulary as a
# bogus token. Replace missing values with empty text before the cast.
prepared_text = prepared_last_25_years_df['prepared_text'].fillna('').astype(str)

# Tokenise on whitespace and keep only words with at least 3 characters.
# Each document becomes a list of tokens (possibly empty).
documents = [
    [word for word in text.split() if len(word) > 2]
    for text in prepared_text
]

end_time = time.time()
print("The total time taken in mins is {}".format(round((end_time - start_time) / 60)))

The total time taken in mins is 1


## Unguided LDA

In [7]:
def train_lda_model(documents, num_topics, no_below, no_above, total_passes, random_state):
    """
    Fit an LDA topic model on already-tokenised documents.

    The vocabulary is built from ``documents``, pruned with
    ``filter_extremes``, and every document is then converted to its
    bag-of-words form before the model is trained.

    Parameters:
    - documents: List of documents, each one a list of word tokens.
    - num_topics: How many topics the model should learn.
    - no_below: Passed to ``filter_extremes``; words found in fewer documents
      than this are dropped.
    - no_above: Passed to ``filter_extremes``; words found in more than this
      fraction of documents are dropped.
    - total_passes: Number of training passes over the corpus.
    - random_state: Seed forwarded to the model for reproducibility.

    Returns:
    - lda_model: The trained LDA model.
    - dictionary: The pruned Gensim dictionary.
    - corpus: List with one bag-of-words entry per input document.
    """
    # Vocabulary first, then prune it in place
    vocab = Dictionary(documents)
    vocab.filter_extremes(no_below=no_below, no_above=no_above)

    # One bag-of-words vector per document, in input order
    bow_corpus = list(map(vocab.doc2bow, documents))

    # eval_every=None is forwarded unchanged from the original configuration
    model = LdaModel(
        corpus=bow_corpus,
        id2word=vocab,
        num_topics=num_topics,
        passes=total_passes,
        random_state=random_state,
        eval_every=None,
    )

    return model, vocab, bow_corpus

In [8]:
# Set fixed hyperparameters
num_topics = 5
total_passes = 3
no_below = 20
random_state = 100

# Define the range of no_above to explore
no_above_values = [0.05, 0.06, 0.07, 0.08, 0.09, 0.1]

# One summary dict per trained model
results = []

# lda_model / dictionary / corpus are overwritten on every iteration, so after
# the loop they describe the LAST model, not the best one. Keep the winning
# trio separately so it can be used afterwards without retraining.
best_lda_model = None
best_dictionary = None
best_corpus = None
best_coherence = float("-inf")

# Loop through each value of no_above
for no_above in no_above_values:
    start_time_iter = time.time()  # Start time for this iteration

    # Train the LDA model for the current value of no_above
    lda_model, dictionary, corpus = train_lda_model(documents, num_topics, no_below, no_above, total_passes, random_state)

    # Calculate Coherence Score using c_v measure
    coherence_model_lda = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
    coherence_lda_cv = coherence_model_lda.get_coherence()

    end_time_iter = time.time()  # End time for this iteration
    iter_duration = round((end_time_iter - start_time_iter) / 60)  # Whole minutes

    # Store the results
    results.append({
        'no_above': no_above,
        'coherence_score': coherence_lda_cv,
        'iteration_time_mins': iter_duration
    })

    # Remember the best model so far. A NaN score never satisfies ">" and is
    # therefore skipped; strict ">" keeps the earliest model on ties, matching
    # what max() picks below.
    if coherence_lda_cv > best_coherence:
        best_coherence = coherence_lda_cv
        best_lda_model, best_dictionary, best_corpus = lda_model, dictionary, corpus

    # Print the coherence score for the current model
    print(f"no_above: {no_above}, Coherence: {coherence_lda_cv}, Time: {iter_duration} mins")

    # Print the topics for the current model
    print("\nTopics for the current model:")
    for idx, topic in lda_model.print_topics(-1):
        print(f"Topic: {idx} \nWords: {topic}")
    print("-" * 20)

# Find and print the best result. Comparisons against NaN are always False, so
# a plain max() could report a NaN-scored run as the winner depending on its
# position in the list; rank NaN scores below every real score instead.
best_result = max(
    results,
    key=lambda x: float("-inf") if np.isnan(x['coherence_score']) else x['coherence_score'],
)
print("\nBest Model's Params: ", best_result)
print("Best Coherence Score: ", best_result['coherence_score'])
print("Time for Best Model: ", best_result['iteration_time_mins'])

no_above: 0.05, Coherence: 0.5438089910344249, Time: 20 mins

Topics for the current model:
Topic: 0 
Words: 0.011*"court" + 0.010*"county" + 0.010*"funeral" + 0.009*"lot" + 0.008*"columbia" + 0.005*"estate" + 0.005*"road" + 0.005*"notice" + 0.005*"cemetery" + 0.005*"late"
Topic: 1 
Words: 0.005*"guest" + 0.004*"oclock" + 0.004*"daughter" + 0.004*"rev" + 0.003*"mary" + 0.003*"charles" + 0.003*"afternoon" + 0.003*"friday" + 0.003*"james" + 0.003*"meet"
Topic: 2 
Words: 0.006*"cent" + 0.004*"government" + 0.004*"committee" + 0.003*"roosevelt" + 0.003*"board" + 0.003*"court" + 0.003*"bill" + 0.003*"tax" + 0.003*"business" + 0.003*"federal"
Topic: 3 
Words: 0.008*"bath" + 0.008*"car" + 0.006*"phone" + 0.005*"size" + 0.005*"350" + 0.004*"motor" + 0.004*"sedan" + 0.004*"lot" + 0.004*"box" + 0.004*"floor"
Topic: 4 
Words: 0.005*"game" + 0.003*"little" + 0.003*"team" + 0.002*"great" + 0.002*"thing" + 0.002*"know" + 0.002*"play" + 0.002*"much" + 0.002*"league" + 0.002*"could"
------------------