In [5]:
import os  # For operating system dependent functionality
import time  # For timing code execution
import re  # For regular expressions

import pandas as pd  # For data manipulation and analysis
import nltk  # For natural language processing tasks
from nltk.corpus import stopwords  # For stopwords
from nltk.stem import WordNetLemmatizer  # For word lemmatization

# For Gensim models and functionalities
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaModel, TfidfModel
from gensim.corpora import Dictionary

import spacy  # For advanced natural language processing

from gensim.models.coherencemodel import CoherenceModel
import numpy as np

In [6]:
# Define your directory and file name
directory = "D:/2_nlp"
file_name = "documents_with_bigrams.xlsx"
full_path = os.path.join(directory, file_name)

start_time = time.time()
# Load the Excel file
df_loaded = pd.read_excel(full_path)

# Convert the 'documents' column back to a list of lists
documents_with_bigrams = [str(doc).split(' ') for doc in df_loaded['documents']]
end_time = time.time()
# Now, documents_with_bigrams_loaded contains your original list of lists structure
print("The total time taken in mins is {}".format(round((end_time - start_time) / 60)))

The total time taken in mins is 1


In [7]:
def train_lda_model(documents, num_topics, no_below, no_above, total_passes, random_state, low_value):
    """
    Trains an LDA model using documents that have been preprocessed, including phrase detection and NER.
    
    Parameters:
    - documents: List of preprocessed documents, each represented as a list of tokens.
    - num_topics: The desired number of topics.
    - no_below: Filter out tokens that appear in fewer than 'no_below' documents.
    - no_above: Filter out tokens that appear in more than 'no_above' proportion of documents.
    - total_passes: Number of passes through the corpus during training.
    - random_state: Seed for reproducibility.
    
    Returns:
    - lda_model: The trained LDA model.
    - dictionary: Gensim dictionary created from the documents.
    - corpus: Document-term matrix used for LDA training.
    """
    
    # Create a dictionary and filter extremes
    dictionary = Dictionary(documents)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    
    # Create the Document-Term Matrix
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    
    # Apply TF-IDF filtering
    tfidf = TfidfModel(corpus)
    tfidf_corpus = [[(id, freq) for id, freq in doc if tfidf.idfs[id] > low_value] for doc in corpus]
    
    # Initialize and train the LDA model
    lda_model = LdaModel(corpus=tfidf_corpus, num_topics=num_topics, id2word=dictionary,
                         passes=total_passes, random_state=random_state)
    
    return lda_model, dictionary, tfidf_corpus

In [8]:
# Set fixed hyperparameters
num_topics = 5
total_passes = 3

# Define the range of hyperparameters to explore
no_below_range = [15, 20, 25]  # Example: Minimum document frequency
no_above_range = [0.1, 0.2, 0.3]  # Example: Maximum document frequency proportion
low_value_range = [0.01, 0.05, 0.1]  # TF-IDF low value cut-off

# Initialize a list to store the results
results = []

# Perform grid search with timing and topic printing
for no_below in no_below_range:
    for no_above in no_above_range:
        for low_value in low_value_range:
            start_time_iter = time.time()  # Start time for this iteration
            
            # Train the LDA model with the current set of hyperparameters
            
            lda_model, dictionary, tfidf_corpus = train_lda_model(documents_with_bigrams, num_topics=num_topics, no_below=no_below, no_above=no_above, total_passes=total_passes, random_state=100, low_value=low_value)

            # Calculate Coherence Score using c_v measure
            coherence_model_lda = CoherenceModel(model=lda_model, texts=documents_with_bigrams, dictionary=dictionary, coherence='c_v')
            coherence_lda_cv = coherence_model_lda.get_coherence()
            
            end_time_iter = time.time()  # End time for this iteration
            iter_duration = round((end_time_iter - start_time_iter) / 60)
            
            # Store the results with iteration time
            results.append({
                'no_below': no_below,
                'no_above': no_above,
                'low_value': low_value,
                'coherence_score': coherence_lda_cv,
                'iteration_time_mins': iter_duration
            })
            
            print(f"no_below: {no_below}, no_above: {no_above}, low_value: {low_value}, Coherence: {coherence_lda_cv}, Time: {iter_duration} minutes")
            
            # Print topics for the current model
            print("Topics for the current model:")
            for idx, topic in lda_model.print_topics(-1):
                print(f"Topic: {idx} \nWords: {topic}\n")
            print("-"*20)

# Find and print the best result
best_result = max(results, key=lambda x: x['coherence_score'])
print("\nBest Model's Params: ", best_result)
print("Best Coherence Score: ", best_result['coherence_score'])
print("Time for Best Model: ", best_result['iteration_time_mins'])

no_below: 15, no_above: 0.1, low_value: 0.01, Coherence: 0.5756896896685613, Time: 24 minutes
Topics for the current model:
Topic: 0 
Words: 0.011*"john" + 0.008*"son" + 0.008*"william" + 0.007*"street" + 0.006*"george" + 0.006*"miss" + 0.006*"james" + 0.006*"mary" + 0.005*"daughter" + 0.005*"charles"

Topic: 1 
Words: 0.004*"would" + 0.004*"man" + 0.004*"good" + 0.003*"make" + 0.003*"many" + 0.003*"way" + 0.003*"little" + 0.003*"get" + 0.003*"say" + 0.003*"made"

Topic: 2 
Words: 0.009*"club" + 0.008*"game" + 0.006*"school" + 0.006*"washington" + 0.005*"team" + 0.005*"member" + 0.005*"meeting" + 0.005*"new_york" + 0.004*"church" + 0.004*"held"

Topic: 3 
Words: 0.010*"room" + 0.007*"car" + 0.007*"lot" + 0.007*"500" + 0.006*"sale" + 0.006*"350" + 0.006*"phone" + 0.005*"house" + 0.005*"ave" + 0.005*"price"

Topic: 4 
Words: 0.009*"state" + 0.006*"would" + 0.004*"president" + 0.004*"government" + 0.004*"work" + 0.004*"city" + 0.004*"per_cent" + 0.004*"made" + 0.003*"united_state" + 0.003

no_below: 20, no_above: 0.1, low_value: 0.01, Coherence: 0.5756896896685613, Time: 25 minutes
Topics for the current model:
Topic: 0 
Words: 0.011*"john" + 0.008*"son" + 0.008*"william" + 0.007*"street" + 0.006*"george" + 0.006*"miss" + 0.006*"james" + 0.006*"mary" + 0.005*"daughter" + 0.005*"charles"

Topic: 1 
Words: 0.004*"would" + 0.004*"man" + 0.004*"good" + 0.003*"make" + 0.003*"many" + 0.003*"way" + 0.003*"little" + 0.003*"get" + 0.003*"say" + 0.003*"made"

Topic: 2 
Words: 0.009*"club" + 0.008*"game" + 0.006*"school" + 0.006*"washington" + 0.005*"team" + 0.005*"member" + 0.005*"meeting" + 0.005*"new_york" + 0.004*"church" + 0.004*"held"

Topic: 3 
Words: 0.010*"room" + 0.007*"car" + 0.007*"lot" + 0.007*"500" + 0.006*"sale" + 0.006*"350" + 0.006*"phone" + 0.005*"house" + 0.005*"ave" + 0.005*"price"

Topic: 4 
Words: 0.009*"state" + 0.006*"would" + 0.004*"president" + 0.004*"government" + 0.004*"work" + 0.004*"city" + 0.004*"per_cent" + 0.004*"made" + 0.003*"united_state" + 0.003

KeyboardInterrupt: 