In [1]:
import os  # For operating system dependent functionality
import time  # For timing code execution
import re  # For regular expressions

import pandas as pd  # For data manipulation and analysis
import nltk  # For natural language processing tasks
from nltk.corpus import stopwords  # For stopwords
from nltk.stem import WordNetLemmatizer  # For word lemmatization

# For Gensim models and functionalities
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaModel, TfidfModel
from gensim.corpora import Dictionary

import spacy  # For advanced natural language processing

from gensim.models.coherencemodel import CoherenceModel
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

  from pandas.core import (


In [2]:
# Location of the input workbook. The directory can be overridden with the
# NLP_DATA_DIR environment variable so the notebook is not tied to a single
# machine; when the variable is unset the original local path is used.
directory = os.environ.get("NLP_DATA_DIR", "D:/2_nlp")
file_name = "documents_with_bigrams.xlsx"
full_path = os.path.join(directory, file_name)

# Time only the Excel load. perf_counter() is a monotonic clock, so the
# measurement is not affected by system clock adjustments.
start_time = time.perf_counter()
df_loaded = pd.read_excel(full_path)
end_time = time.perf_counter()

# No conversion to token lists happens here: df_loaded is used as loaded.
# Report minutes with two decimals so that short loads do not print as 0.
print("The total time taken in mins is {}".format(round((end_time - start_time) / 60, 2)))

The total time taken in mins is 1


In [3]:
def perform_guided_topic_modeling(df_loaded, seed_words, num_topics=5, ngram_range=(1,1), max_features=2000, anchor_strength=3, random_state=100):
    """
    Fit an anchored (guided) CorEx topic model on the 'documents' column of a DataFrame.

    The input DataFrame is left untouched: missing values are replaced and
    cells are converted to text on a derived list, not on the caller's column.

    Parameters
    ----------
    df_loaded : pandas.DataFrame
        Frame with a 'documents' column holding one text per row. Missing
        values are treated as empty documents; non-string cells are
        converted with str().
    seed_words : list of list of str
        Anchor word lists, forwarded as ``anchors`` to ``Corex.fit``.
    num_topics : int
        Number of latent topics (``n_hidden`` of the CorEx model).
    ngram_range : tuple of (int, int)
        Forwarded to ``CountVectorizer``.
    max_features : int
        Vocabulary size cap, forwarded to ``CountVectorizer``.
    anchor_strength : int or float
        Forwarded to ``Corex.fit``.
    random_state : int
        Seed given to the CorEx model.

    Returns
    -------
    model : corextopic.corextopic.Corex
        The fitted model.
    words : list of str
        Vocabulary of the document-term matrix, in column order.

    Raises
    ------
    KeyError
        If ``df_loaded`` has no 'documents' column.
    """
    # Build the corpus without writing back into df_loaded, so calling this
    # function has no side effect on the caller's data.
    documents_list = df_loaded['documents'].fillna('').astype(str).tolist()

    # Document-term count matrix; kept sparse and passed to CorEx as-is.
    vectorizer = CountVectorizer(max_features=max_features, ngram_range=ngram_range)
    doc_word_matrix = vectorizer.fit_transform(documents_list)

    # Column labels of the matrix, used by CorEx to resolve string anchors.
    words = vectorizer.get_feature_names_out().tolist()

    # Instantiate and fit the anchored CorEx model.
    model = ct.Corex(n_hidden=num_topics, seed=random_state)
    model.fit(doc_word_matrix, words=words, anchors=seed_words, anchor_strength=anchor_strength)

    return model, words

# Assuming df_loaded is your loaded DataFrame with the 'documents' column
# Settings shared by every run of the sweep; only max_features varies, so
# everything else is defined once, outside the loop.
num_topics = 5
ngram_range = (1,1)
max_features_list = [10000,15000,20000,25000]
anchor_strength = 3
random_state = 100
# Anchor word lists handed to CorEx; there are as many lists as num_topics.
seed_words = [
    ["game", "team", "season", "play", "club", "win", "match", "score", "player", "coach"],
    ["house", "home", "room", "property", "rent", "estate", "apartment", "building", "lease", "mortgage"],
    ["time", "life", "man", "world", "philosophy", "thought", "mind", "idea", "reason", "belief"],
    ["church", "school", "event", "member", "community", "meeting", "ceremony", "celebration", "gathering", "festival"],
    ["president", "state", "government", "senate", "congress", "election", "policy", "law", "political", "diplomacy"]
]

# Fit one model per vocabulary size and print its topics.
for max_features in max_features_list:
    # perf_counter() is monotonic, so the elapsed time is not affected by
    # system clock adjustments.
    start_time = time.perf_counter()
    model, words = perform_guided_topic_modeling(df_loaded, seed_words, num_topics=num_topics, ngram_range=ngram_range,
                                         max_features=max_features, anchor_strength=anchor_strength, random_state=random_state)

    topics = model.get_topics()
    print("Max features parameter value is {}".format(max_features))
    for n, topic in enumerate(topics):
        print(f"Topic {n}:")
        # Each topic entry is unpacked as three values and printed on one line.
        for word, weight, correlation in topic:
            print(f"{word} {weight} {correlation}")
        print()

    print()
    # The measured span covers fitting and printing of this run. Minutes are
    # reported with two decimals so that short runs do not print as 0.
    end_time = time.perf_counter()
    print("The total time taken in mins is {}".format(round((end_time - start_time) / 60, 2)))
    print("-"*20)

Max features parameter value is 10000
Topic 0:
game 0.23525106234612286 1.0
play 0.20011116293510933 1.0
team 0.15223235791829645 1.0
season 0.1456261957224093 1.0
win 0.09087565563401696 1.0
score 0.08255975349528064 1.0
club 0.08142741507936954 1.0
player 0.070738360157169 1.0
coach 0.053631964643412265 1.0
two 0.04628797682256836 1.0

Topic 1:
room 0.13030623794821106 1.0
house 0.07788210657146924 1.0
home 0.07151184421530835 1.0
rent 0.062293785506318616 1.0
ave 0.062028822883937317 1.0
call 0.051786076634804064 1.0
500 0.04302404013740236 1.0
bath 0.041753966175618336 1.0
350 0.03658031683497729 1.0
phone 0.031625709487988686 1.0

Topic 2:
time 0.2583269239231036 1.0
man 0.1069896960904578 1.0
one 0.10460032877530935 1.0
life 0.08180713424400748 1.0
world 0.07919642767269595 1.0
thought 0.065256597175775 1.0
reason 0.05446803716779221 1.0
idea 0.051850039639313394 1.0
many 0.0504068273387148 1.0
make 0.05033514980210805 1.0

Topic 3:
church 0.22341769720228455 1.0
member 0.1553254