In [1]:
import os  # For operating system dependent functionality
import time  # For timing code execution
import re  # For regular expressions

import pandas as pd  # For data manipulation and analysis
import nltk  # For natural language processing tasks
from nltk.corpus import stopwords  # For stopwords
from nltk.stem import WordNetLemmatizer  # For word lemmatization

# For Gensim models and functionalities
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaModel, TfidfModel
from gensim.corpora import Dictionary

import spacy  # For advanced natural language processing

from gensim.models.coherencemodel import CoherenceModel
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

  from pandas.core import (


In [2]:
# Location of the saved corpus (presumably one space-joined document per row
# in the 'documents' column — confirm against the notebook that wrote it)
directory = "D:/2_nlp"
file_name = "documents_with_bigrams.xlsx"
full_path = os.path.join(directory, file_name)

start_time = time.time()
# Load the Excel file
df_loaded = pd.read_excel(full_path)

# Convert the 'documents' column back to a list of token lists.
# - fillna(''): an empty cell is read back as NaN, and str(NaN) would
#   otherwise inject a literal "nan" token into that document.
# - split() with no argument splits on any run of whitespace, so repeated
#   spaces cannot produce empty-string tokens (split(' ') would).
# Empty documents are kept (as []) so the list stays aligned with the rows.
documents_with_bigrams = [
    doc.split() for doc in df_loaded['documents'].fillna('').astype(str)
]
end_time = time.time()
# documents_with_bigrams now holds one list of tokens per row of the file
print("The total time taken in mins is {}".format(round((end_time - start_time) / 60)))

The total time taken in mins is 1


In [3]:
# Where the prepared article dataset lives on disk
directory = "D:/2_nlp"
file_name = "Prepared_last_25_years_250_MB_dataset.xlsx"
full_path = os.path.join(directory, file_name)

# Read the whole workbook into a DataFrame, timing the load
start_time = time.time()
entire_df_loaded = pd.read_excel(full_path)
end_time = time.time()

# Report the elapsed wall-clock time, rounded to whole minutes
elapsed_minutes = round((end_time - start_time) / 60)
print(f"The total time taken in mins is {elapsed_minutes}")

The total time taken in mins is 4


In [4]:
# Convert the 'date' column to pandas datetime values (overwrites the column)
parsed_dates = pd.to_datetime(entire_df_loaded['date'])
entire_df_loaded['date'] = parsed_dates

In [5]:
# Preview the first five rows to sanity-check the loaded columns
entire_df_loaded.head(n=5)

Unnamed: 0,article_id,newspaper_name,edition,date,page,headline,byline,article,text_for_analysis,prepared_text,year,month,day
0,1_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,"Dark Names Lockmon ,\njansen Giont Coaches",,SAN FRANCISCO. Nov. 10\n(AP).-Alvin Dark made ...,"Dark Names Lockmon ,\njansen Giont Coaches SAN...",dark name lockmon jansen giont coach san franc...,1960,11,10
1,2_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,FALONEY FIGURES IN SWAP\n\n\nEtcheverry Deal R...,,head last Saturday when Mon\ntreal lost in q c...,FALONEY FIGURES IN SWAP\n\n\nEtcheverry Deal R...,faloney figure swap etcheverry deal rock conod...,1960,11,10
2,3_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,Home Ice Helps\nChicago to Gain\nLead in Leoque,BY the Associated Press,There's just no place like\nhome as far as the...,Home Ice Helps\nChicago to Gain\nLead in Leoqu...,home ice help chicago gain lead leoque there p...,1960,11,10
3,4_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,Chamberlain\nFires In 4o\nAgainst LA,b the Associated rfess,Wilt Chamberlain keeps on\nscoring baskets and...,Chamberlain\nFires In 4o\nAgainst LA Wilt Cham...,chamberlain fire 4o la wilt chamberlain keep s...,1960,11,10
4,5_1960-11-10_p36_sn83045462_00280608075_196011...,Evening star.,1,1960-11-10,p36,,,raped a home run the first\ntime he faced a ma...,raped a home run the first\ntime he faced a m...,raped home run first time faced major league p...,1960,11,10


## Optimized Unguided LDA model

In [6]:
def train_lda_model(documents, num_topics, no_below, no_above, total_passes, random_state, low_value):
    """
    Train a gensim LDA model on documents that are already tokenised.

    The function does no text preprocessing itself. It builds a vocabulary,
    prunes it with ``Dictionary.filter_extremes``, converts every document to
    bag-of-words, and then removes every token whose IDF weight (taken from a
    ``TfidfModel`` fitted on that bag-of-words corpus) is not strictly greater
    than ``low_value``. LDA is trained on the surviving raw term counts; the
    TF-IDF model is used only to decide which tokens to drop.

    Parameters:
    - documents: Re-iterable collection of documents, each a list of string tokens.
    - num_topics: The desired number of topics.
    - no_below: Filter out tokens that appear in fewer than 'no_below' documents.
    - no_above: Filter out tokens that appear in more than 'no_above' proportion of documents.
    - total_passes: Number of passes through the corpus during training.
    - random_state: Seed for reproducibility.
    - low_value: IDF cut-off; tokens with an IDF weight <= 'low_value' are removed
      from every document before training.

    Returns:
    - lda_model: The trained LDA model.
    - dictionary: Gensim dictionary created from the documents (after filter_extremes).
    - tfidf_corpus: The IDF-filtered bag-of-words corpus the model was trained on,
      one list of (token_id, count) pairs per input document.
    """

    # Create a dictionary and filter extremes
    dictionary = Dictionary(documents)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    # Create the Document-Term Matrix (bag-of-words counts per document)
    corpus = [dictionary.doc2bow(doc) for doc in documents]

    # Drop low-IDF tokens. Note this compares the token's IDF weight alone,
    # not a per-document TF-IDF score, and keeps the original counts.
    # 'term_id' is used instead of 'id' to avoid shadowing the builtin.
    tfidf = TfidfModel(corpus)
    tfidf_corpus = [
        [(term_id, count) for term_id, count in bow if tfidf.idfs[term_id] > low_value]
        for bow in corpus
    ]

    # Initialize and train the LDA model on the filtered counts
    lda_model = LdaModel(corpus=tfidf_corpus, num_topics=num_topics, id2word=dictionary,
                         passes=total_passes, random_state=random_state)

    return lda_model, dictionary, tfidf_corpus

In [7]:
# Hyperparameters for this single training run
num_topics = 5
total_passes = 3
random_state = 100  # seed forwarded to the LDA model
# Vocabulary filtering thresholds
no_below = 15  # Minimum number of documents a token must appear in
no_above = 0.2  # Maximum proportion of documents a token may appear in
low_value = 0.1  # IDF cut-off: tokens with IDF <= this are dropped by train_lda_model
# NOTE(review): assuming gensim's default IDF of log2(N / df), a token that
# survives no_above=0.2 has IDF >= log2(5) ~ 2.3, so a cut-off of 0.1 would
# remove nothing — confirm whether a higher threshold was intended.

start_time_iter = time.time()  # Start of training + coherence scoring

# Train the LDA model. The seed is passed via the `random_state` variable
# (previously a repeated literal 100) so that changing the setting above
# actually changes the run.
unguided_lda_model, unguided_dictionary, unguided_tfidf_corpus = train_lda_model(
    documents_with_bigrams,
    num_topics=num_topics,
    no_below=no_below,
    no_above=no_above,
    total_passes=total_passes,
    random_state=random_state,
    low_value=low_value,
)

# Calculate Coherence Score using c_v measure
unguided_coherence_model_lda = CoherenceModel(
    model=unguided_lda_model,
    texts=documents_with_bigrams,
    dictionary=unguided_dictionary,
    coherence='c_v',
)
coherence_lda_cv = unguided_coherence_model_lda.get_coherence()

end_time_iter = time.time()  # End of training + coherence scoring
iter_duration = round((end_time_iter - start_time_iter) / 60)  # whole minutes

print(f"no_below: {no_below}, no_above: {no_above}, low_value: {low_value}, Coherence: {coherence_lda_cv}, Time: {iter_duration} minutes")

# Print the top words of every topic in the trained model
print("Topics for the current model:")
for idx, topic in unguided_lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")
print("-"*20)

no_below: 15, no_above: 0.2, low_value: 0.1, Coherence: 0.5292118758821858, Time: 24 minutes
Topics for the current model:
Topic: 0 
Words: 0.010*"game" + 0.007*"two" + 0.007*"one" + 0.007*"team" + 0.006*"first" + 0.006*"play" + 0.005*"new_york" + 0.005*"three" + 0.004*"season" + 0.004*"club"

Topic: 1 
Words: 0.011*"room" + 0.008*"new" + 0.008*"car" + 0.007*"lot" + 0.006*"500" + 0.006*"home" + 0.006*"phone" + 0.006*"350" + 0.006*"house" + 0.005*"ave"

Topic: 2 
Words: 0.009*"one" + 0.006*"said" + 0.005*"would" + 0.005*"time" + 0.004*"man" + 0.004*"day" + 0.004*"many" + 0.003*"say" + 0.003*"way" + 0.003*"two"

Topic: 3 
Words: 0.010*"home" + 0.007*"john" + 0.005*"church" + 0.005*"son" + 0.005*"william" + 0.005*"school" + 0.005*"street" + 0.005*"member" + 0.005*"miss" + 0.004*"club"

Topic: 4 
Words: 0.009*"state" + 0.009*"said" + 0.006*"year" + 0.005*"new" + 0.005*"would" + 0.005*"today" + 0.005*"president" + 0.004*"district" + 0.004*"may" + 0.004*"committee"

--------------------


In [8]:
# Step 1: Build a bag-of-words corpus from 'prepared_text'.
# fillna('') turns missing articles into empty documents; astype(str) alone
# would convert NaN to the literal token "nan". astype(str) still guards
# against cells that were read back as numbers.
prepared_texts = entire_df_loaded['prepared_text'].fillna('').astype(str).tolist()
articles_corpus = [unguided_dictionary.doc2bow(doc.split()) for doc in prepared_texts]
# NOTE(review): the model vocabulary contains joined phrases such as
# "new_york" (see the printed topics), while the 'prepared_text' preview shows
# plain space-separated words. If the column has no joined phrases, those
# vocabulary entries can never match here — confirm whether the bigram-joined
# documents should be scored instead.

# Step 2: Get the topic distribution for each document.
# minimum_probability=0.0 effectively disables gensim's probability cut-off,
# so every document gets a non-empty distribution and max() below cannot fail
# on an empty list, whatever the number of topics.
topic_distributions = [
    unguided_lda_model.get_document_topics(bow, minimum_probability=0.0)
    for bow in articles_corpus
]

# Step 3: Find the dominant topic (highest probability) for each document
dominant_topics = [
    max(dist, key=lambda topic_prob: topic_prob[1])[0] for dist in topic_distributions
]

# Step 4: Assign the dominant topics to the DataFrame (adds/overwrites the column)
entire_df_loaded['dominant_topic'] = dominant_topics

In [9]:
# Spot-check the first 100 articles next to their assigned dominant topic
preview_columns = ['prepared_text', 'dominant_topic']
entire_df_loaded[preview_columns].head(100)

Unnamed: 0,prepared_text,dominant_topic
0,dark name lockmon jansen giont coach san franc...,0
1,faloney figure swap etcheverry deal rock conod...,0
2,home ice help chicago gain lead leoque there p...,0
3,chamberlain fire 4o la wilt chamberlain keep s...,0
4,raped home run first time faced major league p...,0
...,...,...
95,gourmet salad dressing u cup finely chopped gr...,0
96,2 3ounce package cream cheese teaspoon grated ...,0
97,new improved crystal clear food wrap thats eas...,1
98,refreshing change highway departmentpar serv i...,4


In [10]:
# Save the articles together with their dominant-topic labels.
# The file name is relative, so the workbook is written to the current
# working directory; index=False leaves the DataFrame index out of the file.
output_file = "unguided_lda_dominant_topic_classification.xlsx"
entire_df_loaded.to_excel(output_file, index=False)