In [7]:
import os  # For operating system dependent functionality
import time  # For timing code execution
import re  # For regular expressions

import pandas as pd  # For data manipulation and analysis
import nltk  # For natural language processing tasks
from nltk.corpus import stopwords  # For stopwords
from nltk.stem import WordNetLemmatizer  # For word lemmatization

# For Gensim models and functionalities
from gensim.models.phrases import Phrases, Phraser
from gensim.models import LdaModel, TfidfModel
from gensim.corpora import Dictionary

import spacy  # For advanced natural language processing

from gensim.models.coherencemodel import CoherenceModel

In [2]:
directory = "D:/2_nlp"
# Define the file names
prepared_file_name_250_MB = "Prepared_last_25_years_250_MB_dataset.xlsx"
# Construct the full file paths
prepared_file_path_250_MB = os.path.join(directory, prepared_file_name_250_MB)

# Read the .xlsx files into DataFrames
prepared_last_25_years_df = pd.read_excel(prepared_file_path_250_MB)

In [3]:
start_time = time.time()

# Ensure all entries in 'prepared_text' are treated as strings
documents = prepared_last_25_years_df['prepared_text'].astype(str).apply(lambda x: x.split()).tolist()

# Continue with filtering out words with less than 3 characters
documents = [[word for word in doc if len(word) > 2] for doc in documents]

end_time = time.time()
print("The total time taken in mins is {}".format(round((end_time - start_time) / 60)))

The total time taken in mins is 0


In [4]:
def apply_bigram_phrase_detection(documents, min_count=2, threshold=5):
    """
    Applies bigram phrase detection to a list of tokenized documents to identify
    and mark multi-word expressions as single tokens.
    
    Parameters:
    - documents: A list of documents, where each document is a list of tokens (words).
    - min_count: The minimum count of phrase occurrences in the corpus to be considered
                 for phrase detection. Phrases appearing less frequently than this
                 threshold will be ignored. (Default: 2)
    - threshold: The scoring threshold for forming phrases. Higher values mean that
                 phrases need to be more frequent and specific to be formed. Lower
                 values allow more phrases to be detected but may include less meaningful ones. (Default: 5)
    
    Returns:
    - documents_with_bigrams: A new list of documents where detected bigrams are
                              represented as single tokens joined by underscores.
    """
    
    # Train the bigram Phrases model based on the input documents
    bigram_phrases = Phrases(documents, min_count=min_count, threshold=threshold)
    
    # Convert the Phrases model to a more efficient Phraser for transforming documents
    bigram = Phraser(bigram_phrases)
    
    # Apply the bigram Phraser to each document to form bigrams
    documents_with_bigrams = [bigram[doc] for doc in documents]
    
    return documents_with_bigrams

# Assuming 'documents' is a list of tokenized documents
start_time = time.time()
documents_with_bigrams = apply_bigram_phrase_detection(documents, min_count=2, threshold=10)
end_time = time.time()
print("The total time taken in mins is {}".format(round((end_time - start_time) / 60)))

The total time taken in mins is 4


## Named Entity Recognition Emphasizing

In [5]:
# Load SpaCy's NLP model for Named Entity Recognition (NER)
nlp = spacy.load("en_core_web_sm")

def emphasize_ner(document, nlp_model):
    """
    Processes a document to identify and emphasize named entities using underscores,
    while retaining non-entity tokens in their original form. The function creates a new
    document representation where named entities are highlighted as single tokens,
    facilitating their recognition in NLP tasks like topic modeling.
    
    Parameters:
    - document: A list of words (tokens) constituting the original document.
                The document should be pre-tokenized.
    - nlp_model: A SpaCy NLP model used for Named Entity Recognition (NER).
                 This model identifies the named entities in the document.
    
    Returns:
    - A list of tokens where named entities are emphasized by joining their constituent
      words with underscores, and non-entity tokens are included as is.
    """
    # Convert the list of tokens back into a string for NER processing with SpaCy
    processed_text = nlp_model(" ".join(document))
    
    new_doc = []  # Initialize an empty list to hold the processed tokens
    idx = 0  # Index to keep track of our position in the processed_text
    
    # Iterate over the tokens in the processed text
    while idx < len(processed_text):
        if processed_text[idx].ent_iob != 0:  # If the token is part of an entity
            entity = processed_text[idx].ent_type_  # Get the entity type
            start = idx  # Mark the start of the entity
            # Continue until we've processed all tokens belonging to this entity
            while idx < len(processed_text) and processed_text[idx].ent_type_ == entity:
                idx += 1
            # Combine the tokens of the named entity with underscores
            entity_text = '_'.join([processed_text[i].text for i in range(start, idx)])
            new_doc.append(entity_text)  # Add the emphasized entity to new_doc
        else:
            # If the token is not part of an entity, add it as is
            new_doc.append(processed_text[idx].text)
            idx += 1  # Move to the next token

    return new_doc

# Apply Named Entity Recognition (NER) to emphasize entities in documents
start_time = time.time()
documents_with_ner = [emphasize_ner(doc, nlp) for doc in documents_with_bigrams]
end_time = time.time()
print("The total time taken in mins is {}".format(round((end_time - start_time) / 60)))

The total time taken in mins is 212


## Unguided LDA with NER emphasizing, Bigrams and TF-IDF

In [6]:
def train_lda_model(documents, num_topics, no_below, no_above, total_passes, random_state, low_value):
    """
    Trains an LDA model using documents that have been preprocessed, including phrase detection and NER.
    
    Parameters:
    - documents: List of preprocessed documents, each represented as a list of tokens.
    - num_topics: The desired number of topics.
    - no_below: Filter out tokens that appear in fewer than 'no_below' documents.
    - no_above: Filter out tokens that appear in more than 'no_above' proportion of documents.
    - total_passes: Number of passes through the corpus during training.
    - random_state: Seed for reproducibility.
    
    Returns:
    - lda_model: The trained LDA model.
    - dictionary: Gensim dictionary created from the documents.
    - corpus: Document-term matrix used for LDA training.
    """
    
    # Create a dictionary and filter extremes
    dictionary = Dictionary(documents)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    
    # Create the Document-Term Matrix
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    
    # Apply TF-IDF filtering
    tfidf = TfidfModel(corpus)
    tfidf_corpus = [[(id, freq) for id, freq in doc if tfidf.idfs[id] > low_value] for doc in corpus]
    
    # Initialize and train the LDA model
    lda_model = LdaModel(corpus=tfidf_corpus, num_topics=num_topics, id2word=dictionary,
                         passes=total_passes, random_state=random_state)
    
    return lda_model, dictionary, tfidf_corpus

start_time = time.time()
# Train the LDA model with the preprocessed documents
lda_model, dictionary, tfidf_corpus = train_lda_model(documents_with_ner, num_topics=5, no_below=20, no_above=0.2, total_passes=10, random_state=100, low_value=0.01)

# Display the topics in the trained model
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")

print()
end_time = time.time()
print("The total time taken in mins is {}".format(round((end_time - start_time) / 60)))

Topic: 0 
Words: 0.041*"sunday" + 0.036*"500" + 0.032*"350" + 0.026*"evening" + 0.026*"friday" + 0.017*"thursday" + 0.016*"1885" + 0.016*"100" + 0.014*"1888" + 0.012*"1880"

Topic: 1 
Words: 0.120*"washington" + 0.026*"saturday" + 0.022*"tuesday" + 0.022*"year" + 0.021*"monday" + 0.020*"tomorrow" + 0.019*"virginia" + 0.019*"wednesday" + 0.015*"half" + 0.015*"week"

Topic: 2 
Words: 0.091*"one" + 0.083*"two" + 0.076*"today" + 0.071*"first" + 0.043*"three" + 0.030*"yesterday" + 0.027*"second" + 0.024*"four" + 0.016*"five" + 0.015*"third"

Topic: 3 
Words: 0.067*"american" + 0.036*"tonight" + 0.034*"today" + 0.026*"4000" + 0.022*"german" + 0.020*"one" + 0.020*"french" + 0.019*"british" + 0.019*"france" + 0.018*"london"

Topic: 4 
Words: 0.032*"june" + 0.022*"summer" + 0.020*"october" + 0.019*"maryland" + 0.019*"philadelphia" + 0.019*"april" + 0.018*"august" + 0.018*"september" + 0.017*"germany" + 0.016*"100000"


The total time taken in mins is 21
