In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 2. **Topic Modeling**



In [8]:
import gensim
from gensim import corpora
from nltk.corpus import stopwords
import nltk
from joblib import Parallel, delayed
import re

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Load stopwords
stop_words = set(stopwords.words('english'))

# Define preprocessing function
def preprocess_for_lda(text):
    # Handle non-string entries by converting to empty string if necessary
    if not isinstance(text, str):
        text = ''
    
    # Tokenize and remove stopwords
    tokens = text.lower().split()
    tokens = [word for word in tokens if word not in stop_words]
    
    # Optionally, clean tokens further (remove punctuation, etc.)
    tokens = [re.sub(r'\W+', '', token) for token in tokens]  # Remove special characters
    return tokens

# Parallelize the preprocessing step in chunks for better progress tracking
def process_texts_parallel(texts, chunk_size=100000):
    num_cores = -1  # Use all available cores
    total_texts = len(texts)
    processed_texts = []
    
    print("Starting preprocessing...")
    for start in range(0, total_texts, chunk_size):
        end = min(start + chunk_size, total_texts)
        print(f"Processing chunk {start} to {end} of {total_texts}")
        
        # Parallel processing of each chunk
        chunk_processed = Parallel(n_jobs=num_cores)(
            delayed(preprocess_for_lda)(text) for text in texts[start:end]
        )
        
        # Append the processed chunk to the result list
        processed_texts.extend(chunk_processed)
    
    print("Preprocessing completed.")
    return processed_texts

# Apply preprocessing
df_cleaned_rating['tokens'] = process_texts_parallel(df_cleaned_rating['Lemmatized review/text'])

df_cleaned_rating[['Lemmatized review/text', 'tokens']].head()

Starting preprocessing...
Processing chunk 0 to 100000 of 2972448
Processing chunk 100000 to 200000 of 2972448
Processing chunk 200000 to 300000 of 2972448
Processing chunk 300000 to 400000 of 2972448
Processing chunk 400000 to 500000 of 2972448
Processing chunk 500000 to 600000 of 2972448
Processing chunk 600000 to 700000 of 2972448
Processing chunk 700000 to 800000 of 2972448
Processing chunk 800000 to 900000 of 2972448
Processing chunk 900000 to 1000000 of 2972448
Processing chunk 1000000 to 1100000 of 2972448
Processing chunk 1100000 to 1200000 of 2972448
Processing chunk 1200000 to 1300000 of 2972448
Processing chunk 1300000 to 1400000 of 2972448
Processing chunk 1400000 to 1500000 of 2972448
Processing chunk 1500000 to 1600000 of 2972448
Processing chunk 1600000 to 1700000 of 2972448
Processing chunk 1700000 to 1800000 of 2972448
Processing chunk 1800000 to 1900000 of 2972448
Processing chunk 1900000 to 2000000 of 2972448
Processing chunk 2000000 to 2100000 of 2972448
Processing 

Unnamed: 0,Lemmatized review/text,tokens
0,julie strain fans collection photo page worth ...,"[julie, strain, fans, collection, photo, page,..."
1,do not care much dr seuss reading philip nel b...,"[care, much, dr, seuss, reading, philip, nel, ..."
2,people become book read child father man dr se...,"[people, become, book, read, child, father, ma..."
3,theodore seuss geisel aka quotdr seussquot one...,"[theodore, seuss, geisel, aka, quotdr, seussqu..."
4,philip nel dr seuss american iconthis basicall...,"[philip, nel, dr, seuss, american, iconthis, b..."


In [None]:
from gensim import corpora

# Create a dictionary representation of the documents (tokens)
dictionary = corpora.Dictionary(df_cleaned_rating['tokens'])

# Optionally, filter extremes to remove very common or rare tokens
dictionary.filter_extremes(no_below=5, no_above=0.5)  # Adjust thresholds as needed

In [11]:
# Save dictionary for future use
dictionary.save('dictionary.dict')

In [12]:
import os

# Define chunk size (number of documents per chunk)
chunk_size = 100000  # Adjust based on memory limits

# Initialize an empty list to collect corpus chunks or save to disk directly
corpus_chunks = []

# Create a directory to save the corpus chunks if it doesn't exist
if not os.path.exists('corpus_chunks'):
    os.makedirs('corpus_chunks')

# Process the dataset in chunks
for start in range(0, len(df_cleaned_rating), chunk_size):
    end = min(start + chunk_size, len(df_cleaned_rating))
    
    print(f"Processing chunk {start} to {end}")
    
    # Create bag-of-words representation (doc2bow) for the current chunk
    corpus_chunk = [dictionary.doc2bow(tokens) for tokens in df_cleaned_rating['tokens'][start:end]]
    
    # Save the chunk to disk
    chunk_filename = f'corpus_chunks/corpus_chunk_{start // chunk_size}.mm'
    corpora.MmCorpus.serialize(chunk_filename, corpus_chunk)
    
    print(f"Saved corpus chunk {start} to {end} to {chunk_filename}")

print("Corpus creation in chunks completed.")

Processing chunk 0 to 100000
Saved corpus chunk 0 to 100000 to corpus_chunks/corpus_chunk_0.mm
Processing chunk 100000 to 200000
Saved corpus chunk 100000 to 200000 to corpus_chunks/corpus_chunk_1.mm
Processing chunk 200000 to 300000
Saved corpus chunk 200000 to 300000 to corpus_chunks/corpus_chunk_2.mm
Processing chunk 300000 to 400000
Saved corpus chunk 300000 to 400000 to corpus_chunks/corpus_chunk_3.mm
Processing chunk 400000 to 500000
Saved corpus chunk 400000 to 500000 to corpus_chunks/corpus_chunk_4.mm
Processing chunk 500000 to 600000
Saved corpus chunk 500000 to 600000 to corpus_chunks/corpus_chunk_5.mm
Processing chunk 600000 to 700000
Saved corpus chunk 600000 to 700000 to corpus_chunks/corpus_chunk_6.mm
Processing chunk 700000 to 800000
Saved corpus chunk 700000 to 800000 to corpus_chunks/corpus_chunk_7.mm
Processing chunk 800000 to 900000
Saved corpus chunk 800000 to 900000 to corpus_chunks/corpus_chunk_8.mm
Processing chunk 900000 to 1000000
Saved corpus chunk 900000 to 1

In [13]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
import os

# Define the number of topics you want to extract
num_topics = 5  # Adjust this based on your needs

# Initialize the LDA model without training it yet
lda_model = LdaModel(id2word=dictionary, 
                     num_topics=num_topics, 
                     random_state=42, 
                     alpha='auto', 
                     eta='auto')

# Load and process the corpus chunks incrementally
for chunk_file in sorted(os.listdir('corpus_chunks')):
    if chunk_file.endswith('.mm'):
        print(f"Processing {chunk_file} for LDA training...")
        
        # Load the chunk
        chunk_corpus = corpora.MmCorpus(os.path.join('corpus_chunks', chunk_file))
        
        # Update the LDA model with the chunk
        lda_model.update(chunk_corpus)
        
        print(f"Updated model with {chunk_file}")

# After processing all chunks, save the trained LDA model
lda_model.save('lda_model_trained.model')
print("LDA model training completed and saved.")

# Print the top 10 words for each topic
for i, topic in lda_model.print_topics(num_topics=num_topics, num_words=10):
    print(f"Topic {i+1}: {topic}")

Processing corpus_chunk_0.mm for LDA training...
Updated model with corpus_chunk_0.mm
Processing corpus_chunk_1.mm for LDA training...
Updated model with corpus_chunk_1.mm
Processing corpus_chunk_10.mm for LDA training...
Updated model with corpus_chunk_10.mm
Processing corpus_chunk_11.mm for LDA training...
Updated model with corpus_chunk_11.mm
Processing corpus_chunk_12.mm for LDA training...
Updated model with corpus_chunk_12.mm
Processing corpus_chunk_13.mm for LDA training...
Updated model with corpus_chunk_13.mm
Processing corpus_chunk_14.mm for LDA training...
Updated model with corpus_chunk_14.mm
Processing corpus_chunk_15.mm for LDA training...
Updated model with corpus_chunk_15.mm
Processing corpus_chunk_16.mm for LDA training...
Updated model with corpus_chunk_16.mm
Processing corpus_chunk_17.mm for LDA training...
Updated model with corpus_chunk_17.mm
Processing corpus_chunk_18.mm for LDA training...
Updated model with corpus_chunk_18.mm
Processing corpus_chunk_19.mm for LD

In [14]:
# Load the trained LDA model
lda_model = LdaModel.load('lda_model_trained.model')

# Define a function to assign the dominant topic
def assign_dominant_topic(lda_model, corpus):
    topics = [max(lda_model[doc], key=lambda x: x[1])[0] for doc in corpus]
    return topics

# Load the corpus chunks
corpus_chunks = [corpora.MmCorpus(os.path.join('corpus_chunks', file)) for file in sorted(os.listdir('corpus_chunks')) if file.endswith('.mm')]

# Initialize an empty list to collect dominant topics
dominant_topics = []

for chunk_corpus in corpus_chunks:
    dominant_topics.extend(assign_dominant_topic(lda_model, chunk_corpus))

# Add dominant topics to the DataFrame
df_cleaned_rating['Dominant_Topic'] = dominant_topics

# Save the DataFrame with dominant topics
df_cleaned_rating.to_csv('amazon_books_reviews_with_topics.csv', index=False)
print("Dominant topics assigned and saved to DataFrame.")

Dominant topics assigned and saved to DataFrame.


In [None]:
from gensim import corpora
import os
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Combine all corpus chunks into a single corpus
corpus = []
for chunk_file in sorted(os.listdir('corpus_chunks')):
    if chunk_file.endswith('.mm'):
        print(f"Loading {chunk_file} for visualization...")
        chunk_corpus = corpora.MmCorpus(os.path.join('corpus_chunks', chunk_file))
        corpus.extend(chunk_corpus)

# Prepare the visualization
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)

# Display the visualization
pyLDAvis.display(lda_vis)

Loading corpus_chunk_0.mm for visualization...
Loading corpus_chunk_1.mm for visualization...
Loading corpus_chunk_10.mm for visualization...
Loading corpus_chunk_11.mm for visualization...
Loading corpus_chunk_12.mm for visualization...
Loading corpus_chunk_13.mm for visualization...
Loading corpus_chunk_14.mm for visualization...
Loading corpus_chunk_15.mm for visualization...
Loading corpus_chunk_16.mm for visualization...
Loading corpus_chunk_17.mm for visualization...
Loading corpus_chunk_18.mm for visualization...
Loading corpus_chunk_19.mm for visualization...
Loading corpus_chunk_2.mm for visualization...
Loading corpus_chunk_20.mm for visualization...
Loading corpus_chunk_21.mm for visualization...
Loading corpus_chunk_22.mm for visualization...
Loading corpus_chunk_23.mm for visualization...
Loading corpus_chunk_24.mm for visualization...
Loading corpus_chunk_25.mm for visualization...
