In [None]:
# Install necessary libraries
# NOTE(review): a bare `pip install ...` line is not Python syntax; it presumably
# relies on IPython's automagic inside a notebook cell. In a plain script, run it
# from a shell instead (or use `%pip install ...` in a notebook) — confirm how
# this file is executed.
pip install gensim pandas pyLDAvis

import pandas as pd  # Import pandas for data manipulation
from gensim import corpora, models  # Import gensim for topic modeling
from gensim.models.coherencemodel import CoherenceModel  # Import CoherenceModel for evaluating topic coherence
import nltk  # Import nltk for natural language processing
from nltk.stem import WordNetLemmatizer  # Import lemmatizer from nltk
from nltk.corpus import wordnet, stopwords  # Import wordnet and stopwords from nltk
import pyLDAvis  # Import pyLDAvis for visualizing LDA models
import pyLDAvis.gensim_models as gensimvis  # Import gensimvis for integrating LDA visualization with pyLDAvis

# Download the NLTK resources used below: WordNet (lemmatizer), the stop-word
# lists, and the perceptron POS tagger consulted by nltk.pos_tag
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# Newer NLTK releases look the tagger up under the '_eng' resource name, so
# request it as well; this keeps nltk.pos_tag working on old and new versions.
nltk.download('averaged_perceptron_tagger_eng')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
# Extra corpus-specific words to drop. Built with set() because `{ }` is an
# empty *dict* literal, not an empty set.
custom_stop_words = set()
stop_words = set(stopwords.words('english'))
stop_words.update(custom_stop_words)

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``, defaulting to noun.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag (upper-cased) decides the outcome.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag letter are treated as a noun
    return wordnet.NOUN

def preprocess(text):
    """Lowercase, whitespace-tokenize, drop stop words, and lemmatize ``text``.

    Args:
        text: Raw document text. Any non-string value (for example ``None`` or
            the float NaN that pandas produces for an empty CSV cell) is
            treated as an empty document.

    Returns:
        A list of lemmatized tokens in their original order; ``[]`` when
        ``text`` is empty, contains only stop words, or is not a string.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise raise AttributeError on .lower()
        return []

    # Stop words are matched against the lowercased raw token, before lemmatizing
    return [
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in text.lower().split()
        if token not in stop_words
    ]

# Modeling corpus: load it and tokenize every raw 'line' with preprocess()
# (lowercasing, stop-word removal, lemmatization)
df1 = pd.read_csv('dataset_path_for_topic_modeling')
df1['processed'] = df1['line'].apply(preprocess)

# Corpus whose topic weights are reported later: same loading and tokenization
df2 = pd.read_csv('dataset_path_for_topic_weight')
df2['processed'] = df2['line'].apply(preprocess)

# Build the vocabulary from the tokenized modeling corpus
dictionary = corpora.Dictionary(documents=df1['processed'])

# Prune the vocabulary: no_below=32 is the only threshold set here, every
# other filter_extremes limit keeps its gensim default
dictionary.filter_extremes(no_below=32)

# Encode each tokenized document as a bag-of-words vector
corpus = list(map(dictionary.doc2bow, df1['processed']))

# LDA training configuration
random_seed = 100  # fixed seed so repeated runs give the same model
num_topics = 19  # number of topics to fit; adjust as needed
passes = 20
iterations = 560
alpha = 'auto'  # alternatives: 'symmetric', 'asymmetric', or a fixed number
eta = 'auto'  # or a specific value

# Fit the LDA model on the bag-of-words corpus with the settings above
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
    iterations=iterations,
    alpha=alpha,
    eta=eta,
    random_state=random_seed,
)

# Score the fitted model with the 'c_v' coherence measure, computed against
# the tokenized modeling corpus
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df1['processed'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print(f'Topic Coherence: {coherence_lda}')

# List every topic (-1 = all of them) with ten words each
for idx, topic in lda_model.print_topics(num_topics=-1, num_words=10):
    print(f'Topic: {idx} \nWords: {topic}\n')

# Infer topic weights for the second dataset, reusing the dictionary and the
# model fitted on the first dataset
corpus2 = [dictionary.doc2bow(text) for text in df2['processed']]
topic_weights = [lda_model[doc] for doc in corpus2]

# Example: Print topic weights for the first document in the second dataset
print(f'Topic weights for the first document: {topic_weights[0]}')

# One row per document: its metadata plus a 'Topic <id>' entry for each
# (topic, weight) pair the model returned for that document
topic_weights_df = pd.DataFrame([
    {
        'Filename': filename,
        'WordCount': wordcount,
        **{f'Topic {topic}': weight for topic, weight in doc},
    }
    for filename, wordcount, doc in zip(df2['filename'], df2['wordcount'], topic_weights)
])

# Give the table a fixed schema: metadata first, then one column per topic in
# numeric order. Without this the columns follow first-seen order and topics
# that no document returned are missing altogether.
topic_columns = [f'Topic {topic_id}' for topic_id in range(lda_model.num_topics)]
topic_weights_df = topic_weights_df.reindex(columns=['Filename', 'WordCount'] + topic_columns)

# Fill NaN values with 0 (indicating no weight/absence in that document)
topic_weights_df = topic_weights_df.fillna(0)

# The columns are already named 'Topic <id>'; the former rename step prefixed
# them a second time ('Topic Topic <id>') and has been dropped.

# Display the DataFrame
print(topic_weights_df.head())

# Save the DataFrame to CSV if needed
# NOTE: 'file_path' is a placeholder that the later save calls in this script
# use too; give each output its own path or they overwrite one another.
topic_weights_df.to_csv('file_path', index=False)

# Infer topic weights for the first (modeling) dataset
topic_weights_sample = [lda_model[doc] for doc in corpus]

# Example: Print topic weights for the first document in the first dataset
print(f'Topic weights for the first document: {topic_weights_sample[0]}')

# One row per document; note that the 'Filename' column holds the raw text of
# the 'line' column here, plus a 'Topic <id>' entry for each (topic, weight)
# pair the model returned for that document
topic_weights_df2 = pd.DataFrame([
    {'Filename': line, **{f'Topic {topic}': weight for topic, weight in doc}}
    for line, doc in zip(df1['line'], topic_weights_sample)
])

# Give the table a fixed schema: 'Filename' first, then one column per topic
# in numeric order. Without this the columns follow first-seen order and
# topics that no document returned are missing altogether.
topic_columns = [f'Topic {topic_id}' for topic_id in range(lda_model.num_topics)]
topic_weights_df2 = topic_weights_df2.reindex(columns=['Filename'] + topic_columns)

# Fill NaN values with 0 (indicating no weight/absence in that document)
topic_weights_df2 = topic_weights_df2.fillna(0)

# Display the DataFrame
print(topic_weights_df2.head())

# Save the DataFrame to CSV if needed
# NOTE: 'file_path' is a placeholder that the other save calls in this script
# use too; give each output its own path or they overwrite one another.
topic_weights_df2.to_csv('file_path', index=False)

# Build the pyLDAvis view of the fitted model over the modeling corpus
pyLDAvis.enable_notebook()
lda_vis = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)

# Write the visualization out as an HTML file
pyLDAvis.save_html(data=lda_vis, fileobj='file_path')