<a href="https://colab.research.google.com/github/MonaFaghfouri/Topic_Modeling/blob/main/Topic_Modeling_by_Time.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install compatible versions of required libraries.
# numpy and pandas are pinned to exact versions; gensim and openpyxl are left
# unpinned. --upgrade/--force-reinstall reinstall the packages even when some
# version is already present, and -q keeps pip's output quiet.
# NOTE(review): after force-reinstalling numpy in a live Colab runtime, a
# runtime restart is probably needed before later cells see it — confirm.
!pip install --upgrade --force-reinstall numpy==1.23.5 pandas==1.5.3 gensim openpyxl -q

In [None]:
# monthly_keywords_with_weights: Extracting and weighting keywords by month

In [None]:
# Colab-only module; this cell uses it for files.upload() and files.download()
from google.colab import files
# Ask the user for the input workbook. The cell later reads the first entry
# of `uploaded` (via next(iter(uploaded))) as the Excel file to load.
uploaded = files.upload()

import pandas as pd
import numpy as np
from datetime import datetime
from collections import Counter, defaultdict
from gensim import corpora
from gensim.models import LdaModel
import ast
import re

# Load data
# next(iter(uploaded)) takes the first key of `uploaded` (the uploaded file's
# name, per Colab's files.upload) and reads that file as an Excel workbook;
# any additional uploads are ignored.
df = pd.read_excel(next(iter(uploaded)))

# Convert token column from string to list.
# The second column is assumed to hold Python-literal lists such as
# "['word', 'other']" (TODO confirm against the input workbook). Blank cells
# are loaded by read_excel as NaN (a float), which ast.literal_eval rejects
# with ValueError, so any non-string cell is mapped to an empty token list
# instead of aborting the whole run. Empty documents are dropped later when
# the per-month token lists are built.
df["Tokens"] = df.iloc[:, 1].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else []
)

# Convert date column to datetime.
# errors='coerce' turns unparseable values in the third column into NaT; those
# rows get no month name and therefore match none of the months below.
dates = pd.to_datetime(df.iloc[:, 2], errors='coerce')
# Full month name only (no year): the same month from different years is pooled.
df["Month"] = dates.dt.strftime("%B")

# Define month order.
# Months are processed, and appended to the results, in this order.
# NOTE(review): "March" is not listed, so rows dated in March are never
# modelled — confirm this matches the date range of the data.
custom_month_order = [
    "April", "May", "June", "July", "August", "September",
    "October", "November", "December", "January", "February"
]

# Function to clean tokens
def clean_tokens(tokens):
    """Return the normalised, purely alphabetic tokens of one document.

    Each token is lower-cased and stripped of every non-word character.
    A token is kept only if what remains is longer than two characters and
    consists solely of letters, so anything still containing digits or
    underscores is discarded. Input order is preserved.
    """
    stripped = (re.sub(r'\W+', '', raw.lower()) for raw in tokens)
    return [word for word in stripped if len(word) > 2 and word.isalpha()]

# Monthly LDA topic modeling
# For every month in custom_month_order that has rows, fit a 5-topic LDA model
# on that month's cleaned documents and record its strongest keywords.
month_top_keywords = []

for month in custom_month_order:
    month_df = df[df["Month"] == month]
    if month_df.empty:
        # No rows carry this month name
        continue

    # Clean every document and keep only those with at least one token left
    month_tokens = [
        doc_tokens
        for doc_tokens in month_df["Tokens"].apply(clean_tokens)
        if doc_tokens
    ]

    # Vocabulary without tokens seen in fewer than 2 documents or in more
    # than 80% of them
    dictionary = corpora.Dictionary(month_tokens)
    dictionary.filter_extremes(no_below=2, no_above=0.8)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in month_tokens]

    # Nothing to train on when the vocabulary or every bag-of-words is empty
    if len(dictionary) == 0 or not any(corpus):
        continue

    # random_state is fixed so repeated runs on the same data agree
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=5,
        random_state=42,
        passes=15,
        iterations=100,
        minimum_probability=0.01
    )

    # Sum each word's weight over the top-10 terms of every topic
    keyword_weights = defaultdict(float)
    for topic_id in range(lda_model.num_topics):
        for word, weight in lda_model.show_topic(topic_id, topn=10):
            keyword_weights[word] += weight

    # Rescale so the collected weights sum to 1 (all zeros if the total is 0)
    total_weight = sum(keyword_weights.values())
    if total_weight:
        normalized_keywords = {
            word: round(weight / total_weight, 4)
            for word, weight in keyword_weights.items()
        }
    else:
        normalized_keywords = dict.fromkeys(keyword_weights, 0)

    # Heaviest first; ties keep the order in which the words were collected
    sorted_keywords = sorted(
        normalized_keywords.items(), key=lambda pair: pair[1], reverse=True
    )
    formatted_keywords = [
        f"{word} {weight}" for word, weight in sorted_keywords[:10]
    ]

    if sorted_keywords:
        dominant_keyword = sorted_keywords[0][0]
    else:
        dominant_keyword = ""

    month_top_keywords.append({
        "Month": month,
        "Top Keywords": ", ".join(formatted_keywords),
        "Dominant Keyword": dominant_keyword
    })

# Save results
# One row per month that produced a model; the columns come from the dict
# keys appended above ("Month", "Top Keywords", "Dominant Keyword").
result_df = pd.DataFrame(month_top_keywords)
output_filename = "monthly_keywords_with_weights.xlsx"
# index=False keeps the DataFrame's row index out of the sheet
result_df.to_excel(output_filename, index=False)

# Download result
# Hands the workbook written above to Colab's files.download
files.download(output_filename)


In [None]:
# monthly_peakday_topics_LDA_weighted: Extracting LDA-weighted topics for monthly peak days

In [None]:
import pandas as pd
import re
import ast
from collections import Counter
from gensim import corpora
from gensim.models import LdaModel
from google.colab import files

# Upload file
print("Please upload your Excel file:")
uploaded = files.upload()
# next(iter(uploaded)) takes the first key of `uploaded` (the uploaded file's
# name, per Colab's files.upload); any additional uploads are ignored.
df = pd.read_excel(next(iter(uploaded)))

# Convert token strings to list.
# The second column is assumed to hold Python-literal lists such as
# "['word', 'other']" (TODO confirm against the input workbook). Blank cells
# are loaded by read_excel as NaN (a float), which ast.literal_eval rejects
# with ValueError, so any non-string cell is mapped to an empty token list
# instead of aborting the whole run. Empty documents are dropped later when
# the peak-day token lists are built.
df["Tokens"] = df.iloc[:, 1].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else []
)

# Convert date to datetime.
# errors='coerce' turns unparseable values in the third column into NaT.
df["datetime"] = pd.to_datetime(df.iloc[:, 2], errors='coerce')
# Calendar day (time of day dropped), used to count rows per day
df["date"] = df["datetime"].dt.date
# Year-month key such as "2023-04", so months of different years stay separate
df["month"] = df["datetime"].dt.strftime('%Y-%m')

# Token cleaning function
def clean_tokens(tokens):
    """Return the normalised, purely alphabetic tokens of one document.

    Each token is lower-cased and stripped of every non-word character.
    A token is kept only if what remains is longer than two characters and
    consists solely of letters, so anything still containing digits or
    underscores is discarded. Input order is preserved.
    """
    stripped = (re.sub(r'\W+', '', raw.lower()) for raw in tokens)
    return [word for word in stripped if len(word) > 2 and word.isalpha()]

# Find the peak day of each month: the date with the most rows in that month
monthly_peaks = df.groupby("month")["date"].apply(
    lambda days: days.value_counts().idxmax()
)

# Topic modeling for each peak day
results = []

for month, peak_day in monthly_peaks.items():
    peak_df = df[df["date"] == peak_day]

    # Clean every document of the peak day; drop those left without tokens
    tokens_list = [
        doc_tokens
        for doc_tokens in peak_df["Tokens"].apply(clean_tokens)
        if doc_tokens
    ]
    if not tokens_list:
        continue

    # Build dictionary and remove rare/common tokens (seen in fewer than 2
    # documents or in more than 80% of them)
    dictionary = corpora.Dictionary(tokens_list)
    dictionary.filter_extremes(no_below=2, no_above=0.8)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens_list]

    # Nothing to train on when the vocabulary or every bag-of-words is empty
    if len(dictionary) == 0 or not any(corpus):
        continue

    # Train LDA model; random_state is fixed so repeated runs agree
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=5,
        random_state=42,
        passes=15,
        iterations=100,
        minimum_probability=0.01
    )

    # Sum each word's weight over the top-10 terms of every topic
    keyword_counter = Counter()
    for topic_id in range(lda_model.num_topics):
        keyword_counter.update(dict(lda_model.show_topic(topic_id, topn=10)))

    total_score = sum(keyword_counter.values())
    if not total_score:
        continue

    # Ten heaviest words, each expressed as a share of the total weight
    normalized_keywords = [
        (word, round(score / total_score, 3))
        for word, score in keyword_counter.most_common(10)
    ]
    formatted_keywords = ", ".join(
        f"{word} {score}" for word, score in normalized_keywords
    )

    if normalized_keywords:
        dominant = normalized_keywords[0][0]
    else:
        dominant = "Unknown"

    results.append((month, formatted_keywords, dominant))

# Save results
# `results` holds one (month, formatted keywords, dominant keyword) tuple per
# month that produced a model; the tuple fields map onto the columns below.
result_df = pd.DataFrame(results, columns=["Month", "Top 10 Keywords (with weights)", "Dominant Keyword"])
output_file = "monthly_peakday_topics_LDA_weighted.xlsx"
# index=False keeps the DataFrame's row index out of the sheet
result_df.to_excel(output_file, index=False)

# Download file
# Hands the workbook written above to Colab's files.download
files.download(output_file)