<a href="https://colab.research.google.com/github/MonaFaghfouri/Topic_Modeling/blob/main/Topic_Modeling_by_Time.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install compatible versions of required libraries.
# numpy and pandas are pinned to exact versions; gensim and openpyxl are left
# unpinned. --force-reinstall reinstalls packages even when already present,
# and -q reduces pip's output.
# NOTE(review): replacing numpy in a live session may require a runtime
# restart before the next cell runs — confirm in the target environment.
!pip install --upgrade --force-reinstall numpy==1.23.5 pandas==1.5.3 gensim openpyxl -q

In [None]:
# Ask the user for the input workbook; the first entry of `uploaded` is the
# file that gets read with pd.read_excel further down in this cell.
from google.colab import files
uploaded = files.upload()

import pandas as pd
import numpy as np
from datetime import datetime
from collections import Counter
from gensim import corpora
from gensim.models import LdaModel
import ast
import re

# Read the first uploaded workbook into a DataFrame
workbook_name = next(iter(uploaded))
df = pd.read_excel(workbook_name)

# The second column stores each token list as a Python-literal string;
# parse every cell back into a real list
df["Tokens"] = df.iloc[:, 1].map(ast.literal_eval)

# Parse the third column as datetimes (values that cannot be parsed are
# coerced to NaT) and keep only the month name for grouping
dates = pd.to_datetime(df.iloc[:, 2], errors='coerce')
df["Month"] = dates.dt.strftime("%B")

# Order in which months are processed and reported: April through February.
# A month that is not listed here is never modelled.
custom_month_order = [
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
    "January",
    "February",
]

# Function to clean tokens
def clean_tokens(tokens):
    """Return the lower-cased, purely alphabetic tokens longer than two characters.

    Each string token is lower-cased and stripped of non-word characters
    (anything other than letters, digits and underscore).  It is kept only
    if what remains is longer than two characters and consists solely of
    letters, so a token that still contains a digit or an underscore is
    dropped entirely.

    Items that are not strings (for example numbers or ``None`` inside a
    parsed token list) are skipped instead of raising ``AttributeError``.

    Args:
        tokens: Iterable holding the tokens of one document.

    Returns:
        list: The surviving tokens, in their original order.
    """
    strip_non_word = re.compile(r'\W+').sub  # look the pattern up once per call
    cleaned = []
    for token in tokens:
        if not isinstance(token, str):
            continue  # nothing sensible to normalise
        token = strip_non_word('', token.lower())
        if len(token) > 2 and token.isalpha():  # Remove short/non-alpha tokens
            cleaned.append(token)
    return cleaned

# Fit one LDA model per month and record that month's most shared keywords
month_top_keywords = []

for month in custom_month_order:
    month_df = df[df["Month"] == month]
    if month_df.empty:
        continue  # no rows fall in this month

    # Clean every document, discarding those left without any token
    month_tokens = [doc for doc in map(clean_tokens, month_df["Tokens"]) if doc]

    # Vocabulary restricted to tokens found in at least 2 documents and in
    # no more than 80% of them
    dictionary = corpora.Dictionary(month_tokens)
    dictionary.filter_extremes(no_below=2, no_above=0.8)
    corpus = [dictionary.doc2bow(doc) for doc in month_tokens]

    # Skip the month when filtering emptied the vocabulary or every document
    if len(dictionary) == 0 or not any(corpus):
        continue

    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=5,
        random_state=42,  # fixed seed
        passes=15,
        iterations=100,
        minimum_probability=0.01,
    )

    # For every word, count how many topics list it among their 10 top terms
    keyword_counter = Counter(
        word
        for topic_id in range(lda_model.num_topics)
        for word, _ in lda_model.show_topic(topic_id, topn=10)
    )

    # The 10 words shared by the most topics; the first one is the
    # month's dominant keyword
    top_keywords = [word for word, _ in keyword_counter.most_common(10)]
    month_top_keywords.append({
        "Month": month,
        "Top Keywords": ", ".join(top_keywords),
        "Dominant Keyword": top_keywords[0] if top_keywords else "",
    })

# Write one row per modelled month to an Excel workbook and hand it to the
# browser for download
output_filename = "monthly_keywords_LDA_Improved.xlsx"
result_df = pd.DataFrame(month_top_keywords)
result_df.to_excel(output_filename, index=False)
files.download(output_filename)


In [None]:
# Install the packages that provide the arabic_reshaper and bidi modules
# imported below.
!pip install arabic_reshaper python-bidi

import pandas as pd
import arabic_reshaper
from bidi.algorithm import get_display
from collections import Counter
from gensim import corpora
from gensim.models import LdaModel
import ast
import re
from google.colab import files

# Prompt for the workbook and load the first uploaded file
print("Please upload your Excel file:")
uploaded = files.upload()
workbook_name = next(iter(uploaded))
df = pd.read_excel(workbook_name)

# Second column: token lists stored as Python-literal strings -> real lists
df["Tokens"] = df.iloc[:, 1].map(ast.literal_eval)

# Third column: parse as datetimes (unparseable values are coerced to NaT),
# then derive the calendar day and a 'YYYY-MM' month key from it
timestamps = pd.to_datetime(df.iloc[:, 2], errors='coerce')
df["datetime"] = timestamps
df["date"] = timestamps.dt.date
df["month"] = timestamps.dt.strftime('%Y-%m')

# Clean tokens function (similar to first script)
def clean_tokens(tokens):
    """Normalise one document's tokens, keeping only alphabetic words.

    String tokens are lower-cased and have their non-word characters
    (everything except letters, digits and underscore) removed.  A token
    survives only when the remainder has more than two characters and is
    made up of letters alone; anything still containing a digit or an
    underscore is discarded.

    Non-string items (numbers, ``None``, ...) are ignored rather than
    triggering an ``AttributeError`` on ``.lower()``.

    Args:
        tokens: Iterable holding the tokens of one document.

    Returns:
        list: Cleaned tokens in their original order.
    """
    strip_non_word = re.compile(r'\W+').sub  # look the pattern up once per call
    normalised = (
        strip_non_word('', token.lower())
        for token in tokens
        if isinstance(token, str)
    )
    return [word for word in normalised if len(word) > 2 and word.isalpha()]

# For each month, pick the calendar day that has the most rows
monthly_peaks = df.groupby("month")["date"].apply(
    lambda days: days.value_counts().idxmax()
)

# One LDA model per peak day; each entry is (month, keywords, dominant keyword)
results = []

for month, peak_day in monthly_peaks.items():
    peak_df = df[df["date"] == peak_day]

    # Clean every document of that day and drop the ones left empty
    tokens_list = [doc for doc in map(clean_tokens, peak_df["Tokens"]) if doc]
    if not tokens_list:
        continue

    # Vocabulary restricted to tokens found in at least 2 documents and in
    # no more than 80% of them
    dictionary = corpora.Dictionary(tokens_list)
    dictionary.filter_extremes(no_below=2, no_above=0.8)
    corpus = [dictionary.doc2bow(doc) for doc in tokens_list]

    # Nothing to model when filtering emptied the vocabulary or every document
    if len(dictionary) == 0 or not any(corpus):
        continue

    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=5,
        random_state=42,  # fixed seed
        passes=15,
        iterations=100,
        minimum_probability=0.01,
    )

    # For every word, count how many topics list it among their 10 top terms
    keyword_counter = Counter(
        word
        for topic_id in range(lda_model.num_topics)
        for word, _ in lda_model.show_topic(topic_id, topn=10)
    )

    # Reshape and bidi-reorder each of the 10 most shared words before export
    top_10_words = [
        get_display(arabic_reshaper.reshape(word))
        for word, _ in keyword_counter.most_common(10)
    ]
    dominant = top_10_words[0] if top_10_words else "Unknown"

    results.append((month, ", ".join(top_10_words), dominant))

# Export one row per modelled month and trigger the browser download
output_file = "monthly_peakday_topics_LDA.xlsx"
result_df = pd.DataFrame(results, columns=["Month", "Top 10 Keywords", "Dominant Keyword"])
result_df.to_excel(output_file, index=False)
files.download(output_file)


In [None]:
import pandas as pd
import re
import ast
from collections import Counter
from gensim import corpora
from gensim.models import LdaModel
from google.colab import files

# Request the workbook from the user and read the first uploaded file
print("Please upload your Excel file:")
uploaded = files.upload()
first_upload = next(iter(uploaded))
df = pd.read_excel(first_upload)

# Turn the stringified token lists of the second column into Python lists
df["Tokens"] = df.iloc[:, 1].map(ast.literal_eval)

# Build the datetime column from the third column (bad values become NaT),
# plus the per-row calendar day and 'YYYY-MM' month label
parsed_times = pd.to_datetime(df.iloc[:, 2], errors='coerce')
df["datetime"] = parsed_times
df["date"] = parsed_times.dt.date
df["month"] = parsed_times.dt.strftime('%Y-%m')

# Token cleaning function
def clean_tokens(tokens):
    """Lower-case, strip and filter the tokens of a single document.

    For each string token: lower-case it, delete every non-word character
    (anything that is not a letter, digit or underscore), and keep the
    result only if it is longer than two characters and entirely
    alphabetic.  Tokens that still hold digits or underscores after
    stripping are therefore removed.

    Entries that are not strings are skipped, so a stray number or ``None``
    in the list no longer raises ``AttributeError``.

    Args:
        tokens: Iterable holding the tokens of one document.

    Returns:
        list: The retained tokens, order preserved.
    """
    strip_non_word = re.compile(r'\W+').sub  # look the pattern up once per call
    cleaned = []
    for raw in tokens:
        if not isinstance(raw, str):
            continue  # ignore non-text entries
        word = strip_non_word('', raw.lower())
        if len(word) > 2 and word.isalpha():
            cleaned.append(word)
    return cleaned

# Busiest calendar day (most rows) within each month
monthly_peaks = df.groupby("month")["date"].apply(
    lambda day_values: day_values.value_counts().idxmax()
)

# Collected output rows: (month, comma-joined keywords, dominant keyword)
results = []

for month, peak_day in monthly_peaks.items():
    on_peak_day = df["date"] == peak_day
    peak_df = df[on_peak_day]

    # Cleaned documents for that day; empty ones are filtered out
    tokens_list = list(filter(None, map(clean_tokens, peak_df["Tokens"])))
    if not tokens_list:
        continue

    # Keep only tokens present in at least 2 documents and in at most 80%
    # of them
    dictionary = corpora.Dictionary(tokens_list)
    dictionary.filter_extremes(no_below=2, no_above=0.8)
    corpus = [dictionary.doc2bow(doc) for doc in tokens_list]

    # Skip the day when no vocabulary or no non-empty document remains
    if len(dictionary) == 0 or not any(corpus):
        continue

    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=5,
        random_state=42,  # fixed seed
        passes=15,
        iterations=100,
        minimum_probability=0.01,
    )

    # Tally, per word, the number of topics whose top-10 terms include it
    keyword_counter = Counter()
    for topic_id in range(lda_model.num_topics):
        keyword_counter.update(
            word for word, _ in lda_model.show_topic(topic_id, topn=10)
        )

    # The 10 most shared words, written out unmodified
    top_10_words = [word for word, _ in keyword_counter.most_common(10)]
    dominant = top_10_words[0] if top_10_words else "Unknown"

    results.append((month, ", ".join(top_10_words), dominant))

# Write the table to Excel and offer it for download
output_file = "monthly_peakday_topics_LDA_no_reshaper.xlsx"
result_df = pd.DataFrame(results, columns=["Month", "Top 10 Keywords", "Dominant Keyword"])
result_df.to_excel(output_file, index=False)
files.download(output_file)
