In [None]:
# ------------------------------------------
# Full EDA Notebook: Time Series + Topic Modeling + Publisher Analysis
# ------------------------------------------

# Step 1: Setup paths
import sys
from pathlib import Path
import os

# Locate the project relative to where the notebook is running: the project
# root is taken to be the parent of the current working directory.
# NOTE(review): this presumably expects the notebook to be launched from a
# sub-folder of the project (e.g. notebooks/) -- confirm.
project_root = Path.cwd().resolve().parent

# Make the project's local modules under src/ importable.
src_path = project_root.joinpath("src")
sys.path.append(str(src_path))
print("SRC path added:", src_path)

# All generated artifacts go to outputs/; create it if it is missing.
output_dir = project_root.joinpath("outputs")
output_dir.mkdir(parents=True, exist_ok=True)
print("Outputs will be saved to:", output_dir)

# Input CSV consumed by every analyzer below.
data_path = project_root.joinpath("data", "preprocessed_data.csv")
print("Using preprocessed data at:", data_path)

# Step 2: Import libraries
import importlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" style globally so it is in effect for the plots drawn below.
sns.set(style="whitegrid")

# Topic Modeling visualization
import pyLDAvis
import pyLDAvis.sklearn    # pyright: ignore[reportMissingImports]

# Step 3: Reload modules
import time_analysis
import topic_modeling
import publication_analysis

# Re-execute each project module so that edits made on disk after the first
# import are picked up without restarting the kernel.
for _stale_module in (time_analysis, topic_modeling, publication_analysis):
    importlib.reload(_stale_module)

from time_analysis import TimeSeriesAnalyzer
from topic_modeling import TopicModeler
from publication_analysis import PublisherAnalyzer

# ------------------------------------------
# Part 1: Time Series Analysis
# ------------------------------------------
print("\n=== Time Series Analysis ===")
ts_analyzer = TimeSeriesAnalyzer(str(data_path))
# run() yields three results, unpacked here as daily counts, spikes and hourly counts.
daily_counts, spikes, hourly_counts = ts_analyzer.run()

# Persist the tabular results as CSV files in the outputs folder.
csv_exports = (
    (daily_counts, "daily_article_counts.csv"),
    (hourly_counts, "hourly_article_counts.csv"),
)
for table, csv_name in csv_exports:
    table.to_csv(output_dir / csv_name)

# Each entry pairs a callable that draws one figure with the PNG name it is
# saved under: daily trend, hourly distribution, weekday-hour heatmap.
figure_jobs = (
    (lambda: ts_analyzer.plot_daily_trend(spikes=spikes), "daily_trend.png"),
    (ts_analyzer.plot_hourly_distribution, "hourly_distribution.png"),
    (ts_analyzer.plot_weekday_hour_heatmap, "weekday_hour_heatmap.png"),
)
for draw_figure, png_name in figure_jobs:
    draw_figure()
    # Save before show(): the figure is written to disk first, then displayed.
    plt.savefig(output_dir / png_name, bbox_inches='tight')
    plt.show()

# ------------------------------------------
# Part 2: Topic Modeling
# ------------------------------------------
print("\n=== Topic Modeling ===")
# Fit a 3-topic model on the preprocessed data using the project's TopicModeler.
topic_modeler_obj = TopicModeler(
    data_path=str(data_path),
    num_topics=3,
    max_features=1000,
    sample_size=1000  # optional for speed
)
topics = topic_modeler_obj.run()

# Save topics as text.
# An explicit UTF-8 encoding keeps the file identical across platforms instead
# of depending on the locale's default encoding.
# NOTE(review): f.write() requires a str, so this assumes run() returns a
# single string -- confirm against TopicModeler.run().
with open(output_dir / "topics.txt", "w", encoding="utf-8") as f:
    f.write(topics)

# Interactive pyLDAvis visualization built from the fitted model, the
# document-term matrix and the vectorizer exposed by the modeler.
lda_vis_data = pyLDAvis.sklearn.prepare(
    topic_modeler_obj.lda_model,
    topic_modeler_obj.doc_term_matrix,
    topic_modeler_obj.vectorizer,
    mds='tsne'
)
# NOTE(review): with IPython's default settings only the last expression of a
# cell is rendered automatically, so this mid-cell call may show nothing;
# the saved HTML below is the reliable artifact. Consider moving this call to
# the end of its own cell.
pyLDAvis.display(lda_vis_data)

# Save pyLDAvis HTML.
# save_html() accepts a filename string or a writable file object; a
# pathlib.Path is neither, so the path is converted to str explicitly.
pyLDAvis.save_html(lda_vis_data, str(output_dir / "lda_vis.html"))

# ------------------------------------------
# Part 3: Publisher Analysis
# ------------------------------------------
print("\n=== Publisher Analysis ===")
pub_analyzer = PublisherAnalyzer(str(data_path))
# run_full_analysis() yields three results, unpacked here as publisher counts,
# domain counts and the news-type distribution (computed over the "stock" column).
publisher_counts, domain_counts, news_type_dist = pub_analyzer.run_full_analysis(top_n=10, type_column="stock")

# Save CSV outputs
publisher_counts.to_csv(output_dir / "publisher_counts.csv", header=True)
domain_counts.to_csv(output_dir / "domain_counts.csv", header=True)
news_type_dist.to_csv(output_dir / "news_type_distribution.csv")

# Save and show top publisher plots (saved before show() so the figure is
# written to disk first, then displayed)
pub_analyzer.plot_top_publishers(top_n=10)
plt.savefig(output_dir / "top_publishers.png", bbox_inches='tight')
plt.show()

# Save and show top domains plots
pub_analyzer.plot_top_domains(top_n=10)
plt.savefig(output_dir / "top_domains.png", bbox_inches='tight')
plt.show()

# The check mark was previously stored as mis-decoded bytes ("âœ…"); restored
# to the intended U+2705 character.
print("\n✅ All outputs (CSV, PNG, HTML) are saved in the `outputs/` folder.")
