In [None]:
import sys
from pathlib import Path

# The notebook is expected to be launched from a sub-directory of the
# repository, so ".." resolves to the project root and "<root>/src" holds the
# importable package.
project_root = Path("..").resolve()
src_path = project_root / "src"

# Make both locations importable. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
for _extra_path in (str(project_root), str(src_path)):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)


In [None]:

from pathlib import Path
import logging
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from fns_project.data.news_pipeline import (
    NewsLoader,
    NewsPreprocessor,
    NewsAggregator,
    NewsAnalysisService,
)

from fns_project.data.preprocess import add_headline_metrics
from fns_project.config import RAW_DIR
# Import the EDA helpers through the same top-level package name as the other
# project imports. Mixing "src.fns_project" and "fns_project" would load the
# package twice under two different module names.
from fns_project.analysis.eda import (
    headline_length_stats,
    count_articles_per_publisher,
    publication_trend_by_date,
    publication_trend_by_time,
    extract_top_keywords,
    extract_publisher_domains
)

# Notebook-wide logging: INFO and above, timestamped.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

In [None]:
# Instantiate loader + preprocessor (no aligner; Task 1 doesn't require price data)

# Raw analyst-ratings CSV, located under RAW_DIR from the project config.
NEWS_FILE = RAW_DIR / "news/raw_analyst_ratings.csv"

# "Etc/GMT+4" uses the POSIX sign convention, i.e. it denotes UTC-4.
# NOTE(review): how NewsLoader applies `tz` to `date_col` is not visible here - confirm.
loader = NewsLoader(
    path=NEWS_FILE,
    date_col="date",
    tz="Etc/GMT+4",
)

# Text preprocessing for the headline column; lemmatization is optional.
preprocessor = NewsPreprocessor(
    text_col="headline",
    remove_stopwords=True,
    lemmatize=True,
)

# Wire the service with only the loader and preprocessor; the aligner and
# aggregator stages are left unset.
service = NewsAnalysisService(
    loader=loader,
    preprocessor=preprocessor,
    aligner=None,
    aggregator=None,
)

print("News file being used:", NEWS_FILE)

In [None]:
# Run the pipeline with preprocessing only; alignment and aggregation are
# switched off, matching the service built without an aligner/aggregator.
df_news = service.run_pipeline(
    preprocess=True,
    align=False,
    aggregate=False,
)

print("Processed News DataFrame:")
display(df_news.head())

In [None]:
# Derive per-headline metrics from the "headline" column.
# NOTE(review): whether add_headline_metrics copies or mutates df_news is not
# visible from this notebook - confirm before relying on df_news afterwards.
df_news_metrics = add_headline_metrics(
    df_news,
    text_col="headline",
)
display(df_news_metrics.head())

In [None]:
# Descriptive statistics for the headline metrics computed in the previous step.
stats = headline_length_stats(df_news_metrics)
print("Descriptive stats for headlines:", stats, sep="\n")

In [None]:
# Article counts per publisher; the result is used below through its
# "publisher" and "article_count" columns.
publisher_counts = count_articles_per_publisher(df_news_metrics, col="publisher")
print("Top publishers by article count:", publisher_counts.head(10), sep="\n")

# Optional bar plot of the first 20 rows.
# NOTE(review): seaborn >= 0.13 warns when `palette` is passed without `hue`;
# revisit once the project's seaborn version is pinned.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=publisher_counts.head(20),
    x="article_count",
    y="publisher",
    palette="viridis",
    ax=ax,
)
ax.set_title("Top 20 Active Publishers")
plt.show()

In [None]:
# Count articles per date using the 'date_naive' column. The result keeps the
# date column under the name passed as date_col, so it is renamed to 'date'
# for convenience when plotting.
trend_by_date = publication_trend_by_date(
    df_news_metrics,
    date_col='date_naive',
).rename(columns={'date_naive': 'date'})

# Line chart of article volume over time.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(trend_by_date["date"], trend_by_date["article_count"], marker='o')
ax.set_title("Number of Articles Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Article Count")
ax.tick_params(axis="x", labelrotation=45)  # tilt date labels for readability
ax.grid(True)
plt.show()

In [None]:
# Article counts by hour, derived from the 'date_naive' column; the result is
# plotted through its "hour" and "article_count" columns.
trend_by_hour = publication_trend_by_time(
    df_news_metrics,
    date_col='date_naive',
)

fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(
    data=trend_by_hour,
    x="hour",
    y="article_count",
    palette="coolwarm",
    ax=ax,
)
ax.set_title("Article Counts by Hour")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Article Count")
plt.show()

In [None]:
# Extract the 20 most frequent headline keywords. max_features and min_df are
# passed straight through to extract_top_keywords.
top_keywords = extract_top_keywords(
    df_news_metrics,
    text_col="headline",
    top_n=20,
    max_features=2000,
    min_df=3,
)
print("Top 20 keywords in headlines:", top_keywords, sep="\n")

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=top_keywords,
    x="count",
    y="keyword",
    palette="magma",
    ax=ax,
)
ax.set_title("Top 20 Headline Keywords")
plt.show()

In [None]:
# Publisher domains taken from the "publisher" column; the result is used
# below through its "domain" and "count" columns.
# NOTE(review): presumably domains come from email-style publisher names -
# confirm in extract_publisher_domains.
domains = extract_publisher_domains(df_news_metrics, col="publisher")
print("Top publisher domains:")
print(domains.head(10))

# Bar chart of the first 15 rows.
plt.figure(figsize=(10, 5))
sns.barplot(x="count", y="domain", data=domains.head(15), palette="cubehelix")
plt.title("Most Frequent Publisher Domains")
plt.show()