In [None]:
# -----------------------------
# Imports
# -----------------------------
import sys, pathlib
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make project-local modules importable from this notebook.
# The line below puts the PARENT of the current working directory at the front of sys.path.
# NOTE(review): presumably the notebook is run from a sub-folder of project_root, so the parent
# is project_root itself (not project_root/src) — confirm against the repo layout.
# The insert is unconditional, so every re-run of this cell adds another copy of the entry.
sys.path.insert(0, str(pathlib.Path.cwd().parent))            # adds project_root
# Alternative if eda_functions.py is in the same folder as this notebook:
# sys.path.insert(0, str(pathlib.Path.cwd()))

# Additionally append <current working directory>/src, skipping it when already listed.
workspace_path = os.getcwd()
src_path = os.path.join(workspace_path, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

# TODO: import the EDA helper functions here (e.g. from eda_functions) — this cell sets up
# the search path but does not import anything from it yet.

# Plot appearance: seaborn "whitegrid" style; the IPython magic renders figures inline.
sns.set(style="whitegrid")
%matplotlib inline

# -----------------------------
# 1. Function: Publication frequency over time
# -----------------------------
def plot_publication_trends(df, date_col='date', freq='D', title="Publication Frequency Over Time"):
    """
    Count articles per time bucket and draw the counts as a line chart.

    df:       DataFrame with one row per article; it is copied, so the
              caller's frame is not modified.
    date_col: name of the column holding the publication timestamps. Values
              are parsed with errors='coerce'; rows that fail to parse are
              dropped before counting.
    freq:     bucket size handed to pd.Grouper:
              'D' = daily, 'W' = weekly, 'M' = monthly
    title:    title of the figure.

    Returns a Series with the number of articles in each bucket.
    """
    articles = df.copy()
    articles[date_col] = pd.to_datetime(articles[date_col], errors='coerce')
    # Keep only the rows whose timestamp could be parsed.
    articles = articles.loc[articles[date_col].notna()]

    buckets = pd.Grouper(key=date_col, freq=freq)
    per_bucket = articles.groupby(buckets).size()

    # Draw on a fresh, wide figure using the pyplot state machine.
    plt.figure(figsize=(15, 5))
    per_bucket.plot(marker='o')
    plt.ylabel('Number of Articles')
    plt.xlabel('Time')
    plt.title(title)
    plt.tight_layout()
    plt.show()

    return per_bucket


## Publication times-of-day analysis

In [None]:


# -----------------------------
# 2. Function: Publication times-of-day analysis
# -----------------------------
def plot_publication_time_of_day(df, date_col='date', title="Publication Times of Day"):
    """
    Plots distribution of article publication times (hour of day).

    df:       DataFrame with one row per article; it is only read, never
              modified.
    date_col: name of the column holding the publication timestamps. Values
              are parsed with errors='coerce'; entries that fail to parse are
              ignored.
    title:    title of the figure.

    The bar chart always has one slot for every hour 0-23, so hours without
    any article show up as empty positions instead of being skipped.

    Returns a Series of article counts indexed by hour, sorted by hour. Only
    hours that occur at least once are present in the returned Series.

    NOTE(review): the hour is read from the timestamps as parsed; no timezone
    conversion is done here — confirm that matches the intended analysis.
    """
    # Only the timestamp column is needed, so parse it on its own rather than
    # copying the whole frame.
    timestamps = pd.to_datetime(df[date_col], errors='coerce').dropna()

    # Name the Series 'hour' so the returned counts carry the same label as before.
    hours = timestamps.dt.hour.rename('hour')

    plt.figure(figsize=(12, 4))
    # Pin the category order to the full day: without `order`, countplot draws
    # only the hours present in the data, placing non-adjacent hours side by side.
    sns.countplot(x='hour', data=hours.to_frame(), order=list(range(24)), color='skyblue')
    plt.title(title)
    plt.xlabel('Hour of Day')
    plt.ylabel('Number of Articles')
    plt.tight_layout()
    plt.show()

    # Return a Series with counts per hour
    return hours.value_counts().sort_index()


## Analysis of publishing times

In [None]:

# -----------------------------
# 3. Example workflow
# -----------------------------
if __name__ == "__main__":
    # Load and clean your processed data.
    # The CSV location can be overridden with the ANALYST_RATINGS_CSV environment
    # variable so the notebook runs on machines other than the author's; when the
    # variable is not set, the original local path is used.
    processed_file = os.environ.get(
        "ANALYST_RATINGS_CSV",
        r"D:\Python\Week-1\Data-Week-1\processed_analyst_ratings.csv",
    )
    df = pd.read_csv(processed_file)

    # 1. Publication frequency over time (daily)
    daily_counts = plot_publication_trends(df, freq='D')

    # Optionally, weekly or monthly for smoother trends
    weekly_counts = plot_publication_trends(df, freq='W', title="Weekly Publication Frequency")
    # NOTE(review): recent pandas releases deprecate the 'M' alias in favour of 'ME';
    # confirm against the installed pandas version before changing it.
    monthly_counts = plot_publication_trends(df, freq='M', title="Monthly Publication Frequency")

    # 2. Publication times-of-day
    hourly_counts = plot_publication_time_of_day(df)
