In [None]:
# -----------------------------
# Imports
# -----------------------------
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the local "src" directory importable; appended only once so that
# re-running this cell does not pile up duplicate sys.path entries.
workspace_path = os.getcwd()
src_path = os.path.join(workspace_path, "src")
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

# EDA helper functions are defined inline below (nothing is imported from src here)

# Plot settings
sns.set(style="whitegrid")
%matplotlib inline

# -----------------------------
# 1. Function: Publisher article counts
# -----------------------------
def plot_top_publishers(df, top_n=20, title="Top Publishers by Article Count"):
    """
    Plot the most active publishers as a horizontal bar chart.

    Args:
        df: DataFrame containing a 'publisher' column.
        top_n: Maximum number of publishers to include.
        title: Title drawn above the chart.

    Returns:
        The article counts per publisher (most frequent first, at most
        ``top_n`` entries). When there is nothing to count, the empty
        counts are returned and no figure is drawn.
    """
    # value_counts() builds a new Series, so the caller's frame is never
    # mutated and no defensive copy of the whole DataFrame is needed.
    counts = df['publisher'].value_counts().head(top_n)

    # Nothing to draw: report it and hand back the (empty) counts instead of
    # asking seaborn to render an empty bar chart.
    if counts.empty:
        print("No publishers found to plot.")
        return counts

    plt.figure(figsize=(12,5))
    sns.barplot(x=counts.values, y=counts.index, palette="Blues_d")
    plt.title(title)
    plt.xlabel('Number of Articles')
    plt.ylabel('Publisher')
    plt.tight_layout()
    plt.show()

    return counts

# -----------------------------
# 2. Function: Analyze publisher domains if emails are used
# -----------------------------
def analyze_email_publishers(df):
    """
    Count articles per e-mail domain for publishers recorded as e-mail addresses.

    Args:
        df: DataFrame containing a 'publisher' column.

    Returns:
        Article counts indexed by e-mail domain (most frequent first), or
        ``None`` when no publisher value looks like an e-mail address.
        The 20 most frequent domains are also shown as a bar chart.
    """
    publishers = df['publisher']

    # A publisher counts as an e-mail when it has exactly one '@' and a dot
    # somewhere after it; missing values are treated as non-matches.
    email_mask = publishers.str.contains(r'^[^@]+@[^@]+\.[^@]+$', regex=True, na=False)

    if not email_mask.any():
        print("No email addresses detected as publishers.")
        return None

    # expand=False yields a Series (not a one-column DataFrame). Domains are
    # case-insensitive, so normalise case and stray whitespace before counting;
    # otherwise "Gmail.com" and "gmail.com" would be reported separately.
    domains = (
        publishers[email_mask]
        .str.extract(r'@(.+)$', expand=False)
        .str.strip()
        .str.lower()
        .rename('domain')
    )

    domain_counts = domains.value_counts()

    # Only the 20 most frequent domains are plotted; the full counts are returned.
    top_domains = domain_counts.head(20)

    plt.figure(figsize=(12,5))
    sns.barplot(x=top_domains.values, y=top_domains.index, palette="Greens_d")
    plt.title("Top Email Domains by Article Count")
    plt.xlabel("Number of Articles")
    plt.ylabel("Domain")
    plt.tight_layout()
    plt.show()

    return domain_counts

# -----------------------------
# 3. Optional: Explore types of news by publisher
# -----------------------------
def news_by_publisher(df, publisher_name, top_n_words=20):
    """
    Return the most frequent headline words for a single publisher.

    Args:
        df: DataFrame containing 'publisher' and 'headline' columns.
        publisher_name: Publisher whose headlines are analysed (exact match).
        top_n_words: Maximum number of (word, count) pairs to return.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count, or
        ``None`` when the publisher has no non-missing headlines. English
        stop words are excluded and the vocabulary is capped at 1000 terms.
    """
    text = df.loc[df['publisher'] == publisher_name, 'headline'].dropna().astype(str)

    if text.empty:
        print(f"No headlines found for publisher {publisher_name}.")
        return None

    # Imported lazily so the heavier dependencies are only required when there
    # is actually text to vectorise.
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(stop_words='english', max_features=1000)
    X = vectorizer.fit_transform(text)
    words = vectorizer.get_feature_names_out()

    # Summing a sparse matrix over axis 0 gives a 1 x n_words 2-D result.
    # Flatten it to one total per word; zipping the 2-D result directly would
    # pair only the first word with the entire row of counts.
    totals = np.asarray(X.sum(axis=0)).ravel().tolist()

    word_counts = sorted(zip(words.tolist(), totals), key=lambda pair: pair[1], reverse=True)

    return word_counts[:top_n_words]

# -----------------------------
# 4. Example workflow
# -----------------------------
if __name__ == "__main__":
    # Load processed data
    # NOTE: machine-specific absolute path; adjust it when running elsewhere.
    processed_file = r"D:\Python\Week-1\Data-Week-1\processed_analyst_ratings.csv"
    df = pd.read_csv(processed_file)

    # 1. Top publishers
    top_publishers = plot_top_publishers(df, top_n=20)
    print("\nTop Publishers:\n", top_publishers)

    # 2. Analyze email publishers (returns None when no e-mail publishers exist)
    email_domains = analyze_email_publishers(df)
    if email_domains is not None:
        print("\nTop Email Domains:\n", email_domains.head(20))

    # 3. Example: Check common words for the most active publisher.
    # Guard the lookup: indexing an empty result would raise IndexError.
    if top_publishers.empty:
        print("\nNo publishers available; skipping headline word analysis.")
    else:
        example_publisher = top_publishers.index[0]  # most active publisher
        common_words = news_by_publisher(df, example_publisher, top_n_words=15)
        # news_by_publisher returns None (after printing why) when the
        # publisher has no usable headlines.
        if common_words is not None:
            print(f"\nMost common words in headlines for '{example_publisher}':\n", common_words)