# Task-1: Exploratory Data Analysis (EDA) - Descriptive Statistics

This notebook performs **exploratory data analysis (EDA)** on the preprocessed news dataset, including:

1. **Data Preprocessing**  
   - Handling missing values  
   - Text-specific cleaning  
   - Date parsing and formatting  

2. **Descriptive Statistics**  
   - Headline length statistics  
   - Publisher activity  
   - Trends over time  

3. **Time Series Analysis**  
   - Daily article frequency  
   - Spike detection for market events  
   - Hourly distribution of news  
   - Weekday-hour heatmap  

4. **Topic Modeling (NLP)**  
   - Cleaning text and removing stopwords  
   - Vectorization with CountVectorizer  
   - Latent Dirichlet Allocation (LDA) for topic extraction  
   - Displaying top words per topic  

5. **Publisher Analysis**  
   - Top contributing publishers  
   - Email domains, when publishers are identified by email addresses  
   - Distribution of news type per publisher  

---

## Descriptive Statistics

In [None]:
import os
import sys
from pathlib import Path

# Step 1: Identify project root (parent of the notebooks/ working directory)
project_root = Path().resolve().parent

# Step 2: Make src/ importable. Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# Step 3: Import loader (project-local module living in src/)
from data_loader import NewsDataLoader

print("Import successful! Using src folder:", src_path)

# Step 4: Resolve the data directory relative to the project root, the same
# way the later sections of this notebook locate preprocessed_data.csv, so the
# notebook runs on any machine. Set the NEWS_DATA_DIR environment variable to
# point at a data folder that lives somewhere else.
BASE_DATA_DIR = Path(os.environ.get("NEWS_DATA_DIR", project_root / "data"))

print("Base data directory set to:", BASE_DATA_DIR)

# Initialize
loader = NewsDataLoader(BASE_DATA_DIR)  # pylint: disable=undefined-variable

# Load dataset
df = loader.load("raw_analyst_ratings.csv")

# Only the last bare expression of a cell is rendered, so every intermediate
# summary is shown explicitly. display() is provided by the IPython kernel.
display(df.head())
display(df.describe())
display(df["publisher"].value_counts())
df.info()


In [None]:
from data_preprocess import NewsDataPreprocessor

# Run the project's preprocessing on the raw frame and hand the result to the
# preprocessor's save() method.
# NOTE(review): save() presumably writes the preprocessed_data.csv that the
# later sections read - confirm against src/data_preprocess.py.
processor = NewsDataPreprocessor(BASE_DATA_DIR)
df_clean = processor.preprocess(df)
processor.save(df_clean)

# Only the last bare expression of a cell is rendered, so every intermediate
# summary is shown explicitly. display() is provided by the IPython kernel.
display(df_clean.head())
display(df_clean.describe())
display(df_clean["publisher"].value_counts())
df_clean.info()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# 1.1 Text length: number of characters in each headline.
# Values are cast to str first, so non-string entries are measured by the
# length of their string form.
df["headline_length"] = df["headline"].astype(str).map(len)

# Summary statistics of the new column
text_stats = df["headline_length"].describe()
print("Headline Length Statistics:\n", text_stats)

# Histogram of headline lengths, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 5))
df["headline_length"].plot(kind="hist", bins=40, ax=ax)
ax.set_title("Distribution of Headline Lengths")
ax.set_xlabel("Headline Length (characters)")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# 1.2 Number of articles per publisher
publisher_counts = df["publisher"].value_counts()

print("\nTop Publishers:\n")
print(publisher_counts.head(20))

# Bar chart of the 15 largest counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
publisher_counts.head(15).plot(kind="bar", ax=ax)
ax.set_title("Top 15 Most Active Publishers")
ax.set_xlabel("Publisher")
ax.set_ylabel("Number of Articles")
# Tilt the publisher names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
plt.show()



In [None]:
# Ensure date is datetime. Values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
# NOTE(review): if the raw column mixes UTC offsets, pd.to_datetime may need
# utc=True to return a datetime dtype usable with .dt - confirm against the data.
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# groupby drops NaT keys, so rows without a usable date would vanish from the
# trends silently. Report how many are affected before aggregating.
n_undated = int(df["date"].isna().sum())
if n_undated:
    print(
        f"Warning: {n_undated} of {len(df)} rows have a missing or unparseable "
        "date and are excluded from the volume trends."
    )

# Article counts at three granularities
daily_counts = df.groupby(df["date"].dt.date).size()
weekly_counts = df.groupby(df["date"].dt.to_period("W")).size()
monthly_counts = df.groupby(df["date"].dt.to_period("M")).size()

# One line chart per granularity; the three plots differ only in the series,
# the title and the x-axis label.
for counts, title, xlabel in (
    (daily_counts, "Daily News Volume Over Time", "Date"),
    (weekly_counts, "Weekly News Volume Trend", "Week"),
    (monthly_counts, "Monthly News Volume Trend", "Month"),
):
    fig, ax = plt.subplots(figsize=(12, 6))
    counts.plot(ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Number of Articles")
    plt.show()


## Topic Modeling

In [None]:

import sys
from pathlib import Path

# Locate the project root (one level above the current working directory)
# and expose its src/ folder to the import system.
project_root = Path().resolve().parent
src_path = project_root / "src"
sys.path.append(str(src_path))
print("SRC path added:", src_path)

from topic_modeling import TopicModeler

# Input file for topic extraction
data_path = project_root / "data" / "preprocessed_data.csv"

# Modeler settings gathered in one place so they are easy to tune
modeler_settings = {
    "data_path": str(data_path),
    "num_topics": 4,
    "max_features": 1000,
    "sample_size": 1000,  # optional for speed
}
modeler = TopicModeler(**modeler_settings)

topics = modeler.run()


## Time Series Analysis

In [None]:
# ------------------------------------------
# Time Series Analysis Notebook
# ------------------------------------------

# Step 1: Libraries
import importlib
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Step 2: Add src folder to Python path (project root = parent of notebooks)
project_root = Path().resolve().parent
src_path = project_root / "src"
sys.path.append(str(src_path))

print("SRC path added:", src_path)

# Step 3: Import the analysis module and reload it so that edits made to
# time_analysis.py since the first import are picked up.
import time_analysis  # your module

time_analysis = importlib.reload(time_analysis)
TimeSeriesAnalyzer = time_analysis.TimeSeriesAnalyzer

# Optional: Seaborn style
sns.set(style="whitegrid")

# Step 4: Point the analyzer at the preprocessed data
data_path = project_root / "data" / "preprocessed_data.csv"
print("Using preprocessed data at:", data_path)
analyzer = TimeSeriesAnalyzer(str(data_path))

# Step 5: Run the standard analysis, then draw the weekday-hour heatmap
daily_counts, spikes, hourly_counts = analyzer.run()
analyzer.plot_weekday_hour_heatmap()

# Step 6: Optional - inspect results
for heading, result in (
    ("\n--- Daily Counts (first 10 rows) ---", daily_counts.head(10)),
    ("\n--- Detected Spikes ---", spikes),
    ("\n--- Hourly Distribution ---", hourly_counts),
):
    print(heading)
    print(result)






## Publisher Analysis

In [None]:
# ------------------------------------------
# Publisher Analysis Notebook
# ------------------------------------------

# Step 1: Libraries
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Step 2: Add src folder to Python path (project root = parent of notebooks)
project_root = Path().resolve().parent
src_path = project_root / "src"
sys.path.append(str(src_path))
print("SRC path added:", src_path)

from publication_analysis import PublisherAnalyzer

# Optional: Seaborn style
sns.set(style="whitegrid")

# Step 3: Point the analyzer at the preprocessed data
data_path = project_root / "data" / "preprocessed_data.csv"
print("Using preprocessed data at:", data_path)

# Step 4: Initialize PublisherAnalyzer
analyzer = PublisherAnalyzer(str(data_path))

# Step 5: Run the full publisher analysis. The call returns three results:
# per-publisher counts, domain counts, and the news type distribution
# (computed here from the 'stock' column).
analysis_options = {"top_n": 10, "type_column": "stock"}
publisher_counts, domain_counts, news_type_dist = analyzer.run_full_analysis(**analysis_options)

# Step 6: Inspect the first 10 rows of each result
for heading, table in (
    ("\n--- Top 10 Publishers ---", publisher_counts),
    ("\n--- Top 10 Domains ---", domain_counts),
    ("\n--- News Type Distribution per Publisher (first 10 rows) ---", news_type_dist),
):
    print(heading)
    print(table.head(10))
