# Major Imports

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer


# --- 1. Load Data ---

In [None]:
# Load the raw financial-news CSV into `news_df`, which every later step uses.
try:
    # Adjust path if your CSV is elsewhere
    news_df = pd.read_csv('../data/raw/financial_news.csv')
except FileNotFoundError:
    print("Error: financial_news.csv not found. Please ensure it's in the 'data/raw' directory.")
    # Re-raise instead of falling through: without the file `news_df` is never
    # bound, and the following statements would fail with a misleading NameError.
    raise


# First look at the loaded frame: dimensions, column dtypes, a sample of rows,
# and the number of missing values per column.
print("--- Data Understanding ---")
print(f"Shape: {news_df.shape}")

print("\nInfo:")
news_df.info()  # emits its own report, so it is not wrapped in print()

print("\nFirst 5 rows:", news_df.head(), sep="\n")
print("\nMissing values:", news_df.isna().sum(), sep="\n")

# --- 2. Data Cleaning & Preprocessing (Basic) ---

In [None]:
# Convert 'date' to datetime objects.
# errors='coerce' turns unparseable dates into NaT instead of raising.
# NOTE(review): the strings are said to carry a timezone offset; if the offsets
# or formats are mixed across rows, parsing may need utc=True / an explicit
# format for the later resample step to work -- confirm against the real data.
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce')

# Drop rows where date conversion failed (if any)
news_df.dropna(subset=['date'], inplace=True)

# Ensure 'headline' is string. Missing values are replaced with '' first:
# a bare astype(str) would turn NaN into the literal text "nan", which would
# then be measured as a 3-character headline and counted as a real word.
news_df['headline'] = news_df['headline'].fillna('').astype(str)

# --- 3. Descriptive Statistics ---

In [None]:
print("\n--- Descriptive Statistics ---")

# Headline length: number of characters in each headline, kept as a new column
news_df['headline_length'] = news_df['headline'].map(len)
print("\nHeadline Length Stats:", news_df['headline_length'].describe(), sep="\n")

# Histogram of the lengths (50 bins) with a kernel-density overlay
plt.figure(figsize=(10, 6))
sns.histplot(x=news_df['headline_length'], bins=50, kde=True)
plt.gca().set(
    title='Distribution of Headline Lengths',
    xlabel='Headline Length (characters)',
    ylabel='Frequency',
)
plt.show()

# Articles per publisher: count rows per publisher and keep the 20 largest
publisher_counts = news_df['publisher'].value_counts().nlargest(n=20)
print("\nTop 20 Most Active Publishers:", publisher_counts, sep="\n")

# Horizontal bars: article count on x, publisher name on y
plt.figure(figsize=(12, 8))
sns.barplot(
    x=publisher_counts.to_numpy(),
    y=publisher_counts.index,
    palette='viridis',
)
plt.gca().set(
    title='Number of Articles per Publisher (Top 20)',
    xlabel='Number of Articles',
    ylabel='Publisher',
)
plt.tight_layout()
plt.show()

# Publication date trends.
# The frame is re-indexed by 'date' in place; the index stays set after this
# step so that later time-based code can use it.
news_df.set_index(keys='date', inplace=True)

# One bucket per calendar day, counting the headlines that fall into it
articles_per_day = news_df['headline'].resample(rule='D').count()

plt.figure(figsize=(15, 7))
articles_per_day.plot()
plt.gca().set(
    title='Number of Articles Published Over Time (Daily)',
    xlabel='Date',
    ylabel='Number of Articles',
)
plt.show()

# Day of week analysis.
# Relies on the index being the parsed 'date' values (set earlier), from which
# the weekday name of each article is derived.
news_df['day_of_week'] = news_df.index.day_name()

# Put the counts in calendar order. fill_value=0 makes a weekday with no
# articles show up as 0 rather than NaN, which also keeps the counts integral.
day_of_week_counts = news_df['day_of_week'].value_counts().reindex(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    fill_value=0,
)
plt.figure(figsize=(10, 6))
sns.barplot(x=day_of_week_counts.index, y=day_of_week_counts.values)
plt.title('Number of Articles by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Number of Articles')
plt.show()

# Reset index if needed for further non-time-series operations, or keep it if mainly doing time-based
# ('date' goes back to being a regular column, which the steps below expect).
news_df.reset_index(inplace=True)


# --- 4. Text Analysis (Basic Keywords/Topic Modeling Idea) ---

In [None]:
print("\n--- Basic Text Analysis ---")
# English stop words as a set, used for the membership test below
stop_words = set(stopwords.words('english'))

# Simple keyword extraction: lowercase all headlines, merge them into a single
# space-separated string, and split that string into tokens.
all_headlines_text = ' '.join(news_df['headline'].str.lower())
words = word_tokenize(all_headlines_text)

# Keep only alphanumeric tokens that are not stop words, then tally them
filtered_words = [
    token
    for token in words
    if token.isalnum() and token not in stop_words
]
word_counts = Counter(filtered_words)
print("\nMost common words in headlines (Top 20):", word_counts.most_common(20), sep="\n")

# Bigram counts via CountVectorizer: ngram_range=(2, 2) restricts the
# vocabulary to two-word phrases and max_features=20 keeps the 20 most frequent.
vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    stop_words='english',
    max_features=20,
)
bigrams = vectorizer.fit_transform(news_df['headline'])

# Column sums give the total occurrences of each bigram; flatten to 1-D
bigram_counts = pd.DataFrame({
    'bigram': vectorizer.get_feature_names_out(),
    'count': np.asarray(bigrams.sum(axis=0)).ravel(),
})
bigram_counts = bigram_counts.sort_values(by='count', ascending=False)
print("\nMost common bigrams (Top 20):", bigram_counts, sep="\n")
