# Major Imports

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer


# --- 1. Load Data ---

In [None]:
# Load the raw financial-news CSV into `news_df`.
try:
    # Adjust path if your CSV is elsewhere
    news_df = pd.read_csv('../data/raw/financial_news.csv')
except FileNotFoundError:
    print("Error: financial_news.csv not found. Please ensure it's in the 'data/raw' directory.")
    # Re-raise instead of continuing: without the file `news_df` is never
    # bound, so every later statement would fail with a confusing NameError
    # that hides the real cause.
    raise


# Quick structural overview of the loaded frame: dimensions, column summary,
# a preview of the first rows, and the count of missing values per column.
print("--- Data Understanding ---")
frame_shape = news_df.shape
print("Shape:", frame_shape)

print("\nInfo:")
news_df.info()  # prints its own report; nothing to wrap in print()

print("\nFirst 5 rows:")
preview_rows = news_df.head()
print(preview_rows)

print("\nMissing values:")
missing_per_column = news_df.isnull().sum()
print(missing_per_column)

# --- 2. Data Cleaning & Preprocessing (Basic) ---

In [None]:
# Convert 'date' to datetime objects. errors='coerce' turns values that cannot
# be parsed into NaT instead of raising.
# NOTE(review): if the column mixes UTC offsets or timestamp formats, the
# result may not be a uniform datetime column (or valid rows may be coerced to
# NaT); consider utc=True and/or an explicit format -- confirm against the data.
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce')

# Drop rows where date conversion failed (if any)
news_df.dropna(subset=['date'], inplace=True)

# Ensure 'headline' is string. Record the missing entries first: astype(str)
# would otherwise turn NaN/None into the literal text 'nan'/'None', which any
# downstream text processing would treat as a real word. Missing headlines
# become empty strings instead.
missing_headline = news_df['headline'].isna()
news_df['headline'] = news_df['headline'].astype(str)
news_df.loc[missing_headline, 'headline'] = ''