# Data Exploration for Energy Equity Market Sentiment Analysis

This notebook explores the data collected for energy-sector companies, including stock prices, news sentiment, and social media data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import sys

# Add project root to path
sys.path.append('..')
from src.config import RAW, PRO, NEWS, TWEETS, TICKERS

# Notebook-wide plot appearance: seaborn grid theme plus default figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

## Load Stock Price Data

In [None]:
# Read each ticker's raw price CSV into a dict keyed by the original ticker symbol
stock_data = {}
for ticker in TICKERS:
    # Dots in a ticker symbol are replaced by underscores in the file name
    safe_name = ticker.replace('.', '_')
    csv_path = RAW / f"{safe_name}_stock_data.csv"

    # Report missing files and move on to the next ticker
    if not csv_path.exists():
        print(f"No data found for {ticker}")
        continue

    # First CSV column becomes the index and is parsed as dates
    prices = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    stock_data[ticker] = prices

    first_date, last_date = prices.index.min(), prices.index.max()
    print(f"Loaded data for {ticker}: {len(prices)} rows from {first_date} to {last_date}")

## Exploratory Data Analysis

In [None]:
# Ticker examined in the cells below
ticker = "XOM"

if ticker not in stock_data:
    print(f"No data available for {ticker}")
else:
    df = stock_data[ticker]
    first_day, last_day = df.index.min(), df.index.max()

    # Headline facts: covered period and number of rows
    print(f"=== {ticker} Stock Data ===\n")
    print(f"Date range: {first_day} to {last_day}")
    print(f"Trading days: {len(df)}")

    # Preview of the first rows
    print("\nFirst few rows:")
    display(df.head())

    # Per-column descriptive statistics
    print("\nSummary statistics:")
    display(df.describe())

## Plot Stock Price History

In [None]:
if ticker in stock_data:
    df = stock_data[ticker]

    # Both figures share the same size and axis-label font size
    wide_figure = (14, 7)
    axis_font = {'fontsize': 12}

    # Figure 1: closing price over the whole index range
    fig, ax = plt.subplots(figsize=wide_figure)
    ax.plot(df.index, df['Close'], label='Close Price')
    ax.set_title(f'{ticker} Stock Price History', fontsize=16)
    ax.set_xlabel('Date', **axis_font)
    ax.set_ylabel('Price', **axis_font)
    ax.legend()
    plt.tight_layout()
    plt.show()

    # Figure 2: daily returns, drawn only when the data carries a 'return' column
    if 'return' in df.columns:
        returns_pct = df['return'] * 100  # convert to percentage

        fig, ax = plt.subplots(figsize=wide_figure)
        ax.plot(df.index, returns_pct)
        # Faint horizontal reference line at zero return
        ax.axhline(y=0, color='r', linestyle='-', alpha=0.3)
        ax.set_title(f'{ticker} Daily Returns', fontsize=16)
        ax.set_xlabel('Date', **axis_font)
        ax.set_ylabel('Return (%)', **axis_font)
        plt.tight_layout()
        plt.show()

## Load News Data

In [None]:
# Load news data
def load_news_data(ticker):
    """Load the most recently modified news JSON file for a ticker as a DataFrame.

    Searches the ``NEWS`` directory for files named ``<ticker>_news_*.json``
    (dots in the ticker replaced by underscores), picks the file with the
    latest modification time and builds a DataFrame from its ``"articles"``
    list. Progress and failure messages are printed.

    Parameters
    ----------
    ticker : str
        Ticker symbol, e.g. ``"XOM"``.

    Returns
    -------
    pandas.DataFrame or None
        One row per article. ``None`` when no file matches the ticker or the
        file has no ``"articles"`` key. When a ``publishedAt`` column exists
        it is parsed to timezone-aware UTC timestamps.
    """
    clean_ticker = ticker.replace('.', '_')
    news_files = list(NEWS.glob(f"{clean_ticker}_news_*.json"))

    if not news_files:
        print(f"No news data found for {ticker}")
        return None

    # Get the most recent file (by modification time, not by file name)
    latest_file = max(news_files, key=lambda x: x.stat().st_mtime)
    print(f"Loading news from {latest_file}")

    # Read as UTF-8 explicitly so non-ASCII article text does not depend on
    # the platform's default encoding
    with open(latest_file, 'r', encoding='utf-8') as f:
        news_data = json.load(f)

    if 'articles' not in news_data:
        print(f"No articles found in news data for {ticker}")
        return None

    articles = news_data['articles']
    print(f"Found {len(articles)} news articles for {ticker}")

    # Convert to DataFrame
    df = pd.DataFrame(articles)

    # Convert dates; utc=True normalises timestamps with differing UTC offsets
    # into one tz-aware datetime column so the `.dt` accessor works downstream
    if 'publishedAt' in df.columns:
        df['publishedAt'] = pd.to_datetime(df['publishedAt'], utc=True)

    return df

# Load the news articles for the ticker selected earlier in the notebook
news_df = load_news_data(ticker)

if news_df is not None:
    print("\nNews data columns:")
    print(news_df.columns.tolist())

    # Preview only the columns that are actually present: an empty articles
    # list, or articles lacking one of these fields, would otherwise raise a
    # KeyError on the column selection
    preview_cols = [
        col for col in ('publishedAt', 'title', 'source') if col in news_df.columns
    ]

    print("\nFirst few news articles:")
    display(news_df[preview_cols].head())

## Analyze News Sources

In [None]:
if news_df is not None and 'source' in news_df.columns:
    # A source entry is either a dict carrying a 'name' field or already a plain value
    source_names = news_df['source'].apply(lambda x: x.get('name') if isinstance(x, dict) else x)

    # Articles per source; value_counts sorts by count, largest first
    source_counts = source_names.value_counts()

    if source_counts.empty:
        # Every source name was missing, so there is nothing to draw
        print(f"No news sources to plot for {ticker}")
    else:
        # Plot the 15 most frequent sources on an explicit Axes
        fig, ax = plt.subplots(figsize=(12, 8))
        source_counts.head(15).plot(kind='barh', ax=ax)
        # barh places the first row at the bottom; flip so the top source is on top
        ax.invert_yaxis()
        ax.set_title(f'Top News Sources for {ticker}')
        ax.set_xlabel('Number of Articles')
        ax.set_ylabel('Source')
        plt.tight_layout()
        plt.show()

## Analyze News Publication Times

In [None]:
if news_df is not None and 'publishedAt' in news_df.columns:
    # Extract hour of day, normalised to UTC so the axis label below holds
    # regardless of the offsets carried by the raw timestamps
    news_df['hour'] = pd.to_datetime(news_df['publishedAt'], utc=True).dt.hour

    # Count articles for every hour 0-23; hours with no articles get 0 so that
    # bar position i always represents hour i (a bar chart places bars by
    # position, not by the numeric value of the index)
    hour_counts = news_df['hour'].value_counts().reindex(range(24), fill_value=0)

    # Plot distribution of publication times
    fig, ax = plt.subplots(figsize=(12, 6))
    hour_counts.plot(kind='bar', ax=ax)
    ax.set_title(f'Distribution of News Publication Hours for {ticker}')
    ax.set_xlabel('Hour of Day (UTC)')
    ax.set_ylabel('Number of Articles')
    plt.tight_layout()
    plt.show()

## Next Steps

In the following notebooks, we'll:

1. Preprocess the data and extract sentiment from news articles
2. Build and train machine learning models to predict stock movements
3. Evaluate model performance and explore feature importance
4. Visualize the relationships between sentiment and stock price movements