In [None]:
# Exploratory Data Analysis (EDA)

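In this notebook, we perform Exploratory Data Analysis (EDA) on two datasets: the raw analyst-ratings data (headlines and sentiment) and the historical stock prices for AAPL, AMZN, GOOG, META, MSFT, NVDA, and TSLA. Our objectives are to understand the data distributions, identify and handle missing values, and visualize key trends such as closing prices, moving averages, and daily returns.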

## 1. Import Libraries

We'll begin by importing the necessary libraries.

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
# Apply seaborn's "whitegrid" theme to every figure drawn below.
# `sns.set` is only an alias kept for backward compatibility; `set_theme`
# is the interface the seaborn documentation recommends.
sns.set_theme(style="whitegrid")
# Define file paths
# Every input CSV lives in one directory. Change DATA_DIR here (and only here)
# when running the notebook on another machine.
DATA_DIR = 'C:/Users/hayyu.ragea/AppData/Local/Programs/Python/Python312/AI_Mastery_Week_1/data'

# Ticker symbols for which a '<TICKER>_historical_data.csv' file is loaded
TICKERS = ('AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA', 'TSLA')

raw_analyst_ratings_path = f'{DATA_DIR}/raw_analyst_ratings.csv'

# Historical stock data file paths, keyed by ticker symbol
historical_data_paths = {
    ticker: f'{DATA_DIR}/{ticker}_historical_data.csv' for ticker in TICKERS
}

# Read the analyst-ratings CSV into a DataFrame
raw_analyst_ratings = pd.read_csv(raw_analyst_ratings_path)

# Read one price-history CSV per ticker; the ticker symbol stays the dict key
historical_data = {
    symbol: pd.read_csv(csv_path)
    for symbol, csv_path in historical_data_paths.items()
}

# Preview the top rows of the analyst ratings
raw_analyst_ratings.head()
# Data overview
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare rather than wrapped in print().
print("Raw Analyst Ratings Data Info:")
raw_analyst_ratings.info()

# Summary statistics
print("\nRaw Analyst Ratings Summary Statistics:")
raw_analyst_ratings.describe()
# Structure and summary statistics of the historical data for each ticker
for ticker, data in historical_data.items():
    print(f"\n{ticker} Historical Data Info:")
    data.info()  # prints itself; print(data.info()) would append a stray "None"

    # Summary statistics
    print(f"\n{ticker} Historical Data Summary Statistics:")
    print(data.describe())
# Check for missing values in raw analyst ratings
print("Missing values in Raw Analyst Ratings:")
print(raw_analyst_ratings.isnull().sum())

# Handle missing values
# Example: fill missing values in 'headline' with an empty string.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which pandas warns about and which
# does not update the parent frame under Copy-on-Write.
raw_analyst_ratings['headline'] = raw_analyst_ratings['headline'].fillna('')

# Check for missing values in historical stock data
for ticker, data in historical_data.items():
    print(f"\nMissing values in {ticker} Historical Data:")
    print(data.isnull().sum())

    # Handle missing values if necessary
    # Forward-fill gaps in 'Close'; a leading NaN has no earlier value to copy
    # and therefore stays NaN. Series.ffill() replaces the deprecated
    # fillna(method='ffill').
    data['Close'] = data['Close'].ffill()
# Convert date columns to datetime and use them as the index
def _index_by_date(frame):
    """Return `frame` indexed by its parsed 'date' column.

    Parameters:
        frame: DataFrame with a 'date' column, or one already indexed by 'date'.

    Returns:
        A new DataFrame whose index is the 'date' column converted with
        pd.to_datetime. If the index is already named 'date', `frame` is
        returned unchanged, so re-running this cell does not fail.

    Raises:
        KeyError: if `frame` has neither a 'date' column nor a 'date' index.
    """
    if frame.index.name == 'date':
        return frame
    # NOTE(review): pd.to_datetime is called with default arguments — confirm
    # that the date strings in the CSVs share one parseable format.
    return frame.assign(date=pd.to_datetime(frame['date'])).set_index('date')


raw_analyst_ratings = _index_by_date(raw_analyst_ratings)
historical_data = {
    ticker: _index_by_date(data) for ticker, data in historical_data.items()
}
# Histogram (20 bins, KDE overlay) of the analyst-rating sentiment scores.
# NOTE(review): nothing visible in this notebook creates a 'sentiment'
# column, so it is presumably supplied by the CSV — confirm.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(raw_analyst_ratings['sentiment'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Sentiment Scores')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Frequency')
plt.show()
# Line chart of the AAPL closing price (AAPL serves as the example ticker);
# the x-axis is whatever index the AAPL frame carries at this point.
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(historical_data['AAPL']['Close'], label='AAPL Close Price')
ax.set_title('AAPL Historical Stock Prices')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
plt.show()
# Simple moving average (SMA) of the AAPL close, drawn over the raw price
window_size = 30
# aapl_data is the same object as historical_data['AAPL'] (no copy), so the
# 'SMA' column added below also appears on the frame stored in the dict.
aapl_data = historical_data['AAPL']
close_prices = aapl_data['Close']
# rolling() needs a full window, so the first window_size - 1 values are NaN.
aapl_data['SMA'] = close_prices.rolling(window=window_size).mean()

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(aapl_data['Close'], label='AAPL Close Price')
ax.plot(aapl_data['SMA'], label=f'{window_size}-Day SMA', color='orange')
ax.set_title('AAPL Stock Price and Moving Average')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
plt.show()
# Day-over-day fractional change of the AAPL close (the first row is NaN)
aapl_data['Daily Return'] = aapl_data['Close'].pct_change()

# Correlation between the price level and the daily return, shown as a heatmap
# with the colour scale pinned to the full [-1, 1] correlation range
corr_columns = ['Close', 'Daily Return']
correlation_matrix = aapl_data[corr_columns].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('AAPL Correlation Matrix')
plt.show()
