# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# File paths
# Every input CSV sits in the same directory, so that directory is written
# once and each file name is appended to it.
DATA_DIR = "C:\\Users\\hayyu.ragea\\AppData\\Local\\Programs\\Python\\Python312\\AI_Mastery_Week_1\\data"

# CSV that is loaded as the news dataset.
RAW_DATA_PATH = DATA_DIR + "\\raw_analyst_ratings.csv"

# Ticker symbol -> path of that ticker's "<TICKER>_historical_data.csv" file.
HISTORICAL_DATA_PATHS = {
    ticker: DATA_DIR + "\\" + ticker + "_historical_data.csv"
    for ticker in ("AAPL", "AMZN", "GOOG", "META", "MSFT", "NVDA", "TSLA")
}

# Load data: one DataFrame for the news CSV and one per ticker price CSV,
# the latter keyed by ticker symbol.
news_data = pd.read_csv(RAW_DATA_PATH)
stock_data = {
    symbol: pd.read_csv(csv_path)
    for symbol, csv_path in HISTORICAL_DATA_PATHS.items()
}


def _show(heading, content):
    """Print *heading* and *content*, followed by a newline string."""
    print(heading, content, "\n")


# Display and inspect data: first rows of the news frame and the AAPL frame.
_show("News Data Sample:\n", news_data.head())
_show("Stock Data Sample for AAPL:\n", stock_data["AAPL"].head())

# Summary statistics (include='all' also summarises non-numeric news columns).
_show("News Data Summary:\n", news_data.describe(include='all'))
_show("Stock Data Summary for AAPL:\n", stock_data["AAPL"].describe())

# Data quality checks: number of missing values per column.
_show("News Data Missing Values:\n", news_data.isnull().sum())
_show("Stock Data Missing Values for AAPL:\n", stock_data["AAPL"].isnull().sum())

# Time Series Analysis
# Parse the news timestamps in place; the correlation section below relies on
# this column having a datetime dtype (it uses the `.dt` accessor).
news_data['date'] = pd.to_datetime(news_data['date'])

# Count articles per calendar day.  Counting the raw timestamps instead would
# create one bucket per distinct instant, so articles published at different
# times on the same day would never be added together.  normalize() resets the
# time of day to midnight and is a no-op for values that are already dates.
articles_per_day = news_data['date'].dt.normalize().value_counts().sort_index()

plt.figure(figsize=(14, 7))
articles_per_day.plot()
plt.title("Number of News Articles Over Time")
plt.xlabel("Date")
plt.ylabel("Number of Articles")
plt.grid()
plt.show()

# Parse the AAPL dates in place as well, then plot the closing price.
stock_data["AAPL"]['Date'] = pd.to_datetime(stock_data["AAPL"]['Date'])
plt.figure(figsize=(14, 7))
plt.plot(stock_data["AAPL"]['Date'], stock_data["AAPL"]['Close'], label='Close Price')
plt.title("AAPL Stock Closing Price Over Time")
plt.xlabel("Date")
plt.ylabel("Close Price")
plt.legend()
plt.grid()
plt.show()

# Correlation Analysis
# Reduce both date columns to plain calendar dates so they can be used as the
# join key between news rows and price rows.
news_data['Date'] = news_data['date'].dt.date
stock_data["AAPL"]['Date'] = stock_data["AAPL"]['Date'].dt.date

# Compute the daily return on the price table itself, in date order, BEFORE
# joining.  After the join there is one row per news article, so pct_change()
# on the merged frame would compare adjacent article rows (0 for articles on
# the same day) instead of consecutive trading days.
aapl_daily = stock_data["AAPL"][['Date', 'Close']].sort_values('Date')
aapl_daily = aapl_daily.assign(Daily_Return=aapl_daily['Close'].pct_change())

# NOTE(review): assumes news_data already has a numeric 'Sentiment' column;
# nothing in the visible code creates it -- confirm it exists in the CSV or is
# computed elsewhere.
# NOTE(review): the join keeps one row per article, so days with many articles
# weigh more in the correlation; aggregate sentiment per day if that is not
# intended.
merged_data = pd.merge(news_data[['Date', 'Sentiment']], aapl_daily, on='Date', how='inner')

# The first trading day has no previous close (NaN return); drop such rows
# along with any other incomplete ones.
merged_data = merged_data.dropna()
correlation = merged_data[['Sentiment', 'Daily_Return']].corr().iloc[0, 1]
print("Correlation between Sentiment and Daily Returns:", correlation, "\n")

# Visualization of Correlations
# Pairwise correlation matrix of the two columns, drawn as an annotated
# heatmap with two-decimal cell labels.
pair_columns = ['Sentiment', 'Daily_Return']
correlation_matrix = merged_data[pair_columns].corr()

heatmap_options = dict(annot=True, cmap='coolwarm', fmt='.2f')
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Correlation Heatmap")
plt.show()
