### Data Alignment

In [52]:
import pandas as pd
import os

# Input locations (relative paths).
NEWS_FILE = '../data/raw_analyst_ratings.csv'
BASE_PATH = '../data/yfinance_data/'

# Ticker symbol -> name of that ticker's price-history CSV under BASE_PATH.
STOCK_FILES = {
    symbol: f'{symbol}_historical_data.csv'
    for symbol in ('AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA', 'TSLA')
}

# Load the news table as-is; its columns are normalized further down.
news_data = pd.read_csv(NEWS_FILE)

# Load every configured price file that exists on disk, tagging each row with
# its ticker so the per-ticker tables can be stacked into one frame.
stock_data_list = []
for ticker, file in STOCK_FILES.items():
    file_path = os.path.join(BASE_PATH, file)
    if os.path.exists(file_path):
        data = pd.read_csv(file_path)
        data['Stock'] = ticker
        stock_data_list.append(data)
    else:
        # Missing files are reported and skipped rather than aborting the run.
        print(f"File not found: {file_path}")

# pd.concat rejects an empty list with an opaque "No objects to concatenate";
# fail with the same exception type but a message that points at the cause.
if not stock_data_list:
    raise ValueError(
        f"No stock price files were found under {BASE_PATH!r}; "
        f"expected one of: {', '.join(STOCK_FILES.values())}"
    )
stock_data = pd.concat(stock_data_list, ignore_index=True)

# Parse the news timestamps (mixed string formats) as UTC, shift them to
# New York time, and keep only the calendar date.
# NOTE(review): with utc=True, timezone-naive strings are interpreted as UTC,
# so a naive midnight stamp lands on the previous New York day — confirm that
# this is what the news feed needs.
news_data['date'] = (
    pd.to_datetime(news_data['date'], format='mixed', utc=True)
    .dt.tz_convert('America/New_York')
    .dt.date
)

# Give the news table the same key column names as the price table.
news_data = news_data.rename(columns={'date': 'Date', 'stock': 'Stock'})

# Reduce the price timestamps to plain calendar dates as well, so both tables
# carry the same kind of value in 'Date'.
stock_data = stock_data.assign(Date=pd.to_datetime(stock_data['Date']).dt.date)

# Keep only news about tickers that have a configured price file.
news_data = news_data.loc[news_data['Stock'].isin(STOCK_FILES.keys())]

# Common window shared by both tables: latest start date to earliest end date.
min_date = max(news_data['Date'].min(), stock_data['Date'].min())
max_date = min(news_data['Date'].max(), stock_data['Date'].max())

# Trim both tables to that window (bounds inclusive).
news_data = news_data.loc[news_data['Date'].between(min_date, max_date)]
stock_data = stock_data.loc[stock_data['Date'].between(min_date, max_date)]
# Attach the same-day price row to every news item. News rows with no price
# row for that (Date, Stock) keep NaN in the price columns.
merged_data = pd.merge(news_data, stock_data, on=['Date', 'Stock'], how='left')

# Fill a missing Close from the most recent earlier news row of the same ticker
# that did get a price. ffill follows row order, and the merged frame inherits
# the news file's order, so the fill is computed on a date-sorted view;
# otherwise a value could be carried over from a later date. Assigning the
# result back aligns on the index, so merged_data keeps its row order.
merged_data['Close'] = (
    merged_data.sort_values('Date', kind='stable')
    .groupby('Stock')['Close']
    .ffill()
)

# Rows that still have no Close (nothing earlier to carry forward) are dropped.
merged_data = merged_data.dropna(subset=['Close'])
merged_data.to_csv('../data/merged_news_stock.csv', index=False)