### Data Alignment

In [52]:
import pandas as pd
import os

NEWS_FILE = '../data/raw_analyst_ratings.csv'
BASE_PATH = '../data/yfinance_data/'
STOCK_FILES = {
    'AAPL': 'AAPL_historical_data.csv',
    'AMZN': 'AMZN_historical_data.csv',
    'GOOG': 'GOOG_historical_data.csv',
    'META': 'META_historical_data.csv',
    'MSFT': 'MSFT_historical_data.csv',
    'NVDA': 'NVDA_historical_data.csv',
    'TSLA': 'TSLA_historical_data.csv'
}

news_data = pd.read_csv(NEWS_FILE)
stock_data_list = []
for ticker, file in STOCK_FILES.items():
    file_path = os.path.join(BASE_PATH, file)
    if os.path.exists(file_path):
        data = pd.read_csv(file_path)
        data['Stock'] = ticker
        stock_data_list.append(data)
    else:
        print(f"File not found: {file_path}")
stock_data = pd.concat(stock_data_list, ignore_index=True)

news_data['date'] = pd.to_datetime(news_data['date'], format='mixed', utc=True)
news_data['date'] = news_data['date'].dt.tz_convert('America/New_York').dt.date
news_data.rename(columns={'date': 'Date', 'stock': 'Stock'}, inplace=True)
stock_data['Date'] = pd.to_datetime(stock_data['Date']).dt.date

news_data = news_data[news_data['Stock'].isin(STOCK_FILES.keys())]
min_date = max(news_data['Date'].min(), stock_data['Date'].min())
max_date = min(news_data['Date'].max(), stock_data['Date'].max())
news_data = news_data[(news_data['Date'] >= min_date) & (news_data['Date'] <= max_date)]
stock_data = stock_data[(stock_data['Date'] >= min_date) & (stock_data['Date'] <= max_date)]
merged_data = pd.merge(news_data, stock_data, on=['Date', 'Stock'], how='left')
merged_data['Close'] = merged_data.groupby('Stock')['Close'].ffill()
merged_data = merged_data.dropna(subset=['Close'])
merged_data.to_csv('../data/merged_news_stock.csv', index=False)

### Sentiment Analysis

In [53]:
from textblob import TextBlob

news_data = pd.read_csv('../data/merged_news_stock.csv')

def get_sentiment(headline):
    return TextBlob(str(headline)).sentiment.polarity

news_data['Sentiment'] = news_data['headline'].apply(get_sentiment)
news_data.to_csv('../data/news_with_sentiment.csv', index=False)

### Calculate Daily Stock Returns

In [55]:
stock_data = pd.read_csv('../data/merged_news_stock.csv')
stock_data = stock_data.sort_values(['Stock', 'Date'])
stock_data['Daily_Return'] = stock_data.groupby('Stock')['Close'].pct_change() * 100
stock_data = stock_data.dropna(subset=['Daily_Return'])
stock_data.to_csv('../data/stock_with_returns.csv', index=False)
new_data = pd.read_csv('../data/stock_with_returns.csv')
news_data.head()

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,Date,Stock,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,Sentiment
0,7120,Tech Stocks And FAANGS Strong Again To Start D...,https://www.benzinga.com/government/20/06/1622...,JJ Kinahan,2020-06-10,AAPL,86.974998,88.692497,86.522499,88.209999,86.047615,166651600.0,0.0,0.0,0.433333
1,7121,10 Biggest Price Target Changes For Wednesday,https://www.benzinga.com/analyst-ratings/price...,Lisa Levin,2020-06-10,AAPL,86.974998,88.692497,86.522499,88.209999,86.047615,166651600.0,0.0,0.0,0.0
2,7122,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...",https://www.benzinga.com/short-sellers/20/06/1...,Benzinga Newsdesk,2020-06-10,AAPL,86.974998,88.692497,86.522499,88.209999,86.047615,166651600.0,0.0,0.0,0.5
3,7123,"Deutsche Bank Maintains Buy on Apple, Raises P...",https://www.benzinga.com/news/20/06/16219873/d...,Benzinga Newsdesk,2020-06-10,AAPL,86.974998,88.692497,86.522499,88.209999,86.047615,166651600.0,0.0,0.0,0.0
4,7124,Apple To Let Users Trade In Their Mac Computer...,https://www.benzinga.com/news/20/06/16218697/a...,Neer Varshney,2020-06-10,AAPL,86.974998,88.692497,86.522499,88.209999,86.047615,166651600.0,0.0,0.0,0.0
