In [2]:
import pandas as pd

# Define stock dataset info
stock_dataset_info = [
    ("AAPL", "../src/AAPL_historical_data.csv"),
    ("AMZN", "../src/AMZN_historical_data.csv"),
    ("GOOG", "../src/GOOG_historical_data.csv"),
    ("META", "../src/META_historical_data.csv"),
    ("MSFT", "../src/MSFT_historical_data.csv"),
    ("NVDA", "../src/NVDA_historical_data.csv"),
    ("TSLA", "../src/TSLA_historical_data.csv")
]

# Load stock datasets into a dictionary
stock_dataframes = {}
for stock_symbol, file_path in stock_dataset_info:
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'])  # Convert date column
    df.set_index('Date', inplace=True)  # Set date as index
    stock_dataframes[stock_symbol] = df

# Load news headlines dataset
headlines_df = pd.read_csv('../src/raw_analyst_ratings.csv')  # Adjust the path as necessary

# Convert 'date' to datetime, coercing errors to NaT
headlines_df['date'] = pd.to_datetime(headlines_df['date'], errors='coerce')

# Drop rows where 'date' is NaT
headlines_df.dropna(subset=['date'], inplace=True)

# Set 'date' as the index
headlines_df.set_index('date', inplace=True)

headlines_df.head()

Unnamed: 0_level_0,Unnamed: 0,headline,url,publisher,stock
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-06-05 10:30:54-04:00,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,A
2020-06-03 10:45:20-04:00,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,A
2020-05-26 04:30:07-04:00,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,A
2020-05-22 12:45:06-04:00,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,A
2020-05-22 11:38:59-04:00,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,A


Data Alignment

In [7]:
# Create a dictionary to hold the aligned data for each stock
aligned_data = {}

# Resample news to daily frequency, aggregating headlines for each ticker
for stock_symbol, stock_file in stock_dataset_info:
    # Load the stock DataFrame
    stock_df = pd.read_csv(stock_file)  # Adjust the path as necessary
    stock_df['Date'] = pd.to_datetime(stock_df['Date'])  # Convert 'Date' to datetime
    stock_df.set_index('Date', inplace=True)  # Set 'Date' as index

    # Ensure stock_df is timezone-naive
    if stock_df.index.tz is not None:
        stock_df.index = stock_df.index.tz_localize(None)

    # Filter headlines for the current stock ticker
    ticker_headlines = headlines_df[headlines_df['headline'].str.contains(stock_symbol, na=False, case=False)]

    # Count the number of headlines for the current ticker
    num_headlines = ticker_headlines.shape[0]
    print(f"Total headlines for {stock_symbol}: {num_headlines}")

    # Resample the filtered headlines to daily frequency, aggregating
    news_daily = ticker_headlines.resample('D').agg({'headline': ' '.join})

    # Drop rows where there are no headlines
    news_daily.dropna(subset=['headline'], inplace=True)

    # Ensure news_daily is timezone-naive
    if news_daily.index.tz is not None:
        news_daily.index = news_daily.index.tz_localize(None)

    # Merge stock DataFrame with news DataFrame
    merged_df = pd.merge(stock_df, news_daily, left_index=True, right_index=True, how='inner')
    aligned_data[stock_symbol] = merged_df

    # Display the headlines for the current stock symbol
    print("")
    print(f"Aligned data for {stock_symbol}:")
    print("")
    print(news_daily.head(100))
    print("")


Total headlines for AAPL: 16

Aligned data for AAPL:

                                                     headline
date                                                         
2012-02-24  ETF Showdown: Grasping For Growth (VUG, IVW, A...
2012-02-25                                                   
2012-02-26                                                   
2012-02-27  More Tech ETFs That Don't Need Apple (AAPL, PX...
2012-02-28                                                   
...                                                       ...
2012-05-29                                                   
2012-05-30                                                   
2012-05-31                                                   
2012-06-01                                                   
2012-06-02                                                   

[100 rows x 1 columns]
Total headlines for AMZN: 2

Aligned data for AMZN:

                                                     headline
d

Sentiment Analysis


In [8]:
from textblob import TextBlob

# Function to get sentiment polarity
def get_sentiment(headline):
    return TextBlob(headline).sentiment.polarity  # Returns a value between -1 and 1

# Apply sentiment analysis to each aligned dataset
for stock_symbol, merged_df in aligned_data.items():
    merged_df['sentiment'] = merged_df['headline'].apply(get_sentiment)
    print(merged_df)

                 Open       High        Low      Close  Adj Close     Volume  \
2012-02-24  18.559643  18.674999  18.522858  18.657499  15.755797  415072000   
2012-02-27  18.618214  18.875000  18.438572  18.777143  15.856838  547582000   
2012-02-28  18.855715  19.121786  18.780357  19.121786  16.147873  600387200   
2012-02-29  19.341429  19.557501  19.132143  19.372856  16.359896  952011200   
2012-03-01  19.577499  19.578930  19.241785  19.445356  16.421124  683270000   
...               ...        ...        ...        ...        ...        ...   
2020-06-04  81.097504  81.404999  80.195000  80.580002  78.604675   87560400   
2020-06-05  80.837502  82.937500  80.807503  82.875000  80.843407  137250400   
2020-06-08  82.562500  83.400002  81.830002  83.364998  81.321381   95654400   
2020-06-09  83.035004  86.402496  83.002502  85.997498  83.889359  147712400   
2020-06-10  86.974998  88.692497  86.522499  88.209999  86.047615  166651600   

            Dividends  Stock Splits  \


Calculate Daily Stock Returns


In [9]:
# Calculate daily returns
for stock_symbol, merged_df in aligned_data.items():
    merged_df['daily_return'] = merged_df['Close'].pct_change()
    print(stock_symbol)
    print()
    print(merged_df)
    print()


AAPL

                 Open       High        Low      Close  Adj Close     Volume  \
2012-02-24  18.559643  18.674999  18.522858  18.657499  15.755797  415072000   
2012-02-27  18.618214  18.875000  18.438572  18.777143  15.856838  547582000   
2012-02-28  18.855715  19.121786  18.780357  19.121786  16.147873  600387200   
2012-02-29  19.341429  19.557501  19.132143  19.372856  16.359896  952011200   
2012-03-01  19.577499  19.578930  19.241785  19.445356  16.421124  683270000   
...               ...        ...        ...        ...        ...        ...   
2020-06-04  81.097504  81.404999  80.195000  80.580002  78.604675   87560400   
2020-06-05  80.837502  82.937500  80.807503  82.875000  80.843407  137250400   
2020-06-08  82.562500  83.400002  81.830002  83.364998  81.321381   95654400   
2020-06-09  83.035004  86.402496  83.002502  85.997498  83.889359  147712400   
2020-06-10  86.974998  88.692497  86.522499  88.209999  86.047615  166651600   

            Dividends  Stock Spli

Correlation Analysis

In [11]:
# Correlation analysis
for stock_symbol, merged_df in aligned_data.items():
    correlation_data = merged_df[['sentiment', 'daily_return']].dropna()  # Drop NaN values
    correlation = correlation_data.corr().iloc[0, 1]  # Correlation between sentiment and daily return
    print(f'Correlation between news sentiment and stock returns for {stock_symbol}: {correlation}')

Correlation between news sentiment and stock returns for AAPL: 0.03634511736782484
Correlation between news sentiment and stock returns for AMZN: nan
Correlation between news sentiment and stock returns for GOOG: 0.05368620966541132
Correlation between news sentiment and stock returns for META: -0.008287498512342863
Correlation between news sentiment and stock returns for MSFT: nan
Correlation between news sentiment and stock returns for NVDA: nan
Correlation between news sentiment and stock returns for TSLA: 0.005869637172693952
