## Correlation between news and stock movement


In [1]:
import os

import pandas as pd

# The project package `scripts/` (imported below) and the relative
# `datasets/...` path used later are resolved from the working directory,
# which is presumably the repository root one level above this notebook.
# Only move up when `scripts/` is not already visible, so that re-running this
# cell does not climb one more directory each time.
if not os.path.isdir("scripts"):
    os.chdir("..")  # set the working directory one level up

# These imports must come after the chdir: they rely on `scripts/` being
# reachable from the current working directory.
from scripts.financial_analyzer import FinancialAnalyzer
from scripts.news_analyzer import NewsAnalyzer

# Shared helper instances used by every cell below
f_analyzer = FinancialAnalyzer()
n_analyzer = NewsAnalyzer()

### Load all stock data and add a `Stock` (ticker) column

In [2]:
# Ticker symbols whose price history is loaded; the combined frame carries a
# `Stock` column identifying the ticker of each row.
tickers = "AAPL AMZN GOOG META MSFT NVDA TSLA".split()
stock_data = f_analyzer.load_datas(tickers)
stock_data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,Stock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1980-12-12,0.128348,0.128906,0.128348,0.128348,0.098943,469033600,0.0,0.0,AAPL
1980-12-15,0.122210,0.122210,0.121652,0.121652,0.093781,175884800,0.0,0.0,AAPL
1980-12-16,0.113281,0.113281,0.112723,0.112723,0.086898,105728000,0.0,0.0,AAPL
1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089049,86441600,0.0,0.0,AAPL
1980-12-18,0.118862,0.119420,0.118862,0.118862,0.091630,73449600,0.0,0.0,AAPL
...,...,...,...,...,...,...,...,...,...
2024-07-24,225.419998,225.990005,214.710007,215.990005,215.990005,167942900,0.0,0.0,TSLA
2024-07-25,216.800003,226.000000,216.229996,220.250000,220.250000,100636500,0.0,0.0,TSLA
2024-07-26,221.190002,222.279999,215.330002,219.800003,219.800003,94604100,0.0,0.0,TSLA
2024-07-29,224.899994,234.270004,224.699997,232.100006,232.100006,129201800,0.0,0.0,TSLA


### Load normalized `news data`

In [3]:
# Load the analyst-ratings headlines and discard the leftover CSV index column
# (`Unnamed: 0`) when present. `errors="ignore"` makes the drop a no-op if the
# column does not exist. The non-mutating `drop` returns a new frame, so the
# object handed back by the loader is never modified in place.
# NOTE(review): `isPath=True` presumably tells the loader that the first
# argument is a file path rather than a ticker symbol; confirm in FinancialAnalyzer.
news_data = f_analyzer.load_data(
    "datasets/raw_analyst_ratings.csv", isPath=True
).drop(columns="Unnamed: 0", errors="ignore")
news_data.head()

Unnamed: 0_level_0,headline,url,publisher,Stock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06-05,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,A
2020-06-03,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,A
2020-05-26,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,A
2020-05-22,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,A
2020-05-22,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,A


## `Descriptive` statistics of the news data

In [4]:
# Summary of the news columns. All four columns are text, so the table reports
# count / unique / top / freq rather than numeric statistics.
news_data.describe()

Unnamed: 0,headline,url,publisher,Stock
count,1407328,1407328,1407328,1407328
unique,845770,883429,1034,6204
top,Benzinga's Top Upgrades,https://www.benzinga.com/news/20/03/15538835/s...,Paul Quintaro,MRK
freq,5449,1704,228373,3333


## `Sentiment`: score each headline, so that the average daily sentiment scores per stock can be aggregated later


In [5]:
# Score every headline. The returned frame carries three extra columns:
# `Sentiment Scores` (dict of neg/neu/pos/compound values), `Sentiment`
# (a label such as neutral/positive) and `Compound Score` (a single float).
# NOTE(review): `news_data` is rebound to the helper's result; whether
# `sentiment` also mutates its argument in place is not visible here, so
# re-running this cell may or may not be idempotent. Confirm in NewsAnalyzer.
news_data = n_analyzer.sentiment(news_data)
news_data.head()

Unnamed: 0_level_0,headline,url,publisher,Stock,Sentiment Scores,Sentiment,Compound Score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-06-05,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,A,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral,0.0
2020-06-03,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,A,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral,0.0
2020-05-26,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,A,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral,0.0
2020-05-22,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,A,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral,0.0
2020-05-22,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,A,"{'neg': 0.0, 'neu': 0.833, 'pos': 0.167, 'comp...",positive,0.296


## Daily Stock `Returns`:


In [6]:
# Daily percentage return of the closing price, computed separately per ticker.
# `stock_data` stacks all tickers in one frame, so a plain `pct_change()` would
# compare the first row of each ticker with the last close of the previous
# ticker and produce a meaningless return at every boundary. Grouping by
# `Stock` leaves the first row of each ticker as NaN instead.
# `.to_numpy()` assigns by position (groupby keeps the original row order),
# which avoids label alignment on the `Date` index, where the same date
# repeats across tickers.
stock_data["Return"] = (
    stock_data.groupby("Stock", sort=False)["Close"].pct_change().mul(100).to_numpy()
)
stock_data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,Stock,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1980-12-12,0.128348,0.128906,0.128348,0.128348,0.098943,469033600,0.0,0.0,AAPL,
1980-12-15,0.122210,0.122210,0.121652,0.121652,0.093781,175884800,0.0,0.0,AAPL,-5.217061
1980-12-16,0.113281,0.113281,0.112723,0.112723,0.086898,105728000,0.0,0.0,AAPL,-7.339788
1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089049,86441600,0.0,0.0,AAPL,2.475091
1980-12-18,0.118862,0.119420,0.118862,0.118862,0.091630,73449600,0.0,0.0,AAPL,2.899246
...,...,...,...,...,...,...,...,...,...,...
2024-07-24,225.419998,225.990005,214.710007,215.990005,215.990005,167942900,0.0,0.0,TSLA,-12.334605
2024-07-25,216.800003,226.000000,216.229996,220.250000,220.250000,100636500,0.0,0.0,TSLA,1.972311
2024-07-26,221.190002,222.279999,215.330002,219.800003,219.800003,94604100,0.0,0.0,TSLA,-0.204312
2024-07-29,224.899994,234.270004,224.699997,232.100006,232.100006,129201800,0.0,0.0,TSLA,5.595998


## Merge `news_data` and `stock_data` on the `Date` and `Stock` columns

### so that sentiment and returns can be analyzed for each stock

In [9]:
# Aggregate sentiment: mean compound score per (Date, Stock) pair, so each day
# contributes a single sentiment value per ticker.
daily_sentiment = (
    news_data.groupby(["Date", "Stock"])["Compound Score"].mean().reset_index()
)

# Expose `Date` as a regular column so it can be used as a merge key.
# The guard keeps this cell idempotent: an unconditional `reset_index()` on a
# second run would add a stray `index` column that then leaks into `df_merged`.
if "Date" not in stock_data.columns:
    stock_data = stock_data.reset_index()

# Ensure the 'Date' columns are in datetime format on both sides of the join
stock_data["Date"] = pd.to_datetime(stock_data["Date"])
daily_sentiment["Date"] = pd.to_datetime(daily_sentiment["Date"])

# Inner join: keep only the (Date, Stock) pairs that have both a price row and
# at least one scored headline.
df_merged = pd.merge(stock_data, daily_sentiment, on=["Date", "Stock"], how="inner")
df_merged

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,Stock,Return,Compound Score
0,2020-03-09,65.937500,69.522499,65.750000,66.542503,64.736046,286744800,0.0,0.0,AAPL,-7.909209,-0.302067
1,2020-03-10,69.285004,71.610001,67.342499,71.334999,69.398438,285290000,0.0,0.0,AAPL,7.202157,-0.090787
2,2020-03-11,69.347504,70.305000,67.964996,68.857498,66.988190,255598800,0.0,0.0,AAPL,-3.473051,-0.023850
3,2020-03-12,63.985001,67.500000,62.000000,62.057499,60.372791,418474000,0.0,0.0,AAPL,-9.875467,-0.078360
4,2020-03-13,66.222504,69.980003,63.237499,69.492500,67.605965,370732000,0.0,0.0,AAPL,11.980827,-0.059727
...,...,...,...,...,...,...,...,...,...,...,...,...
1791,2020-05-28,54.234001,54.983334,53.445999,53.720669,53.720669,108834000,0.0,0.0,TSLA,-1.758038,0.179271
1792,2020-05-29,53.916668,55.666668,53.613998,55.666668,55.666668,176467500,0.0,0.0,TSLA,3.622440,0.050575
1793,2020-06-01,57.200001,59.933334,56.939999,59.873333,59.873333,224092500,0.0,0.0,TSLA,7.556883,0.253208
1794,2020-06-02,59.646667,60.577332,58.066666,58.770668,58.770668,203484000,0.0,0.0,TSLA,-1.841663,0.184029


In [None]:
# Quick look at the column labels of both frames (useful for checking the
# merge keys `Date` and `Stock`).
column_report = {
    "Columns in stock_data:": stock_data.columns,
    "Columns in news_data:": news_data.columns,
}
for heading, columns in column_report.items():
    print(heading, columns)

In [12]:
# Correlation (pandas' default method, Pearson) between the daily percentage
# return and the mean daily compound sentiment score, pooled over all tickers
# in `df_merged`.
daily_returns = df_merged["Return"]
daily_scores = df_merged["Compound Score"]
correlation = daily_returns.corr(daily_scores)

print(f"Correlation between daily returns and sentiment scores: {correlation:.2f}")

Correlation between daily returns and sentiment scores: 0.19
