# Stock Modeling using FinBERT Sentiments

In [1]:
# import libraries
import os
from google.colab import userdata
from google.colab import output
import pandas as pd
import numpy as np
import yfinance as yf

In [2]:
#Delete repo clone if needed
#!rm -rf /content/CVX_Rice_project/

In [3]:
# Import github token with google secrets thingy and clone git repository
GITHUB_TOKEN = userdata.get('github')
os.environ['GITHUB_TOKEN'] = GITHUB_TOKEN
!git clone https://{GITHUB_TOKEN}@github.com/Kussil/Financial_Sentiment_LLM.git

fatal: destination path 'Financial_Sentiment_LLM' already exists and is not an empty directory.


In [4]:
# Read in outputs from FINBERT Sentiment Analysis
df_chunk = pd.read_csv('/content/Financial_Sentiment_LLM/03_Sentiment_Analysis/sentiment_chunkdata.csv')
df_chunk = df_chunk.fillna('')
df = df_chunk.groupby(['Article Index','Ticker','Source','Date','Article Headline','URL'])[['Neutral', 'Positive', 'Negative']].mean().reset_index()
print(df.shape)
display(df.head())
print(df.isna().sum())

(8604, 9)


Unnamed: 0,Article Index,Ticker,Source,Date,Article Headline,URL,Neutral,Positive,Negative
0,0,MRO,Investment Research,2024-05-16,Marathon Oil Corporation,,0.143564,0.413686,0.44275
1,1,EOG,Investment Research,2024-05-14,"EOG Resources, Inc.",,0.164021,0.042301,0.793678
2,2,EOG,Investment Research,2024-05-11,"EOG Resources, Inc.",,0.138396,0.089975,0.77163
3,3,DVN,Investment Research,2024-05-11,Devon Energy Corporation,,0.047985,0.468556,0.483459
4,4,COP,Investment Research,2024-05-07,ConocoPhillips,,0.266492,0.181206,0.552303


Article Index       0
Ticker              0
Source              0
Date                0
Article Headline    0
URL                 0
Neutral             0
Positive            0
Negative            0
dtype: int64


In [5]:
#Remove non stock Ticker values
df.drop(df[df['Ticker'] == 'BP.'].index, inplace=True)
df.drop(df[df['Ticker'] == 'Concho Resources Inc.'].index, inplace=True)
df.drop(df[df['Ticker'] == 'PDCE'].index, inplace=True)
df.drop(df[df['Ticker'] == 'TTL'].index, inplace=True)
df.drop(df[df['Ticker'] == 'CXO'].index, inplace=True)
ticker_list = df['Ticker'].unique().tolist()
print(ticker_list)

['MRO', 'EOG', 'DVN', 'COP', 'PXD', 'CVX', 'MPC', 'HES', 'PSX', 'XOM', 'SHEL', 'BP', 'OXY', 'VLO', 'TTE', 'EQNR']


In [6]:
#Identify minimum article date for stock price download
df['Date'] = pd.to_datetime(df['Date'])
print(df['Date'].dtype)

min_date = df['Date'].min()
print(min_date)

datetime64[ns]
2019-02-01 00:00:00


In [7]:
#Pull YahooFinance stock data to get Returns (ret)
ticker = ticker_list #ticker_list
price = yf.download(ticker, start=min_date)["Adj Close"]
ret = price.pct_change().dropna()
ret.name = "ret"
print(ret.shape)
print()
display(ret.head())
print()
display(ret.describe())

[*********************100%%**********************]  16 of 16 completed


(1360, 16)



Ticker,BP,COP,CVX,DVN,EOG,EQNR,HES,MPC,MRO,OXY,PSX,PXD,SHEL,TTE,VLO,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-02-04,0.00121,0.0182,0.011574,0.005553,0.005362,0.013467,0.00018,0.006721,0.009375,-0.004409,0.005838,0.010909,0.009071,0.003616,-0.005512,-0.014489
2019-02-05,0.034549,-0.009581,-0.002171,-0.011046,-0.010364,0.006001,-0.022633,0.019269,-0.013003,-0.006496,0.004327,-0.006779,0.004258,0.006846,0.013799,0.010292
2019-02-06,0.005138,-0.002166,-0.005022,-0.002978,-0.022064,-0.01193,0.009741,-0.004615,-0.001882,-0.009955,-0.007881,-0.012536,0.002512,-0.004831,-0.000465,-0.004366
2019-02-07,-0.0079,-0.023585,-0.005468,-0.055265,-0.033167,-0.026736,-0.046232,-0.047405,-0.040226,-0.025664,-0.028384,-0.035264,-0.015664,-0.019058,-0.023394,-0.007707
2019-02-08,-0.004918,-0.013621,-0.005498,-0.01502,-0.005269,0.001329,0.001336,-0.015385,-0.009168,0.000616,0.019403,-0.007895,0.000955,0.010997,0.002741,0.001625





Ticker,BP,COP,CVX,DVN,EOG,EQNR,HES,MPC,MRO,OXY,PSX,PXD,SHEL,TTE,VLO,XOM
count,1360.0,1360.0,1360.0,1360.0,1360.0,1360.0,1360.0,1360.0,1360.0,1360.0,1360.0,1360.0,1360.0,1360.0,1360.0,1360.0
mean,0.000385,0.00089,0.000619,0.001301,0.00079,0.000705,0.001248,0.001289,0.001168,0.00082,0.000801,0.001083,0.000534,0.000617,0.00106,0.000727
std,0.022961,0.027255,0.021691,0.036381,0.029427,0.023942,0.03043,0.029187,0.03613,0.038479,0.025919,0.028909,0.022619,0.021105,0.029648,0.020978
min,-0.19104,-0.2484,-0.221248,-0.373972,-0.320072,-0.210562,-0.336685,-0.270089,-0.468521,-0.520138,-0.158658,-0.369197,-0.171722,-0.178209,-0.192209,-0.122248
25%,-0.010005,-0.011836,-0.00852,-0.016577,-0.014015,-0.011482,-0.014394,-0.012127,-0.016939,-0.015045,-0.010644,-0.013211,-0.009721,-0.009777,-0.012708,-0.010325
50%,0.0,-0.000113,0.000738,0.000554,-7.9e-05,0.0,0.00108,0.001804,0.000642,-0.000169,0.000785,0.0,0.00054,0.001353,0.000464,0.000357
75%,0.009781,0.014049,0.009501,0.017571,0.01455,0.013576,0.01514,0.014862,0.017434,0.015125,0.013641,0.014033,0.010657,0.010434,0.015726,0.011262
max,0.216053,0.252139,0.227407,0.210721,0.165703,0.133042,0.203154,0.206286,0.232445,0.336977,0.221722,0.204343,0.196795,0.152756,0.312025,0.126868


In [8]:
# Get Sentiment based on max score
df_sentiment = df.copy()
df_sentiment['Sentiment'] = df_sentiment[['Neutral', 'Positive', 'Negative']].idxmax(axis=1)
display(df_sentiment.head())

Unnamed: 0,Article Index,Ticker,Source,Date,Article Headline,URL,Neutral,Positive,Negative,Sentiment
0,0,MRO,Investment Research,2024-05-16,Marathon Oil Corporation,,0.143564,0.413686,0.44275,Negative
1,1,EOG,Investment Research,2024-05-14,"EOG Resources, Inc.",,0.164021,0.042301,0.793678,Negative
2,2,EOG,Investment Research,2024-05-11,"EOG Resources, Inc.",,0.138396,0.089975,0.77163,Negative
3,3,DVN,Investment Research,2024-05-11,Devon Energy Corporation,,0.047985,0.468556,0.483459,Negative
4,4,COP,Investment Research,2024-05-07,ConocoPhillips,,0.266492,0.181206,0.552303,Negative


In [9]:
# reshape ret same day
ret_stack = ret.stack().reset_index()
ret_stack.columns = ['Date', 'Ticker', 'Returns']
ret_stack['Date'] = pd.to_datetime(ret_stack['Date'])
print(ret_stack['Date'].dtype)
display(ret_stack.head())

datetime64[ns]


Unnamed: 0,Date,Ticker,Returns
0,2019-02-04,BP,0.00121
1,2019-02-04,COP,0.0182
2,2019-02-04,CVX,0.011574
3,2019-02-04,DVN,0.005553
4,2019-02-04,EOG,0.005362


In [10]:
# merge in returns for the day from YahooFinance
df_merged = pd.merge(df_sentiment, ret_stack, on=['Ticker', 'Date'], how='left')
df_merged = df_merged.sort_values(['Ticker', 'Date'])
display(df_merged.head())
print(df_merged.isna().sum())

Unnamed: 0,Article Index,Ticker,Source,Date,Article Headline,URL,Neutral,Positive,Negative,Sentiment,Returns
4442,4890,BP,Investment Research,2019-02-20,Reaffirming BUY and $52 price target,,0.360758,0.294953,0.344289,Neutral,0.00236
4434,4882,BP,Investment Research,2019-05-06,Reiterating BUY and $52 price target,,0.037869,0.626881,0.335249,Positive,-0.003482
6951,7399,BP,ProQuest,2019-05-22,BP bosses get a public grilling on climate fro...,https://www.proquest.com/newspapers/bp-bosses-...,0.087031,0.219284,0.693685,Negative,-0.009591
6952,7400,BP,ProQuest,2019-05-22,BP bosses get public grilling on climate from ...,https://www.proquest.com/newspapers/bp-bosses-...,0.090433,0.196855,0.712712,Negative,-0.009591
6950,7398,BP,ProQuest,2019-05-23,Saudi Aramco starts fuel trading from UAE with...,https://www.proquest.com/newspapers/saudi-aram...,0.181392,0.017759,0.800849,Negative,-0.023618


Article Index          0
Ticker                 0
Source                 0
Date                   0
Article Headline       0
URL                    0
Neutral                0
Positive               0
Negative               0
Sentiment              0
Returns             3930
dtype: int64


In [11]:
# Check out some rows with NA's
df_with_nas = df_merged[df_merged.isna().any(axis=1)]
df_with_nas['DayOfWeek'] = df_with_nas['Date'].dt.day_name()
display(df_with_nas.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_nas['DayOfWeek'] = df_with_nas['Date'].dt.day_name()


Unnamed: 0,Article Index,Ticker,Source,Date,Article Headline,URL,Neutral,Positive,Negative,Sentiment,Returns,DayOfWeek
6945,7393,BP,ProQuest,2019-06-22,BP PLC,https://www.proquest.com/newspapers/bp-plc/doc...,0.059458,0.031321,0.909221,Negative,,Saturday
6942,7390,BP,ProQuest,2019-07-13,'The best pension investment person on planet ...,https://www.proquest.com/newspapers/best-pensi...,0.140484,0.135576,0.72394,Negative,,Saturday
6943,7391,BP,ProQuest,2019-07-13,The Best pension investment person on planet e...,https://www.proquest.com/newspapers/best-pensi...,0.143789,0.135606,0.720605,Negative,,Saturday
6927,7375,BP,ProQuest,2019-08-17,Reliance Industries rating: Buy; deal to allay...,https://www.proquest.com/newspapers/reliance-i...,0.351313,0.00994,0.638747,Negative,,Saturday
6916,7364,BP,ProQuest,2019-09-01,Markets Cover Story: Reliance Great Gamble Wil...,https://www.proquest.com/newspapers/markets-co...,0.486902,0.107237,0.405861,Neutral,,Sunday


In [12]:
# Fill NaN values with the next day's value within the same 'Ticker'
df_merged['Returns'] = df_merged.groupby('Ticker')['Returns'].fillna(method='bfill')
display(df_merged.head())
print(df_merged.isna().sum())

Unnamed: 0,Article Index,Ticker,Source,Date,Article Headline,URL,Neutral,Positive,Negative,Sentiment,Returns
4442,4890,BP,Investment Research,2019-02-20,Reaffirming BUY and $52 price target,,0.360758,0.294953,0.344289,Neutral,0.00236
4434,4882,BP,Investment Research,2019-05-06,Reiterating BUY and $52 price target,,0.037869,0.626881,0.335249,Positive,-0.003482
6951,7399,BP,ProQuest,2019-05-22,BP bosses get a public grilling on climate fro...,https://www.proquest.com/newspapers/bp-bosses-...,0.087031,0.219284,0.693685,Negative,-0.009591
6952,7400,BP,ProQuest,2019-05-22,BP bosses get public grilling on climate from ...,https://www.proquest.com/newspapers/bp-bosses-...,0.090433,0.196855,0.712712,Negative,-0.009591
6950,7398,BP,ProQuest,2019-05-23,Saudi Aramco starts fuel trading from UAE with...,https://www.proquest.com/newspapers/saudi-aram...,0.181392,0.017759,0.800849,Negative,-0.023618


Article Index       0
Ticker              0
Source              0
Date                0
Article Headline    0
URL                 0
Neutral             0
Positive            0
Negative            0
Sentiment           0
Returns             4
dtype: int64


In [13]:
# Drop last 4 Nas
df_merged = df_merged.dropna()
print(df_merged.isna().sum())

Article Index       0
Ticker              0
Source              0
Date                0
Article Headline    0
URL                 0
Neutral             0
Positive            0
Negative            0
Sentiment           0
Returns             0
dtype: int64


In [15]:
# Save csv locally
df_merged.to_csv('01_FinBERT_Prepped_Stock_Data.csv', index=False)