# Stock Modeling using Gemini Sentiments

In [1]:
# import libraries
import os
from google.colab import userdata
from google.colab import output
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime

In [2]:
# Import github token with google secrets thingy and clone git repository
GITHUB_TOKEN = userdata.get('github')
os.environ['GITHUB_TOKEN'] = GITHUB_TOKEN
!git clone https://{GITHUB_TOKEN}@github.com/Kussil/Financial_Sentiment_LLM.git

Cloning into 'Financial_Sentiment_LLM'...
remote: Enumerating objects: 2340, done.[K
remote: Counting objects: 100% (745/745), done.[K
remote: Compressing objects: 100% (346/346), done.[K
remote: Total 2340 (delta 472), reused 647 (delta 386), pack-reused 1595[K
Receiving objects: 100% (2340/2340), 475.21 MiB | 17.08 MiB/s, done.
Resolving deltas: 100% (1555/1555), done.
Updating files: 100% (1170/1170), done.


In [3]:
# Read in outputs from Gemini Sentiment Analysis
gemini_prompt1_sentiment = pd.read_csv('/content/Financial_Sentiment_LLM/03_Sentiment_Analysis/Prompt2_Sentiment_Analysis_Results.csv')
gemini_prompt1_sentiment = gemini_prompt1_sentiment.fillna('')
print(gemini_prompt1_sentiment.shape)
display(gemini_prompt1_sentiment.head())

(10052, 11)


Unnamed: 0,Source,Unique_ID,Ticker,Date,URL,Finance,Production,Reserves / Exploration / Acquisitions / Mergers / Divestments,Environment / Regulatory / Geopolitics,Alternative Energy / Lower Carbon,Oil Price / Natural Gas Price / Gasoline Price
0,Investment Research,IR-1,MRO,5/16/2024,,Positive,Neutral,Positive,Negative,Neutral,Neutral
1,Investment Research,IR-2,EOG,5/14/2024,,Positive,Positive,Positive,Neutral,Positive,Neutral
2,Investment Research,IR-3,EOG,5/11/2024,,Positive,Positive,Positive,Neutral,Neutral,Neutral
3,Investment Research,IR-4,DVN,5/11/2024,,Positive,Positive,Negative,Neutral,Neutral,Neutral
4,Investment Research,IR-5,COP,5/7/2024,,Neutral,Neutral,Positive,Negative,Neutral,Positive


In [4]:
# Define a list of date formats
date_formats = ['%d/%m/%Y',  # 01/01/2021
                '%m/%d/%Y',  # 5/16/2024
                '%Y-%m-%d',  # 2021-02-28
                '%B %d, %Y',  # March 15, 2021
                '%b-%d-%Y',  # Apr-30-2019
                '%Y.%m.%d',  # 2021.04.10
                '%m-%d-%Y',  # 04-10-2021
                '%d-%b-%y',  # 17-Aug-22
                '%b-%y']  # Sep-19

# Function to parse dates with multiple formats
def parse_date(date_str):
    """
    Parse a date string using a list of possible date formats.

    This function attempts to parse a given date string using various date formats specified in `date_formats`.
    If the date string matches a format that only includes the month and year ('%b-%y'), it defaults the day to the first day of the month.

    Args:
        date_str (str): The date string to parse.

    Returns:
        datetime: The parsed date as a `datetime` object.

    Raises:
        ValueError: If the date string does not match any of the provided formats in `date_formats`.
    """
    for fmt in date_formats:
        try:
            # Handle special case where only month and year are given
            if fmt == '%b-%y':
                parsed_date = datetime.strptime(date_str, fmt)
                # Default to the first day of the month
                return parsed_date.replace(day=1)
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    raise ValueError(f"Date format not recognized: {date_str}")

# Apply the function to the 'Date' column
gemini_prompt1_sentiment['Date'] = gemini_prompt1_sentiment['Date'].apply(parse_date)

# Format the dates in a specific string format and convert to datetime
gemini_prompt1_sentiment['Date'] = gemini_prompt1_sentiment['Date'].dt.strftime('%Y-%m-%d')
gemini_prompt1_sentiment['Date'] = pd.to_datetime(gemini_prompt1_sentiment['Date'])

# Display the DataFrame
print(gemini_prompt1_sentiment.shape)
print(gemini_prompt1_sentiment['Date'].dtype)
display(gemini_prompt1_sentiment.head())
print(gemini_prompt1_sentiment.isna().sum())

(10052, 11)
datetime64[ns]


Unnamed: 0,Source,Unique_ID,Ticker,Date,URL,Finance,Production,Reserves / Exploration / Acquisitions / Mergers / Divestments,Environment / Regulatory / Geopolitics,Alternative Energy / Lower Carbon,Oil Price / Natural Gas Price / Gasoline Price
0,Investment Research,IR-1,MRO,2024-05-16,,Positive,Neutral,Positive,Negative,Neutral,Neutral
1,Investment Research,IR-2,EOG,2024-05-14,,Positive,Positive,Positive,Neutral,Positive,Neutral
2,Investment Research,IR-3,EOG,2024-11-05,,Positive,Positive,Positive,Neutral,Neutral,Neutral
3,Investment Research,IR-4,DVN,2024-11-05,,Positive,Positive,Negative,Neutral,Neutral,Neutral
4,Investment Research,IR-5,COP,2024-07-05,,Neutral,Neutral,Positive,Negative,Neutral,Positive


Source                                                           0
Unique_ID                                                        0
Ticker                                                           0
Date                                                             0
URL                                                              0
Finance                                                          0
Production                                                       0
Reserves / Exploration / Acquisitions / Mergers / Divestments    0
Environment / Regulatory / Geopolitics                           0
Alternative Energy / Lower Carbon                                0
Oil Price / Natural Gas Price / Gasoline Price                   0
dtype: int64


In [5]:
# Renaming the columns to simplify sentiment categories
gemini_prompt1_sentiment.rename(columns={
    'Reserves / Exploration / Acquisitions / Mergers / Divestments': 'Reserves/M&A',
    'Environment / Regulatory / Geopolitics': 'Regulatory',
    'Alternative Energy / Lower Carbon': 'Green Energy',
    'Oil Price / Natural Gas Price / Gasoline Price': 'OG Price'}
                                , inplace=True)

display(gemini_prompt1_sentiment.head())

Unnamed: 0,Source,Unique_ID,Ticker,Date,URL,Finance,Production,Reserves/M&A,Regulatory,Green Energy,OG Price
0,Investment Research,IR-1,MRO,2024-05-16,,Positive,Neutral,Positive,Negative,Neutral,Neutral
1,Investment Research,IR-2,EOG,2024-05-14,,Positive,Positive,Positive,Neutral,Positive,Neutral
2,Investment Research,IR-3,EOG,2024-11-05,,Positive,Positive,Positive,Neutral,Neutral,Neutral
3,Investment Research,IR-4,DVN,2024-11-05,,Positive,Positive,Negative,Neutral,Neutral,Neutral
4,Investment Research,IR-5,COP,2024-07-05,,Neutral,Neutral,Positive,Negative,Neutral,Positive


In [6]:
#Identify minimum article date for stock price download
min_date = gemini_prompt1_sentiment['Date'].min()
print(min_date)

2019-01-02 00:00:00


In [7]:
ticker_list = gemini_prompt1_sentiment['Ticker'].unique().tolist()
print(ticker_list)

['MRO', 'EOG', 'DVN', 'COP', 'PXD', 'PDCE', 'CXO', 'CVX', 'MPC', 'HES', 'PSX', 'XOM', 'SHEL', 'BP', 'OXY', 'VLO', 'TTE', 'EQNR']


In [8]:
#Pull YahooFinance stock data to get Returns (ret)
all_data = {}

for ticker in ticker_list:
    try:
        # Get the last date for the current ticker from gemini_prompt1_sentiment
        last_date = gemini_prompt1_sentiment[gemini_prompt1_sentiment['Ticker'] == ticker]['Date'].max()
        if pd.isna(last_date):
            print(f"No data available for {ticker} in gemini_prompt1_sentiment.")
            continue

        # Ensure the date is in the correct format
        last_date = pd.to_datetime(last_date).strftime('%Y-%m-%d')
        print(f"Downloading data for {ticker} from {min_date} to {last_date}")

        # Download stock data
        data = yf.download(ticker, start=min_date, end=last_date)["Adj Close"]

        if data.empty:
            print(f"No data retrieved for {ticker}")
        else:
            all_data[ticker] = data

    except Exception as e:
        print(f"Error downloading data for {ticker}: {e}")

# Combine all data into a single DataFrame
if all_data:
    price_df = pd.DataFrame(all_data)
    ret_df = price_df.pct_change().dropna()
    ret_df.name = "ret"
    ret_df.describe()
else:
    print("No data available.")

print()
print(ret_df.shape)
print()
display(ret_df.head())
print()
display(ret_df.describe())

Downloading data for MRO from 2019-01-02 00:00:00 to 2024-10-02


[*********************100%%**********************]  1 of 1 completed


Downloading data for EOG from 2019-01-02 00:00:00 to 2024-11-05


[*********************100%%**********************]  1 of 1 completed


Downloading data for DVN from 2019-01-02 00:00:00 to 2024-11-05


[*********************100%%**********************]  1 of 1 completed


Downloading data for COP from 2019-01-02 00:00:00 to 2024-12-02


[*********************100%%**********************]  1 of 1 completed


Downloading data for PXD from 2019-01-02 00:00:00 to 2024-10-02


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['PDCE']: YFInvalidPeriodError("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")


Downloading data for PDCE from 2019-01-02 00:00:00 to 2023-11-08
No data retrieved for PDCE
Downloading data for CXO from 2019-01-02 00:00:00 to 2021-10-02


[*********************100%%**********************]  1 of 1 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['CXO']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')


No data retrieved for CXO
Downloading data for CVX from 2019-01-02 00:00:00 to 2024-11-05


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Downloading data for MPC from 2019-01-02 00:00:00 to 2024-10-02
Downloading data for HES from 2019-01-02 00:00:00 to 2024-12-02


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Downloading data for PSX from 2019-01-02 00:00:00 to 2024-12-03
Downloading data for XOM from 2019-01-02 00:00:00 to 2024-11-05


[*********************100%%**********************]  1 of 1 completed


Downloading data for SHEL from 2019-01-02 00:00:00 to 2024-11-05


[*********************100%%**********************]  1 of 1 completed


Downloading data for BP from 2019-01-02 00:00:00 to 2024-11-05


[*********************100%%**********************]  1 of 1 completed


Downloading data for OXY from 2019-01-02 00:00:00 to 2024-12-03


[*********************100%%**********************]  1 of 1 completed


Downloading data for VLO from 2019-01-02 00:00:00 to 2024-10-02


[*********************100%%**********************]  1 of 1 completed


Downloading data for TTE from 2019-01-02 00:00:00 to 2024-12-02


[*********************100%%**********************]  1 of 1 completed


Downloading data for EQNR from 2019-01-02 00:00:00 to 2024-12-02


[*********************100%%**********************]  1 of 1 completed


(1382, 16)






Unnamed: 0_level_0,MRO,EOG,DVN,COP,PXD,CVX,MPC,HES,PSX,XOM,SHEL,BP,OXY,VLO,TTE,EQNR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-01-03,-0.006793,-0.007063,0.013356,-0.018933,0.003201,-0.019153,-0.014588,0.003551,-0.013845,-0.015353,-0.002191,0.005701,-0.012258,-0.017999,0.002273,0.002313
2019-01-04,0.054036,0.043904,0.044643,0.025249,0.033845,0.020724,0.049005,0.077848,0.045339,0.03687,0.022631,0.031435,0.032169,0.037871,0.029295,0.028611
2019-01-07,0.013627,0.006282,0.031746,-0.004392,0.021754,0.012994,0.009894,0.047932,0.006936,0.0052,0.001321,0.003248,0.020092,0.001039,-0.001836,0.008524
2019-01-08,-0.008963,0.010686,0.00789,0.013392,-0.000843,-0.004365,0.009798,0.012531,-0.000874,0.007271,-0.006103,-0.003237,0.007909,0.003503,-0.003127,0.004448
2019-01-09,0.038114,0.038421,0.010959,0.033116,0.025387,0.01342,0.021473,0.034653,0.006565,0.005275,0.011616,0.008244,0.012156,-0.000905,0.02196,0.015057





Unnamed: 0,MRO,EOG,DVN,COP,PXD,CVX,MPC,HES,PSX,XOM,SHEL,BP,OXY,VLO,TTE,EQNR
count,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0
mean,0.001213,0.000846,0.001396,0.000937,0.001113,0.000661,0.001342,0.001435,0.000839,0.000781,0.000575,0.000433,0.000875,0.00114,0.000661,0.000743
std,0.035919,0.029279,0.036186,0.02713,0.028736,0.021585,0.029031,0.03039,0.025775,0.020877,0.022499,0.022818,0.038211,0.029535,0.021005,0.023809
min,-0.468521,-0.320072,-0.373971,-0.248401,-0.369197,-0.221248,-0.270089,-0.336685,-0.158658,-0.122248,-0.171722,-0.19104,-0.520138,-0.192209,-0.178208,-0.210562
25%,-0.016742,-0.013894,-0.016483,-0.011787,-0.013105,-0.00845,-0.011961,-0.01413,-0.010578,-0.010177,-0.009655,-0.009927,-0.014841,-0.012676,-0.009731,-0.011335
50%,0.000642,6.2e-05,0.000841,0.0,2.5e-05,0.000746,0.001822,0.00113,0.00082,0.000487,0.000602,0.0,-7.9e-05,0.000521,0.001373,0.0
75%,0.017345,0.014434,0.017626,0.014061,0.013897,0.009527,0.014865,0.015279,0.013572,0.011219,0.010659,0.009766,0.015092,0.015778,0.010442,0.013551
max,0.232445,0.165703,0.210721,0.252138,0.204343,0.227407,0.206286,0.203153,0.221722,0.126868,0.196795,0.216053,0.336977,0.312025,0.152756,0.133043


In [9]:
# reshape ret same day
ret_stack = ret_df.stack().reset_index()
ret_stack.columns = ['Date', 'Ticker', 'Returns']
ret_stack['Date'] = pd.to_datetime(ret_stack['Date'])
print(ret_stack['Date'].dtype)
display(ret_stack.head())

datetime64[ns]


Unnamed: 0,Date,Ticker,Returns
0,2019-01-03,MRO,-0.006793
1,2019-01-03,EOG,-0.007063
2,2019-01-03,DVN,0.013356
3,2019-01-03,COP,-0.018933
4,2019-01-03,PXD,0.003201


In [10]:
# merge in returns for the day from YahooFinance
df_merged = pd.merge(gemini_prompt1_sentiment, ret_stack, on=['Ticker', 'Date'], how='left')
df_merged = df_merged.sort_values(['Ticker', 'Date'])
display(df_merged.head())
print(df_merged.isna().sum())

Unnamed: 0,Source,Unique_ID,Ticker,Date,URL,Finance,Production,Reserves/M&A,Regulatory,Green Energy,OG Price,Returns
7303,ProQuest,PQ-2463687415,BP,2019-01-09,https://www.proquest.com/newspapers/markets-co...,Positive,Neutral,Positive,Neutral,Neutral,Neutral,0.008244
7281,ProQuest,PQ-2311280728,BP,2019-02-11,https://www.proquest.com/newspapers/arabia-sau...,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,-0.006589
4874,Investment Research,IR-4891,BP,2019-02-20,,Positive,Positive,Neutral,Neutral,Neutral,Positive,0.00236
7295,ProQuest,PQ-2300507429,BP,2019-03-10,https://www.proquest.com/newspapers/bp-ceo-ste...,Positive,Neutral,Neutral,Negative,Neutral,Neutral,
7280,ProQuest,PQ-2311444292,BP,2019-03-11,https://www.proquest.com/newspapers/aramco-arr...,Positive,Neutral,Neutral,Neutral,Negative,Neutral,0.001894


Source             0
Unique_ID          0
Ticker             0
Date               0
URL                0
Finance            0
Production         0
Reserves/M&A       0
Regulatory         0
Green Energy       0
OG Price           0
Returns         4219
dtype: int64


In [11]:
# Check out some rows with NA's
df_with_nas = df_merged[df_merged.isna().any(axis=1)]
df_with_nas['DayOfWeek'] = df_with_nas['Date'].dt.day_name()
display(df_with_nas.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_nas['DayOfWeek'] = df_with_nas['Date'].dt.day_name()


Unnamed: 0,Source,Unique_ID,Ticker,Date,URL,Finance,Production,Reserves/M&A,Regulatory,Green Energy,OG Price,Returns,DayOfWeek
7295,ProQuest,PQ-2300507429,BP,2019-03-10,https://www.proquest.com/newspapers/bp-ceo-ste...,Positive,Neutral,Neutral,Negative,Neutral,Neutral,,Sunday
7322,ProQuest,PQ-2463580737,BP,2019-06-08,https://www.proquest.com/newspapers/reliance-b...,Positive,Neutral,Positive,Neutral,Neutral,Neutral,,Saturday
7323,ProQuest,PQ-2268739251,BP,2019-06-08,https://www.proquest.com/newspapers/reliance-b...,Positive,Positive,Positive,Neutral,Neutral,Neutral,,Saturday
7324,ProQuest,PQ-2268592372,BP,2019-06-08,https://www.proquest.com/newspapers/rils-5-500...,Positive,Neutral,Positive,Neutral,Neutral,Neutral,,Saturday
8514,SEC Filings,SEC-114036119010687,BP,2019-07-06,https://www.sec.gov/Archives/edgar/data/000031...,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,,Saturday


In [12]:
# Fill NaN values with the next day's value within the same 'Ticker'
df_merged['Returns'] = df_merged.groupby('Ticker')['Returns'].fillna(method='bfill')
display(df_merged.head())
print(df_merged.isna().sum())

Unnamed: 0,Source,Unique_ID,Ticker,Date,URL,Finance,Production,Reserves/M&A,Regulatory,Green Energy,OG Price,Returns
7303,ProQuest,PQ-2463687415,BP,2019-01-09,https://www.proquest.com/newspapers/markets-co...,Positive,Neutral,Positive,Neutral,Neutral,Neutral,0.008244
7281,ProQuest,PQ-2311280728,BP,2019-02-11,https://www.proquest.com/newspapers/arabia-sau...,Neutral,Neutral,Neutral,Neutral,Neutral,Neutral,-0.006589
4874,Investment Research,IR-4891,BP,2019-02-20,,Positive,Positive,Neutral,Neutral,Neutral,Positive,0.00236
7295,ProQuest,PQ-2300507429,BP,2019-03-10,https://www.proquest.com/newspapers/bp-ceo-ste...,Positive,Neutral,Neutral,Negative,Neutral,Neutral,0.001894
7280,ProQuest,PQ-2311444292,BP,2019-03-11,https://www.proquest.com/newspapers/aramco-arr...,Positive,Neutral,Neutral,Neutral,Negative,Neutral,0.001894


Source            0
Unique_ID         0
Ticker            0
Date              0
URL               0
Finance           0
Production        0
Reserves/M&A      0
Regulatory        0
Green Energy      0
OG Price          0
Returns         578
dtype: int64


In [13]:
# Drop last Nas
df_merged = df_merged.dropna()
print(df_merged.isna().sum())

Source          0
Unique_ID       0
Ticker          0
Date            0
URL             0
Finance         0
Production      0
Reserves/M&A    0
Regulatory      0
Green Energy    0
OG Price        0
Returns         0
dtype: int64


In [14]:
# save csv locally
df_merged.to_csv('02_Prompt2_Gemini_Prepped_Stock_Data.csv', index=False)