# Import Libraries and Clone GitHub

In [1]:
# Import Libraries
import os
import sys
import time


In [2]:
# Resolve the project root relative to the notebook's working directory
# (two levels up) and locate the shared source-code folder inside it.
project_dir = os.path.abspath(os.path.join(os.getcwd(), '../..'))
source_code_dir = os.path.join(project_dir, '10_Source_Code')

# Make the project's helper modules importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
if source_code_dir not in sys.path:
    sys.path.append(source_code_dir)

# These imports must come after the sys.path update above, otherwise the
# modules cannot be found.
import data_setup as ds
import gemini_setup as gs


In [3]:
# Notebook-wide configuration

# Sentiment categories scored for every document
CATEGORIES = [
    "Finance",
    "Production",
    "Reserves / Exploration / Acquisitions / Mergers / Divestments",
    "Environment / Regulatory / Geopolitics",
    "Alternative Energy / Lower Carbon",
    "Oil Price / Natural Gas Price / Gasoline Price",
]

# CSV file that holds the per-document sentiment results
SENTIMENT_RESULTS_FILE_PATH = 'Prompt1_Sentiment_Analysis_Results_CHECK.csv'

# Unique_IDs that are removed from the corpus before processing
ROWS_TO_DROP = ['PQ-2840736837']

# Import Data

In [4]:
# Load the cleaned text corpus through the project's data_setup helper.
# NOTE(review): the original comment says the helper also merges the sources -
# not visible from this notebook, confirm in data_setup.load_cleaned_data.
text_df = ds.load_cleaned_data()

In [5]:
# Quick look at the loaded corpus: dimensions, then the first and last rows
display(text_df.shape, text_df.head(), text_df.tail())

(10053, 7)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,Investment Research,IR-1,MRO,2024-05-16,Marathon Oil Corporation,"Stock Report | May 16, 2024 | NYSESymbol: MRO ...",
1,Investment Research,IR-2,EOG,2024-05-14,"EOG Resources, Inc.","Stock Report | May 14, 2024 | NYSESymbol: EOG ...",
2,Investment Research,IR-3,EOG,2024-05-11,"EOG Resources, Inc.","Stock Report | May 11, 2024 | NYSESymbol: EOG ...",
3,Investment Research,IR-4,DVN,2024-05-11,Devon Energy Corporation,"Stock Report | May 11, 2024 | NYSESymbol: DVN ...",
4,Investment Research,IR-5,COP,2024-05-07,ConocoPhillips,"Stock Report | May 07, 2024 | NYSESymbol:\n\nC...",


Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
10048,Earnings Call Q&A,EQ-338,XOM,2021-02-02,"Exxon Mobil Corporation, Q4 2020 Earnings Call...",Question and Answer\nOperator\n[Operator Instr...,
10049,Earnings Call Q&A,EQ-339,COP,2021-02-02,"ConocoPhillips, Q4 2020 Earnings Call, Feb 02,...",Question and Answer\nOperator\n[Operator Instr...,
10050,Earnings Call Q&A,EQ-340,EOG,2019-05-03,"EOG Resources, Inc., Q1 2019 Earnings Call, Ma...",Question and Answer\nOperator\n[Operator Instr...,
10051,Earnings Call Q&A,EQ-341,SHEL,2019-05-02,"Royal Dutch Shell plc, Q1 2019 Earnings Call, ...",Question and Answer\nOperator\n[Operator Instr...,
10052,Earnings Call Q&A,EQ-342,COP,2019-04-30,"ConocoPhillips, Q1 2019 Earnings Call, Apr 30,...",Question and Answer\nOperator\n[Operator Instr...,


In [6]:
# Remove the documents listed in ROWS_TO_DROP before any model calls are made.
# The message below echoes the configured IDs; it does not verify that each
# one was actually present in text_df.
text_df = ds.drop_unprocessable_rows(text_df, ROWS_TO_DROP)
print(f"Dropped rows: {ROWS_TO_DROP}")

Dropped rows: ['PQ-2840736837']


In [7]:
# Make sure a results CSV is available; create a blank one when it is missing
file_exists = ds.check_file_exists(SENTIMENT_RESULTS_FILE_PATH)

if not file_exists:
    print("The file does not exist in the current directory.")
    # Build an empty results frame from the corpus and categories, then save it
    empty_sentiment_df = ds.create_empty_sentiment_df(text_df, CATEGORIES)
    ds.save_dataframe_to_csv(empty_sentiment_df, SENTIMENT_RESULTS_FILE_PATH)
    print(f"Created and saved an empty sentiment analysis DataFrame to {SENTIMENT_RESULTS_FILE_PATH}")
else:
    print("The file exists in the current directory.")

The file exists in the current directory.


### NOTE: To re-run the sentiment analysis from scratch, delete or archive the existing CSV; the cell above will then create a blank one the next time it runs.

# Sentiment Analysis
NOTE: Google Gemini **currently** limits free-tier usage to 1,500 requests per day. Because we have over 10,000 documents, the code is designed to run over multiple days and pick up where it last left off. After each run, the user still needs to manually download the CSV and upload it to GitHub.

**Gemini Free Rate Limits**
*   15 RPM (requests per minute)
*   1 million TPM (tokens per minute)
*   1,500 RPD (requests per day)

In [8]:
# Find the first unique ID with empty values
# This is the resume point: the helper is given the results CSV path and the
# category list, and the ID it returns is the next document sent to the model.
unique_id = ds.find_first_unique_id_with_empty_values(SENTIMENT_RESULTS_FILE_PATH, CATEGORIES)
print(unique_id)

IR-1


In [9]:
# Pull the prompt inputs for the selected document and echo them for inspection
company, source, headline, text = ds.get_model_inputs(text_df, unique_id)

# Short fields first, each followed by a blank line, then the full article text
for label, value in (("Company", company), ("Source", source), ("Headline", headline)):
    print(f"{label}: {value}\n")
print(f"Text:\n{text}")

Company: MRO

Source: Investment Research

Headline: Marathon Oil Corporation

Text:
Stock Report | May 16, 2024 | NYSESymbol: MRO | MRO is in the SGP 500
Marathon Oil Corporation

Recommendation Price 12-Mo. Target Price Report Currency
| HOLD | «| * | | USD 26.44 (as of market close May 15, 2024) USD 28.00 USD

Equity Analyst Jonnathan Handshoe

CFRA

Investment Style
Mid-Cap Blend

GICS Sector Energy
Sub-Industry Oil and Gas Exploration and Production

Summary Having spun off downstream assets in mid-2011, MRO is now an independent upstream
company with mainly U.S. operations, complemented by assets in Equatorial Guinea.

Key Stock Statistics (Source: CFRA, S&P Global Market Intelligence (SPGMI), Company Reports)

52-Wk Range USD 30.06 - 21.81 Oper.EPS2024E USD 2.82 = Market Capitalization[B] USD 14.91 Beta 2.23
Trailing 12-Month EPS USD 2.49  Oper.EPS2025E USD 3.20 = Yield [%] 1.66 = 3-yr Proj. EPS CAGR[%] 7
Trailing 12-Month P/E 10.62 P/E on Oper.EPS2024E 9.38 Dividend Rate/Share 

In [10]:
# Define the prompt template
# This is a plain str.format() template, not an f-string: nothing is
# interpolated at definition time, so the placeholders use single braces.
# Placeholders filled in later: {source}, {company}, {source2}, {headline}, {text}.
prompt_template = """
Given the text from {source} about {company}, analyze the content and perform sentiment analysis across multiple predefined categories.

Sentiment options:
  - Positive
  - Neutral
  - Negative

Categories:
  - Finance
  - Production
  - Reserves / Exploration / Acquisitions / Mergers / Divestments
  - Environment / Regulatory / Geopolitics
  - Alternative Energy / Lower Carbon
  - Oil Price / Natural Gas Price / Gasoline Price

Each category should be evaluated and given a sentiment output derived from the text.
If a category is not mentioned or relevant based on the text content, mark it as 'Neutral'.

Before giving your answer, explain your reasoning and reference the article.
After going through all the categories, provide a summary in the following format:
- Category - Sentiment
- Category - Sentiment
- Category - Sentiment
- Category - Sentiment
- Category - Sentiment
- Category - Sentiment

Example Output:
- Finance - Positive
- Production - Neutral
- Reserves / Exploration / Acquisitions / Mergers / Divestments - Negative
- Environment / Regulatory / Geopolitics - Neutral
- Alternative Energy / Lower Carbon - Positive
- Oil Price / Natural Gas Price / Gasoline Price - Neutral

Make sure to use plain text, do not bold or bullet the output summary.

The text from {source2} is below:
{headline}
{text}

Remember to summarize your final answers in the following format exactly:
- Category - Sentiment
- Category - Sentiment
- Category - Sentiment
- Category - Sentiment
- Category - Sentiment
- Category - Sentiment

Make sure to use plain text and stick to the given categories and sentiment options.
DO NOT bold or bullet the output summary.
"""

In [11]:
# Phrases used to describe each data source inside the prompt.
# Each source maps to (indefinite article, noun phrase); the lookup table below
# expands that into the ("a/an <noun>", "the <noun>") pair the prompt needs.
_source_nouns = {
    "Investment Research": ("an", "analyst report"),
    "ProQuest": ("a", "news article"),
    "SEC Filings": ("an", "SEC filing"),
    "Earnings Call Presentations": ("an", "earnings call presentation"),
    "Earnings Call Q&A": ("an", "earnings call Q&A session"),
}
text_source_dict = {
    source_name: (f"{article} {noun}", f"the {noun}")
    for source_name, (article, noun) in _source_nouns.items()
}
# Pick the indefinite/definite phrases for this document's source; a source
# name missing from the table falls back to the generic "a source"/"the source".
text_source, text_source2 = text_source_dict.get(source, ("a source", "the source"))


In [12]:
# Fill the template placeholders with this document's details
prompt = prompt_template.format(
    source=text_source,
    source2=text_source2,
    company=company,
    headline=headline,
    text=text,
)

In [13]:
# Set up Gemini
# configure_gemini() comes from the project's gemini_setup module; the model
# object it returns is reused by the main loop further down.
model = gs.configure_gemini()

# Query Gemini
# Single trial request for the document selected above, printed in full so the
# reasoning and the final summary can be inspected by eye.
response = gs.query_gemini(prompt, model)
print(response)

# Parse sentiment
# The following cell only writes to the CSV when this value is a dict, so the
# type is printed as a quick sanity check.
sentiment_dict = gs.parse_sentiment(response, CATEGORIES)
print(type(sentiment_dict))

Here's an analysis of the analyst report on MRO, categorized by sentiment:

**Reasoning:**

* **Finance:** The report highlights MRO's strong financial performance, including its dividend increase, share buybacks, and free cash flow generation. The analyst expects this trend to continue.  
* **Production:**  The report notes that MRO expects production to remain flat in 2024,  but this is not necessarily a negative sentiment. It reflects a focus on free cash flow over production growth, a common industry trend. 
* **Reserves / Exploration / Acquisitions / Mergers / Divestments:** The report mentions MRO's reserve life being below the peer average, leading to concerns about reserve replacement. While MRO has been active in acquiring acreage, the report also points out the decline in its net acreage held from 2022 to 2023. This suggests a mixed picture, making the sentiment slightly negative.
* **Environment / Regulatory / Geopolitics:** The report mentions potential risks related to reg

In [14]:
# Persist the parsed result only when parsing produced a dictionary
if not isinstance(sentiment_dict, dict):
    print("Error: Sentiment dictionary not found. Skipping update.")
else:
    gs.update_csv(SENTIMENT_RESULTS_FILE_PATH, unique_id, sentiment_dict)


Row with Unique_ID 'IR-1' has been updated.


  df.loc[row_index, category] = sentiment
  df.loc[row_index, category] = sentiment
  df.loc[row_index, category] = sentiment
  df.loc[row_index, category] = sentiment
  df.loc[row_index, category] = sentiment
  df.loc[row_index, category] = sentiment


In [16]:
# Main Loop
# Repeatedly take the next document that still has empty sentiment values in
# the results CSV, query Gemini for it, and write the parsed result back.
# Each document gets up to `max_tries` attempts; the run stops at the first
# document that cannot be processed so the problem can be inspected.
#
# A response that cannot be parsed into a dict counts as a failed attempt.
# Marking it as "done" without updating the CSV would presumably make the
# lookup return the same Unique_ID again and re-query it indefinitely.

# Gemini free tier allows 15 requests per minute; space request starts so the
# loop stays under that. Only the remaining part of the interval is slept, so
# slow responses add no extra delay.
REQUESTS_PER_MINUTE = 15
min_request_interval = 60 / REQUESTS_PER_MINUTE

start_time = time.time()
unique_id = ds.find_first_unique_id_with_empty_values(SENTIMENT_RESULTS_FILE_PATH, CATEGORIES)
count = 0
max_tries = 5
last_request_time = None  # monotonic timestamp of the most recent request start

while unique_id:
    success = False

    for attempt in range(1, max_tries + 1):
        try:
            company, source, headline, text = ds.get_model_inputs(text_df, unique_id)
            text_source, text_source2 = text_source_dict.get(source, ("a source", "the source"))
            prompt = prompt_template.format(source=text_source, source2=text_source2, company=company, headline=headline, text=text)

            # Throttle: wait out the rest of the minimum interval, if any
            if last_request_time is not None:
                wait_seconds = min_request_interval - (time.monotonic() - last_request_time)
                if wait_seconds > 0:
                    time.sleep(wait_seconds)
            last_request_time = time.monotonic()

            response = gs.query_gemini(prompt, model)
            sentiment_dict = gs.parse_sentiment(response, CATEGORIES)

            # An unparseable response is a failure for this attempt, not a success
            if not isinstance(sentiment_dict, dict):
                raise ValueError("Sentiment dictionary not found in the model response")

            gs.update_csv(SENTIMENT_RESULTS_FILE_PATH, unique_id, sentiment_dict)
            success = True
            break

        except Exception as e:
            # KeyboardInterrupt is not an Exception subclass, so a manual
            # interrupt still stops the run immediately.
            print(f"Error encountered: {e}. Retry {attempt}/{max_tries}")

    if not success:
        print(f"Failed to process unique_id {unique_id} after {max_tries} attempts. Stopping.")
        unique_id = None
        break

    count += 1
    if count % 10 == 0:
        elapsed_time = time.time() - start_time
        minutes, seconds = divmod(elapsed_time, 60)
        print(f"Iteration: {count}, Elapsed Time: {int(minutes)} minutes and {seconds:.2f} seconds")

    # Re-read the CSV to find the next document that still needs results
    unique_id = ds.find_first_unique_id_with_empty_values(SENTIMENT_RESULTS_FILE_PATH, CATEGORIES)

Row with Unique_ID 'IR-2' has been updated.
Row with Unique_ID 'IR-3' has been updated.


KeyboardInterrupt: 