# Earnings Call Project: OpenAI
<br>
CIS 831 Deep Learning – Term Project<br>
Kansas State University
<br><br>
James Chapman<br>
John Woods<br>
Nathan Diehl<br>
<br>

### This notebook featurizes the text data from the earnings calls with OpenAI - ChatGPT 

OpenAI - ChatGPT  documentation can be found at https://platform.openai.com/docs/api-reference/introduction

The data from this notebook is stored in the "data/data_prep" directory as the following CSVs.
- OpenAI_sentiment
- MACE_OpenAI_sentiment


In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from tqdm import tqdm
import torch
import openai
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv() 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cuda


In [2]:
MAEC_dir = 'data/MAEC/MAEC_Dataset' # https://github.com/Earnings-Call-Dataset/MAEC-A-Multimodal-Aligned-Earnings-Conference-Call-Dataset-for-Financial-Risk-Prediction

############# too big for GitHub ########################
############# stored on local disk ######################
original_data_dir = r"D:\original_dataset" # https://github.com/GeminiLn/EarningsCall_Dataset 
MAEC_audio_dir = r"D:\MAEC_audio" 
# there is a link for the audio data in the MAEC GitHub, but it does not work
# I emailed the authors, and they send another link.
# There is like a half-million files, but only 19 GB
# https://drive.google.com/file/d/1m1GRCHgKn9Vz9IFMC_SpCog6uP3-gFgY/view?usp=drive_link 

In [3]:
# Loop through the directory, each folder represents an earnings conference call; the folders are named as "CompanyName_Date".
filename_data = []
for filename in os.listdir(original_data_dir):
    company_name, date_str = filename.rsplit('_', 1)
    date_str = date_str.split('.')[0] 
    date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
    filename_data.append([company_name, date])
filename_data = pd.DataFrame(filename_data, columns=["Company", "Date"])
company_ticker = pd.read_csv('data/data_prep/company_ticker.csv')
filename_data = filename_data.merge(company_ticker, on="Company", how="left")

# Loop through the directory, each folder represents an earnings conference call; the folders are named as "Date_CompanyName".
MAEC_filename_data = []
for filename in os.listdir(MAEC_dir):
    date_str, ticker = filename.rsplit('_', 1)
    date_str = date_str.split('.')[0] 
    date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
    MAEC_filename_data.append([ticker, date])
MAEC_filename_data = pd.DataFrame(MAEC_filename_data, columns=["Ticker", "Date"])

In [4]:

client = OpenAI(api_key= os.getenv('First_Key') )

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "Write a haiku about recursion in programming."
        }
    ]
)

print(completion.choices[0].message)

ChatCompletionMessage(content='Code calls itself back,  \nLayers unfold like petals,  \nEndless depths of thought.  ', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)


In [5]:
sentiment_list = ['Positive Outlook', 'Negative Outlook', 'Neutral/Factual', 'Cautiously Optimistic', 
                    'Concerned/Uncertain', 'Strong/Confident', 'Weak/Insecure', 'Growth-Oriented', 
                    'Cost-Conscious', 'Risk-Acknowledging']
prompt = """
Context:
You are a financial sentiment analyst. Your task is to analyze a sentence from an earnings conference call and classify it across ten sentiment categories. Each category is binary: assign a value of 1 if the category applies to the sentence and 0 otherwise.

Sentiment Categories:
1. Positive Outlook: Optimism about performance or growth potential.
2. Negative Outlook: Indications of risk, concern, or underperformance.
3. Neutral/Factual: Purely informational with no evaluative tone.
4. Cautiously Optimistic: Optimism balanced with acknowledgment of risks.
5. Concerned/Uncertain: Expressions of doubt or lack of clarity about the future.
6. Strong/Confident: Decisive language indicating leadership or control.
7. Weak/Insecure: Hesitant or non-committal tone suggesting insecurity.
8. Growth-Oriented: Focused on expansion, opportunities, or investments.
9. Cost-Conscious: Emphasis on cost management, efficiency, or budgeting.
10. Risk-Acknowledging: Recognition of challenges or uncertainties.

Instructions:
1. Evaluate each category independently.
2. Return only a list of 10 numbers (0 or 1), corresponding to the categories above, in order.
3. If a sentence applies to multiple categories (e.g., optimism and risk acknowledgment), assign 1 to all applicable categories.

Input Sentence:
{}

Output Format:
[List of 10 numbers]
"""


In [6]:
# Returns 10 floating-point values 
def get_sentiment(sentence):
    completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
            {"role": "system", "content": "You are a financial sentiment analyst."},
            {"role": "user", "content": prompt.format(sentence)}
        ],
        max_tokens=50,  
        temperature=0.0  # deterministic 
    )
    # send it
    result = completion.choices[0].message.content
    one_hot_encoding = list(map(int, result.strip("[]").split(",")))
    return one_hot_encoding

In [None]:
sentiment_errors = []
OpenAI_sentiment = []

for Company, Date in tqdm(filename_data[['Company', 'Date']].values):
    Date = Date.replace('-', '')
    text_path = f"D:/original_dataset/{Company}_{Date}/TextSequence.txt"
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                one_hot_encoding = get_sentiment(line.strip())
                features_row = np.concatenate([one_hot_encoding, [Company, Date, i]])
                OpenAI_sentiment.append(features_row)
    except KeyboardInterrupt: break
    except Exception as e:
        print((Company, Date, str(e)))
        sentiment_errors.append((Company, Date, str(e)))
OpenAI_sentiment = np.array(OpenAI_sentiment, dtype=object)
OpenAI_sentiment = pd.DataFrame(OpenAI_sentiment, columns= sentiment_list + ['Company', 'Date', 'Sentence_num'])
OpenAI_sentiment.info(verbose=False)

 10%|▉         | 39/394 [18:41:16<179:25:09, 1819.46s/it]

In [None]:
OpenAI_sentiment.describe()

Unnamed: 0,Positive Outlook,Negative Outlook,Neutral/Factual,Cautiously Optimistic,Concerned/Uncertain,Strong/Confident,Weak/Insecure,Growth-Oriented,Cost-Conscious,Risk-Acknowledging,Company,Date,Sentence_num
count,17105,17105,17105,17105,17105,17105,17105,17105,17105,17105,17105,17105,17105
unique,2,2,2,2,2,2,2,2,2,2,61,60,402
top,0,0,1,0,0,0,0,0,0,0,AmerisourceBergen Corp,20170427,1
freq,10735,15543,10816,15873,14237,13771,15680,11181,16431,14785,801,967,116


In [None]:
print(len(sentiment_errors))
sentiment_errors

64


[('Apache Corporation', '20170223', 'Connection error.'),
 ('Apache Corporation', '20171102', 'Connection error.'),
 ('Archer-Daniels-Midland Co', '20170207', 'Connection error.'),
 ('Archer-Daniels-Midland Co', '20170502', 'Connection error.'),
 ('Archer-Daniels-Midland Co', '20170801', 'Connection error.'),
 ('Archer-Daniels-Midland Co', '20171031', 'Connection error.'),
 ('AT&T Inc.', '20170425', 'Connection error.'),
 ('AT&T Inc.', '20171024', 'Connection error.'),
 ('Autodesk Inc.', '20170302', 'Connection error.'),
 ('Automatic Data Processing', '20170201', 'Connection error.'),
 ('Automatic Data Processing', '20170503', 'Connection error.'),
 ('Automatic Data Processing', '20170727', 'Connection error.'),
 ('Automatic Data Processing', '20171102', 'Connection error.'),
 ('Avery Dennison Corp', '20170201', 'Connection error.'),
 ('Avery Dennison Corp', '20170426', 'Connection error.'),
 ('Avery Dennison Corp', '20170725', 'Connection error.'),
 ('Avery Dennison Corp', '20171025',

In [None]:
###############################################
OpenAI_sentiment.to_csv(f'data/data_prep/OpenAI_sentiment.csv', index=False)
###############################################

In [None]:
sentiment_errors = pd.DataFrame(sentiment_errors)
###############################################
sentiment_errors.to_csv(f'data/data_prep/sentiment_errors.csv', index=False)
###############################################

In [None]:
MAEC_OpenAI_sentiment = []
MAEC_sentiment_errors = []
for Ticker, Date in tqdm(MAEC_filename_data[['Ticker', 'Date']].values):
    Date = Date.replace('-', '')
    text_path = f"D:/MAEC_audio/{Date}_{Ticker}/text.txt"
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                one_hot_encoding = get_sentiment(line.strip())
                features_row = np.concatenate([one_hot_encoding, [Ticker, Date, i]])
                MAEC_OpenAI_sentiment.append(features_row)
    except KeyboardInterrupt: break
    except Exception as e:
        print((Ticker, Date, str(e)))
        MAEC_sentiment_errors.append((Ticker, Date, str(e)))
MAEC_OpenAI_sentiment = np.array(MAEC_OpenAI_sentiment, dtype=object)
MAEC_OpenAI_sentiment = pd.DataFrame(MAEC_OpenAI_sentiment, columns= sentiment_list + ['Ticker', 'Date', 'Sentence_num'])
MAEC_OpenAI_sentiment.info(verbose=False)

In [None]:
MAEC_OpenAI_sentiment.describe()

In [None]:
print(len(MAEC_sentiment_errors))
MAEC_sentiment_errors

In [None]:
###############################################
MAEC_OpenAI_sentiment.to_csv(f'data/data_prep/MAEC_OpenAI_sentiment.csv', index=False)
###############################################

In [None]:
MAEC_sentiment_errors = pd.DataFrame(MAEC_sentiment_errors)
###############################################
MAEC_sentiment_errors.to_csv(f'data/data_prep/MAEC_sentiment_errors.csv', index=False)
###############################################

In [None]:
['Positive Outlook', 'Negative Outlook', 'Neutral/Factual', 'Cautiously Optimistic', 
 'Concerned/Uncertain', 'Strong/Confident', 'Weak/Insecure', 'Growth-Oriented', 
 'Cost-Conscious', 'Risk-Acknowledging']

['Positive Outlook', 'Negative Outlook', 'Neutral/Factual', 'Cautiously Optimistic', 
 'Concerned/Uncertain', 'Strong/Confident', 'Weak/Uncertain', 'Growth-Oriented', 
 'Cost-Conscious', 'Risk-Acknowledging']