# Earnings Call Project: OpenAI
<br>
CIS 831 Deep Learning – Term Project<br>
Kansas State University
<br><br>
James Chapman<br>
John Woods<br>
Nathan Diehl<br>
<br>

### This notebook featurizes the text data from the earnings calls with OpenAI - ChatGPT 

OpenAI - ChatGPT  documentation can be found at https://platform.openai.com/docs/api-reference/introduction

The data from this notebook is stored in the "data/data_prep" directory as the following CSVs.
- OpenAI_sentiment
- MACE_OpenAI_sentiment


In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from tqdm import tqdm
import torch
import openai
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv() 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cuda


In [2]:
MAEC_dir = 'data/MAEC/MAEC_Dataset' # https://github.com/Earnings-Call-Dataset/MAEC-A-Multimodal-Aligned-Earnings-Conference-Call-Dataset-for-Financial-Risk-Prediction

############# too big for GitHub ########################
############# stored on local disk ######################
original_data_dir = r"D:\original_dataset" # https://github.com/GeminiLn/EarningsCall_Dataset 
MAEC_audio_dir = r"D:\MAEC_audio" 
# there is a link for the audio data in the MAEC GitHub, but it does not work
# I emailed the authors, and they send another link.
# There is like a half-million files, but only 19 GB
# https://drive.google.com/file/d/1m1GRCHgKn9Vz9IFMC_SpCog6uP3-gFgY/view?usp=drive_link 

In [3]:
# Loop through the directory, each folder represents an earnings conference call; the folders are named as "CompanyName_Date".
filename_data = []
for filename in os.listdir(original_data_dir):
    company_name, date_str = filename.rsplit('_', 1)
    date_str = date_str.split('.')[0] 
    date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
    filename_data.append([company_name, date])
filename_data = pd.DataFrame(filename_data, columns=["Company", "Date"])
company_ticker = pd.read_csv('data/data_prep/company_ticker.csv')
filename_data = filename_data.merge(company_ticker, on="Company", how="left")

# Loop through the directory, each folder represents an earnings conference call; the folders are named as "Date_CompanyName".
MAEC_filename_data = []
for filename in os.listdir(MAEC_dir):
    date_str, ticker = filename.rsplit('_', 1)
    date_str = date_str.split('.')[0] 
    date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
    MAEC_filename_data.append([ticker, date])
MAEC_filename_data = pd.DataFrame(MAEC_filename_data, columns=["Ticker", "Date"])

In [4]:

client = OpenAI(api_key= os.getenv('First_Key') )

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "Write a haiku about recursion in programming."
        }
    ]
)

print(completion.choices[0].message)

ChatCompletionMessage(content='Function calls itself,  \nLayers deep in logic’s dance—  \nEndless loop of thought.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)


In [5]:
sentiment_list = ['Positive Outlook', 'Negative Outlook', 'Neutral/Factual', 'Cautiously Optimistic', 
                    'Concerned/Uncertain', 'Strong/Confident', 'Weak/Insecure', 'Growth-Oriented', 
                    'Cost-Conscious', 'Risk-Acknowledging']
prompt = """
Context:
You are a financial sentiment analyst. Your task is to analyze a sentence from an earnings conference call and classify it across ten sentiment categories. Each category is binary: assign a value of 1 if the category applies to the sentence and 0 otherwise.

Sentiment Categories:
1. Positive Outlook: Optimism about performance or growth potential.
2. Negative Outlook: Indications of risk, concern, or underperformance.
3. Neutral/Factual: Purely informational with no evaluative tone.
4. Cautiously Optimistic: Optimism balanced with acknowledgment of risks.
5. Concerned/Uncertain: Expressions of doubt or lack of clarity about the future.
6. Strong/Confident: Decisive language indicating leadership or control.
7. Weak/Insecure: Hesitant or non-committal tone suggesting insecurity.
8. Growth-Oriented: Focused on expansion, opportunities, or investments.
9. Cost-Conscious: Emphasis on cost management, efficiency, or budgeting.
10. Risk-Acknowledging: Recognition of challenges or uncertainties.

Instructions:
1. Evaluate each category independently.
2. Return only a list of 10 numbers (0 or 1), corresponding to the categories above, in order.
3. If a sentence applies to multiple categories (e.g., optimism and risk acknowledgment), assign 1 to all applicable categories.

Input Sentence:
{}

Output Format:
[List of 10 numbers]
"""


In [6]:
# Returns 10 floating-point values 
def get_sentiment(sentence):
    completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
            {"role": "system", "content": "You are a financial sentiment analyst."},
            {"role": "user", "content": prompt.format(sentence)}
        ],
        max_tokens=50,  
        temperature=0.0  # deterministic 
    )
    # Extract the output text
    result = completion.choices[0].message.content

    # Parse the output into a list of integers
    try:
        one_hot_encoding = list(map(int, result.strip("[]").split(",")))
    except ValueError:
        raise ValueError(f"Unexpected output format: {result}")
    
    return one_hot_encoding



In [7]:

OpenAI_sentiment = []
sentiment_errors = []
for Company, Date in tqdm(filename_data[['Company', 'Date']].values):
    Date = Date.replace('-', '')
    text_path = f"D:/original_dataset/{Company}_{Date}/TextSequence.txt"
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                one_hot_encoding = get_sentiment(line.strip())
                features_row = np.concatenate([one_hot_encoding, [Company, Date, i]])
                OpenAI_sentiment.append(features_row)
    except KeyboardInterrupt: break
    except Exception as e:
        print((Company, Date, str(e)))
        sentiment_errors.append((Company, Date, str(e)))
OpenAI_sentiment = np.array(OpenAI_sentiment, dtype=object)
OpenAI_sentiment = pd.DataFrame(OpenAI_sentiment, columns= sentiment_list + ['Company', 'Date', 'Sentence_num'])
OpenAI_sentiment.info(verbose=False)
###############################################
OpenAI_sentiment.to_csv(f'data/data_prep/OpenAI_sentiment.csv', index=False)
###############################################


 10%|█         | 60/572 [2:01:32<25:37:04, 180.13s/it]

('Apache Corporation', '20170223', 'Connection error.')


 11%|█         | 61/572 [2:01:33<17:57:31, 126.52s/it]

('Apache Corporation', '20171102', 'Connection error.')


 11%|█         | 62/572 [2:01:35<12:36:18, 88.98s/it] 

('Archer-Daniels-Midland Co', '20170207', 'Connection error.')


 11%|█         | 63/572 [2:01:47<9:20:00, 66.01s/it] 

('Archer-Daniels-Midland Co', '20170502', 'Connection error.')


 11%|█         | 64/572 [2:02:00<7:02:55, 49.95s/it]

('Archer-Daniels-Midland Co', '20170801', 'Connection error.')


 11%|█▏        | 65/572 [2:02:01<4:58:47, 35.36s/it]

('Archer-Daniels-Midland Co', '20171031', 'Connection error.')


 12%|█▏        | 66/572 [2:02:02<3:32:01, 25.14s/it]

('AT&T Inc.', '20170425', 'Connection error.')


 12%|█▏        | 67/572 [2:02:15<2:59:43, 21.35s/it]

('AT&T Inc.', '20171024', 'Connection error.')


 12%|█▏        | 68/572 [2:02:38<3:04:36, 21.98s/it]

('Autodesk Inc.', '20170302', 'Connection error.')


 12%|█▏        | 69/572 [2:02:39<2:12:07, 15.76s/it]

('Automatic Data Processing', '20170201', 'Connection error.')


 12%|█▏        | 70/572 [2:02:41<1:35:38, 11.43s/it]

('Automatic Data Processing', '20170503', 'Connection error.')


 12%|█▏        | 71/572 [2:02:42<1:10:01,  8.39s/it]

('Automatic Data Processing', '20170727', 'Connection error.')


 13%|█▎        | 72/572 [2:02:43<52:10,  6.26s/it]  

('Automatic Data Processing', '20171102', 'Connection error.')


 13%|█▎        | 73/572 [2:02:45<39:41,  4.77s/it]

('Avery Dennison Corp', '20170201', 'Connection error.')


 13%|█▎        | 74/572 [2:02:46<30:53,  3.72s/it]

('Avery Dennison Corp', '20170426', 'Connection error.')


 13%|█▎        | 75/572 [2:02:47<24:56,  3.01s/it]

('Avery Dennison Corp', '20170725', 'Connection error.')


 13%|█▎        | 76/572 [2:02:49<20:58,  2.54s/it]

('Avery Dennison Corp', '20171025', 'Connection error.')


 13%|█▎        | 77/572 [2:02:50<18:13,  2.21s/it]

('Ball Corp', '20170504', 'Connection error.')


 14%|█▎        | 78/572 [2:02:51<15:51,  1.93s/it]

('Ball Corp', '20170803', 'Connection error.')


 14%|█▍        | 79/572 [2:02:53<14:10,  1.72s/it]

('Ball Corp', '20171102', 'Connection error.')


 14%|█▍        | 80/572 [2:02:54<13:11,  1.61s/it]

('Baxter International Inc.', '20170426', 'Connection error.')


 14%|█▍        | 81/572 [2:02:55<12:20,  1.51s/it]

('Baxter International Inc.', '20170726', 'Connection error.')


 14%|█▍        | 82/572 [2:02:57<12:01,  1.47s/it]

('Becton Dickinson', '20170803', 'Connection error.')


 15%|█▍        | 83/572 [2:02:58<11:41,  1.44s/it]

('Becton Dickinson', '20171102', 'Connection error.')


 15%|█▍        | 84/572 [2:02:59<11:32,  1.42s/it]

('Biogen Inc.', '20170126', 'Connection error.')


 15%|█▍        | 85/572 [2:03:01<11:33,  1.42s/it]

('Boeing Company', '20170426', 'Connection error.')


 15%|█▌        | 86/572 [2:03:02<11:04,  1.37s/it]

('Booking Holdings Inc', '20170509', 'Connection error.')


 15%|█▌        | 87/572 [2:03:03<10:40,  1.32s/it]

('Booking Holdings Inc', '20171106', 'Connection error.')


 15%|█▌        | 88/572 [2:03:05<10:43,  1.33s/it]

('BorgWarner', '20170209', 'Connection error.')


 16%|█▌        | 89/572 [2:03:06<10:30,  1.31s/it]

('BorgWarner', '20170727', 'Connection error.')


 16%|█▌        | 90/572 [2:03:07<10:43,  1.33s/it]

('Bristol-Myers Squibb', '20170427', 'Connection error.')


 16%|█▌        | 91/572 [2:03:09<10:46,  1.34s/it]

('Bristol-Myers Squibb', '20170727', 'Connection error.')


 16%|█▌        | 92/572 [2:03:10<10:43,  1.34s/it]

('Broadridge Financial Solutions', '20170510', 'Connection error.')


 16%|█▋        | 93/572 [2:03:11<10:21,  1.30s/it]

('CA, Inc.', '20170802', 'Connection error.')


 16%|█▋        | 94/572 [2:03:13<10:49,  1.36s/it]

('Cadence Design Systems', '20170724', 'Connection error.')


 17%|█▋        | 95/572 [2:03:14<11:01,  1.39s/it]

('Campbell Soup', '20170217', 'Connection error.')


 17%|█▋        | 96/572 [2:03:15<10:45,  1.36s/it]

('Campbell Soup', '20170519', 'Connection error.')


 17%|█▋        | 97/572 [2:03:17<10:23,  1.31s/it]

('Campbell Soup', '20170831', 'Connection error.')


 17%|█▋        | 98/572 [2:03:18<10:23,  1.31s/it]

('Campbell Soup', '20171121', 'Connection error.')


 17%|█▋        | 99/572 [2:03:19<10:36,  1.35s/it]

('Cardinal Health Inc.', '20170501', 'Connection error.')


 17%|█▋        | 100/572 [2:03:21<10:42,  1.36s/it]

('Carmax Inc', '20170922', 'Connection error.')


 18%|█▊        | 101/572 [2:03:22<10:30,  1.34s/it]

('Carmax Inc', '20171221', 'Connection error.')


 18%|█▊        | 102/572 [2:03:23<10:25,  1.33s/it]

('Caterpillar Inc.', '20170126', 'Connection error.')


 18%|█▊        | 103/572 [2:03:25<10:36,  1.36s/it]

('Caterpillar Inc.', '20170425', 'Connection error.')


 18%|█▊        | 104/572 [2:03:26<10:27,  1.34s/it]

('Caterpillar Inc.', '20170725', 'Connection error.')


 18%|█▊        | 105/572 [2:03:27<10:33,  1.36s/it]

('Caterpillar Inc.', '20171024', 'Connection error.')


 19%|█▊        | 106/572 [2:03:29<10:41,  1.38s/it]

('Cboe Global Markets', '20170804', 'Connection error.')


 19%|█▊        | 107/572 [2:03:30<10:33,  1.36s/it]

('CBRE Group', '20171103', 'Connection error.')


 19%|█▉        | 108/572 [2:03:31<10:25,  1.35s/it]

('CBS Corp.', '20170807', 'Connection error.')


 19%|█▉        | 109/572 [2:03:33<10:21,  1.34s/it]

('CBS Corp.', '20171102', 'Connection error.')


 19%|█▉        | 110/572 [2:03:34<10:12,  1.33s/it]

('Celgene Corp.', '20170126', 'Connection error.')


 19%|█▉        | 111/572 [2:03:36<10:29,  1.37s/it]

('Celgene Corp.', '20170427', 'Connection error.')


 20%|█▉        | 112/572 [2:03:37<10:20,  1.35s/it]

('Celgene Corp.', '20170727', 'Connection error.')


 20%|█▉        | 113/572 [2:03:38<10:17,  1.34s/it]

('Celgene Corp.', '20171026', 'Connection error.')


 20%|█▉        | 114/572 [2:03:40<10:23,  1.36s/it]

('CenturyLink Inc', '20170802', 'Connection error.')


 20%|██        | 115/572 [2:03:41<10:18,  1.35s/it]

('CenturyLink Inc', '20171108', 'Connection error.')


 20%|██        | 116/572 [2:03:42<10:23,  1.37s/it]

('Chevron Corp.', '20170127', 'Connection error.')


 20%|██        | 117/572 [2:03:44<10:26,  1.38s/it]

('Chevron Corp.', '20170428', 'Connection error.')


 21%|██        | 118/572 [2:03:45<10:03,  1.33s/it]

('Chevron Corp.', '20170728', 'Connection error.')


 21%|██        | 119/572 [2:03:46<10:20,  1.37s/it]

('Church & Dwight', '20170803', 'Connection error.')


 21%|██        | 120/572 [2:03:48<10:21,  1.37s/it]

('Church & Dwight', '20171102', 'Connection error.')


 21%|██        | 121/572 [2:03:49<10:34,  1.41s/it]

('CIGNA Corp.', '20170202', 'Connection error.')


 21%|██▏       | 122/572 [2:03:51<10:32,  1.41s/it]

('CIGNA Corp.', '20170505', 'Connection error.')


 22%|██▏       | 123/572 [2:03:52<10:18,  1.38s/it]

('CIGNA Corp.', '20170804', 'Connection error.')


 31%|███       | 178/572 [17:30:21<38:44:56, 354.05s/it]  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17105 entries, 0 to 17104
Columns: 13 entries, Positive Outlook to Sentence_num
dtypes: object(13)
memory usage: 1.7+ MB


In [8]:
OpenAI_sentiment.describe()

Unnamed: 0,Positive Outlook,Negative Outlook,Neutral/Factual,Cautiously Optimistic,Concerned/Uncertain,Strong/Confident,Weak/Insecure,Growth-Oriented,Cost-Conscious,Risk-Acknowledging,Company,Date,Sentence_num
count,17105,17105,17105,17105,17105,17105,17105,17105,17105,17105,17105,17105,17105
unique,2,2,2,2,2,2,2,2,2,2,61,60,402
top,0,0,1,0,0,0,0,0,0,0,AmerisourceBergen Corp,20170427,1
freq,10735,15543,10816,15873,14237,13771,15680,11181,16431,14785,801,967,116


In [9]:
print(len(sentiment_errors))
sentiment_errors

64


[('Apache Corporation', '20170223', 'Connection error.'),
 ('Apache Corporation', '20171102', 'Connection error.'),
 ('Archer-Daniels-Midland Co', '20170207', 'Connection error.'),
 ('Archer-Daniels-Midland Co', '20170502', 'Connection error.'),
 ('Archer-Daniels-Midland Co', '20170801', 'Connection error.'),
 ('Archer-Daniels-Midland Co', '20171031', 'Connection error.'),
 ('AT&T Inc.', '20170425', 'Connection error.'),
 ('AT&T Inc.', '20171024', 'Connection error.'),
 ('Autodesk Inc.', '20170302', 'Connection error.'),
 ('Automatic Data Processing', '20170201', 'Connection error.'),
 ('Automatic Data Processing', '20170503', 'Connection error.'),
 ('Automatic Data Processing', '20170727', 'Connection error.'),
 ('Automatic Data Processing', '20171102', 'Connection error.'),
 ('Avery Dennison Corp', '20170201', 'Connection error.'),
 ('Avery Dennison Corp', '20170426', 'Connection error.'),
 ('Avery Dennison Corp', '20170725', 'Connection error.'),
 ('Avery Dennison Corp', '20171025',

In [10]:
###############################################
OpenAI_sentiment.to_csv(f'data/data_prep/OpenAI_sentiment.csv', index=False)
###############################################

In [11]:
sentiment_errors = pd.DataFrame(sentiment_errors)
###############################################
sentiment_errors.to_csv(f'data/data_prep/sentiment_errors.csv', index=False)
###############################################

In [None]:
stop

In [None]:
MAEC_OpenAI_sentiment = []
MAEC_sentiment_errors = []
for Ticker, Date in tqdm(MAEC_filename_data[['Ticker', 'Date']].values):
    Date = Date.replace('-', '')
    text_path = f"D:/MAEC_audio/{Date}_{Ticker}/text.txt"
    try:
        with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
            for i, line in enumerate(file, start=1):
                one_hot_encoding = get_sentiment(line.strip())
                features_row = np.concatenate([one_hot_encoding, [Ticker, Date, i]])
                MAEC_OpenAI_sentiment.append(features_row)
    except KeyboardInterrupt: break
    except Exception as e:
        print((Ticker, Date, str(e)))
        MAEC_sentiment_errors.append((Ticker, Date, str(e)))
MAEC_OpenAI_sentiment = np.array(MAEC_OpenAI_sentiment, dtype=object)
MAEC_OpenAI_sentiment = pd.DataFrame(MAEC_OpenAI_sentiment, columns= sentiment_list + ['Ticker', 'Date', 'Sentence_num'])
MAEC_OpenAI_sentiment.info(verbose=False)
###############################################
MAEC_OpenAI_sentiment.to_csv(f'data/data_prep/MAEC_OpenAI_sentiment.csv', index=False)
###############################################

In [None]:
MAEC_OpenAI_sentiment.describe()

In [None]:
print(len(MAEC_sentiment_errors))
MAEC_sentiment_errors

In [None]:
stop

In [None]:
['Positive Outlook', 'Negative Outlook', 'Neutral/Factual', 'Cautiously Optimistic', 
 'Concerned/Uncertain', 'Strong/Confident', 'Weak/Insecure', 'Growth-Oriented', 
 'Cost-Conscious', 'Risk-Acknowledging']

['Positive Outlook', 'Negative Outlook', 'Neutral/Factual', 'Cautiously Optimistic', 
 'Concerned/Uncertain', 'Strong/Confident', 'Weak/Uncertain', 'Growth-Oriented', 
 'Cost-Conscious', 'Risk-Acknowledging']

In [None]:
def apply_model(model, model_name, num_features ):
    print(f'Applying {model_name} to the original dataset …')
    # num_features
    columns = [f'{model_name}_{j}' for j in range(num_features)] + ['Company', 'Date', 'Sentence_num']
    features = []
    errors = []
    for Company, Date in tqdm(filename_data[['Company', 'Date']].values):
        Date = Date.replace('-', '')
        text_path = f"D:/original_dataset/{Company}_{Date}/TextSequence.txt"
        try:
            with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
                for i, line in enumerate(file, start=1):
                    # apply model
                    sentence_embedding = model(line.strip())
                    features_row = np.concatenate([sentence_embedding.flatten(), [Company, Date, i]])
                    features.append(features_row)
        except KeyboardInterrupt: break
        except Exception as e:
            errors.append((Company, Date, str(e)))
    features = np.array(features, dtype=object)
    features = pd.DataFrame(features, columns=columns)
    features.info(verbose=False)
    
    print(f"Number of errors: {len(errors)}")
    print(errors)
    ###############################################
    features.to_csv(f'data/data_prep/{model_name}.csv', index=False)
    ###############################################

def apply_model_MAEC(model, model_name, num_features):
    print(f'Applying {model_name} to the MAEC dataset …')
    # num_features
    columns = [f'{model_name}_{j}' for j in range(num_features)] + ['Ticker', 'Date', 'Sentence_num']
    features = []
    errors = []
    for Ticker, Date in tqdm(MAEC_filename_data[['Ticker', 'Date']].values):
        Date = Date.replace('-', '')
        text_path = f"D:/MAEC_audio/{Date}_{Ticker}/text.txt"
        try:
            with open(text_path, 'r', encoding='utf-8', errors='replace') as file:
                for i, line in enumerate(file, start=1):
                    # apply model
                    sentence_embedding = model(line.strip())
                    features_row = np.concatenate([sentence_embedding.flatten(), [Ticker, Date, i]])
                    features.append(features_row)
        except KeyboardInterrupt: break
        except Exception as e:
            errors.append((Ticker, Date, str(e)))
    features = np.array(features, dtype=object)
    features = pd.DataFrame(features, columns=columns)
    features.info(verbose=False)
    
    print(f"Number of errors: {len(errors)}")
    print(errors)
    ###############################################
    features.to_csv(f'data/data_prep/MAEC_{model_name}.csv', index=False)
    ###############################################
