# Import libraries

In [1]:
# Importing required modules
import pandas as pd
from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_random_exponential
import io
import os
import json
import asyncio
from openai import AsyncOpenAI
import time

In [2]:
from dotenv import load_dotenv
# Load the .env file
load_dotenv()

True

In [3]:
import logging

# Route INFO-and-above records to two places at once: a log file under ../data
# and a stream handler, so progress is visible in the notebook while it runs.
logging.basicConfig(
    handlers=[
        logging.FileHandler("../data/find_promises_log_20240101.log"),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [4]:
import nest_asyncio
nest_asyncio.apply()

# Prepare the database

In [55]:
# Load the promises table that the rest of the notebook annotates. Later cells read
# its "3-promise-delivery-time" and "mostimportantdateutc" columns.
promises = pd.read_csv("../data/sxp1500_presentations_ceo_aggregated_promises_expanded_cleaned_transcriptlevel.csv")

  promises = pd.read_csv("../data/LIWC-22 Results - sxp1500_presentations_ceo_aggr___ - LIWC Analysis_v11.csv")


In [56]:
# Bookkeeping columns used by process_rows: `processed` is set to 1 once a row has a
# usable model answer (only rows with 0 are sent), and `result` holds the answer text.
promises['processed'] = 0
promises['result'] = None

## Setting the parameters

In [57]:
# Adjust these values based on your needs
CONCURRENT_REQUESTS = 250  # Maximum number of concurrent requests
TIMEOUT = 240  # Timeout in seconds for each request

# Constants
batch_size = 1000  # Number of requests after which to checkpoint
save_file = '../data/sxp1500_presentations_ceo_aggregated_promises_expanded_cleaned_transcriptlevel_horizon.pkl'

# for sample run
#initial_file = '../data/sample_transcripts.pkl'

# Prefer the key from the environment (the notebook already calls load_dotenv())
# so the secret does not have to be written into the notebook itself. The literal
# is only a fallback placeholder for when OPENAI_API_KEY is unset or empty.
open_ai_api_key = os.environ.get("OPENAI_API_KEY") or "..."

## Batch promise-horizon estimation functions

In [58]:
# Prompt sent to the model for every row. The two {placeholder_...} markers are
# substituted per row by create_user_message (plain str.replace, not str.format).
user_message_template = """
I am analyzing CEO promises from earnings conference calls. For each call, I have the following inputs:
• Conference date (in yyyy-mm-dd format)
• Stated delivery time or date of the promise

From these inputs, I want you to determine the number of months between the conference date and the stated delivery time. If the delivery date is not specified or is too ambiguous to deduce a clear timeframe, return “unclear.”

For delivery date ranges (e.g., “within the next few months” or “over the next few quarters”), return the maximum of that range. For example:
• “A few months” might be 2–9 months, so return 9.
• “Between now and the next three years” might be 0–36 months, so return 36.

Your output must be strictly one of the following formats:
“unclear”
Or
"An integer representing the months to delivery"

Absolutely no other text or characters should be returned. Don't explain how you arrived at the answer. Just return the answer.



Conference date (yyyy-mm-dd): {placeholder_conference_date}
Delivery time: {placeholder_delivery_date}
"""


# CORRECTED:
def create_user_message(row, user_message_template):
    """Fill the prompt template with one row's conference date and delivery time.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides the
            "3-promise-delivery-time" and "mostimportantdateutc" fields.
        user_message_template: Prompt text containing the literal markers
            "{placeholder_conference_date}" and "{placeholder_delivery_date}".

    Returns:
        The template with both markers substituted. A missing (NA) field is
        substituted as an empty string rather than the text "nan"/"None".
    """
    def _as_text(value):
        # Missing values must not leak into the prompt as "nan" or "None".
        return str(value) if pd.notna(value) else ""

    delivery_text = _as_text(row["3-promise-delivery-time"])
    date_text = _as_text(row["mostimportantdateutc"])

    # The conference date is substituted first, then the delivery time.
    with_date = user_message_template.replace("{placeholder_conference_date}", date_text)
    return with_date.replace("{placeholder_delivery_date}", delivery_text)

In [59]:

# Async OpenAI client used by fetch_chat_completion for every request.
client = AsyncOpenAI(api_key=open_ai_api_key)

# Semaphore to control the number of concurrent requests: fetch_chat_completion
# holds it around each API call, so at most CONCURRENT_REQUESTS calls run at once.
semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)


In [60]:

def _return_none_after_retries(retry_state):
    """Tenacity callback for exhausted retries; its return value goes to the caller."""
    logging.error(f"Giving up after {retry_state.attempt_number} attempts")
    return None


@retry(
    stop=stop_after_attempt(3),
    wait=wait_random_exponential(multiplier=1, max=10),
    retry_error_callback=_return_none_after_retries,
)
async def fetch_chat_completion(user_message, index):
    """Request one chat completion for *user_message*, retrying on failure.

    Args:
        user_message: Full prompt text, sent as a single user message.
        index: Row label of the request; used only in log messages.

    Returns:
        The completion response object, or None when every attempt failed
        (timeout or any other exception).
    """
    # The semaphore is held only for the duration of one attempt; the back-off
    # wait between attempts happens outside it, so a failing row does not occupy
    # a concurrency slot while it sleeps.
    async with semaphore:
        logging.info(f"Processing request for row {index}")
        try:
            response = await asyncio.wait_for(client.chat.completions.create(
                model="o3-mini-2025-01-31",
                reasoning_effort="medium",
                seed=2025,
                messages=[{"role": "user", "content": user_message}]
            ), TIMEOUT)
        except Exception as e:
            # repr() because a timeout error has an empty str(), which would
            # leave the log line without any indication of the cause.
            logging.error(f"Error in request for row {index}: {e!r}")
            # Re-raise so the retry decorator actually sees the failure;
            # returning None here would count as success and disable retries.
            raise
        logging.info(f"Completed request for row {index}")
        return response

def _reply_text(response):
    """Return the usable answer text in *response*, or None when there is none."""
    if response is None or isinstance(response, BaseException):
        return None
    try:
        content = response.choices[0].message.content
    except (AttributeError, IndexError, TypeError):
        # Malformed or empty reply: treat it as "no answer" instead of letting
        # one bad response abort the whole batch before it is checkpointed.
        return None
    if pd.isna(content) or content in ('', 'None'):
        return None
    return content


def _token_usage(response):
    """Return (prompt_tokens, completion_tokens) from *response*, 0 where absent."""
    usage = getattr(response, 'usage', None)
    return (
        getattr(usage, 'prompt_tokens', 0) or 0,
        getattr(usage, 'completion_tokens', 0) or 0,
    )


async def process_rows(df, BATCH_SIZE, save_file):
    """Query the model for every unprocessed row of *df*, checkpointing per batch.

    Rows with ``processed == 0`` are sent in slices of BATCH_SIZE. Within a slice
    all requests are started together and awaited with ``asyncio.gather``. A row
    with a usable answer gets the text in ``result`` and ``processed`` set to 1;
    a failed or empty answer leaves ``processed`` at 0 and ``result`` as None.
    *df* is modified in place and pickled to *save_file* after every batch.

    Args:
        df: DataFrame with ``processed`` and ``result`` columns plus the fields
            read by create_user_message.
        BATCH_SIZE: Number of rows per batch (and per checkpoint).
        save_file: Path of the pickle file used for checkpoints and final output.

    Returns:
        Tuple ``(df, total_prompt_tokens, total_completion_tokens)``. Token totals
        include every response that reported usage, even those without an answer.
    """
    df_non_processed = df[df['processed'] == 0]

    total_prompt_tokens = 0
    total_completion_tokens = 0

    for start_index in range(0, len(df_non_processed), BATCH_SIZE):
        batch_df = df_non_processed.iloc[start_index:start_index + BATCH_SIZE]

        task_list = [
            (index, asyncio.create_task(fetch_chat_completion(create_user_message(row, user_message_template), index)))
            for index, row in batch_df.iterrows()
        ]
        # return_exceptions=True keeps one failing task from cancelling the batch.
        responses = await asyncio.gather(*(task for _, task in task_list), return_exceptions=True)

        for (index, _), response in zip(task_list, responses):
            content = _reply_text(response)
            if content is None:
                df.at[index, 'result'] = None
            else:
                df.at[index, 'result'] = content
                df.at[index, 'processed'] = 1

            prompt_tokens, completion_tokens = _token_usage(response)
            total_prompt_tokens += prompt_tokens
            total_completion_tokens += completion_tokens

        # Save intermediate results
        df.to_pickle(save_file)

    # Account for responses that are None, NaN, or empty
    # NOTE(review): relies on Series.replace honoring an explicit None value;
    # confirm against the installed pandas version.
    df['result'] = df['result'].replace('', None).replace(float('nan'), None)
    df.to_pickle(save_file)
    return df, total_prompt_tokens, total_completion_tokens


# Running the asynchronous tasks with checkpointing
async def main(batch_size, save_file):
    """Annotate the module-level `promises` table and return the outcome.

    Thin wrapper around process_rows. It does not reload an existing checkpoint
    from *save_file*; it always works on `promises` as currently held in memory.

    Args:
        batch_size: Rows per batch/checkpoint, forwarded to process_rows.
        save_file: Checkpoint/output pickle path, forwarded to process_rows.

    Returns:
        The ``(results_df, total_prompt_tokens, total_completion_tokens)`` tuple
        produced by process_rows.
    """
    return await process_rows(promises, batch_size, save_file)



In [None]:
# Execute the main function in Jupyter notebook
# (top-level `await` works in a notebook cell; it is not valid in a plain script).
# `results` is the same DataFrame object as `promises`, updated in place.
results, total_prompt_tokens, total_completion_tokens = await main(batch_size, save_file)

In [None]:
# Display every raw model answer (None for rows without a usable answer).
list(results['result'])



In [63]:
# Share of rows whose model answer is exactly the string 'unclear'
total_count = len(results)
unclear_count = int((results['result'] == 'unclear').sum())
unclear_percentage = unclear_count / total_count if total_count > 0 else 0
print(f"Percentage of unclear promise horizons: {unclear_percentage:.2%}")

Percentage of unclear promise horizons: 46.72%


In [64]:
# First 1,000 rows of `promises`; not referenced again in the cells shown here.
test = promises.head(1000)

In [None]:
# rename column results to 3-promise-horizon-v2
# The rename is in place, so after this cell `results` (and `promises`, which is the
# same object) no longer has a 'result' column; earlier cells that read
# results['result'] cannot be re-run without re-creating it.
results.rename(columns={'result': '3-promise-horizon-v2'}, inplace=True)
results.to_csv("../data/sxp1500_presentations_ceo_aggregated_promises_expanded_cleaned_transcriptlevel_horizon.csv", index=False)