In [1]:
import tiktoken
import numpy as np
import pandas as pd
import math
import my_secrets
import openai
import requests
import time
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Base model name: used to pick the tokenizer for token counting and to build the model name sent to the API.
# NOTE(review): chat_completion_request appends '-16k' to this value, so 'gpt-4' would be requested as
# 'gpt-4-16k' — confirm that model name is valid before switching.
model = 'gpt-3.5-turbo' # for better performance at a higher price point use: 'gpt-4'

# The API key is read from the local `my_secrets` module rather than being written into the notebook.
openai.api_key = my_secrets.OPENAI_API_KEY

OpenAI chat playground: https://platform.openai.com/playground?mode=chat

In [2]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens `string` encodes to.

    `encoding_name` is first looked up as a tiktoken encoding name; when
    tiktoken rejects it with ValueError it is looked up as a model name instead.
    """
    try:
        tokenizer = tiktoken.get_encoding(encoding_name)
    except ValueError:
        # not an encoding name, so resolve the encoding through the model name
        tokenizer = tiktoken.encoding_for_model(encoding_name)
    return len(tokenizer.encode(string))

def clean_text(text: str) -> str:
    """Flatten a multi-line string onto a single line.

    Line breaks are replaced by single spaces and whitespace at both ends is
    trimmed; spacing inside each line is left as it is.
    """
    lines = text.splitlines()
    flattened = ' '.join(lines)
    return flattened.strip()
    

def set_user_content(company_name: str, quarter: int, year: int, transcript: str) -> dict:
    """Build the `user` chat message that states the task and carries the transcript.

    Args:
        company_name: company name inserted into the instructions.
        quarter: fiscal quarter inserted into the instructions.
        year: fiscal year inserted into the instructions.
        transcript: transcript text (or one part of it), embedded between triple quotes.

    Returns:
        A dict of the form {"role": "user", "content": <prompt>}, where the prompt has
        been passed through clean_text (line breaks become spaces, ends are trimmed).
    """
    # The prompt is spelled out piece by piece so that the trailing spaces and the
    # indentation of each line are explicit; they are part of the text sent to the model.
    context = (
        f'Your job is to summarize the following transcript for the quarter {quarter} {year} earnings release by {company_name}. \n'
        '    Pay specific attention to discussed items that impact sales, margins and earnings. \n'
        '    Identify what the sell side analysts focused on the most in the Q&A portion, if there is one.\n'
        f'    Here is the transcript: """{transcript}"""\n'
        '    '
    )

    return {"role": "user", "content": clean_text(context)}

def generate_sub_content(transcript, model, context_window=16000, reserved_tokens=1000):
    """Split a transcript into consecutive, sentence-aligned parts sized for the LLM context window.

    The text is cut on '. ' so that every part ends at the end of a sentence.
    Parts are filled sentence by sentence until the running token count would
    reach the next token threshold.

    Args:
        transcript: full transcript text.
        model: encoding or model name handed to num_tokens_from_string.
        context_window: token capacity assumed for a single request.
        reserved_tokens: tokens added to the transcript length to leave space
            for setting the context and metadata.

    Returns:
        List of strings in transcript order. Every sentence lands in exactly one
        part, so '. '.join(parts) gives back the transcript. A part is larger
        than its budget only when a single sentence is larger than the budget.
    """
    total_tokens = num_tokens_from_string(transcript, model) + reserved_tokens
    # The "+ 1" means the text is always planned as at least two parts,
    # even when it would fit into a single context window.
    parts = math.ceil(total_tokens / context_window) + 1

    # Token budget per part, and the cumulative token thresholds that end each part but the last.
    budget = math.ceil(total_tokens / parts)
    thresholds = [(p + 1) * budget for p in range(parts - 1)]

    sentences = transcript.split('. ')
    # Per-sentence counts do not depend on the part being built, so tokenize once.
    running_total = np.cumsum([num_tokens_from_string(s, model) for s in sentences])

    # For each threshold: index of the last sentence whose running total is still below it
    # (-1 when even the first sentence is too large). The final part always runs to the end.
    part_ends = [int(np.searchsorted(running_total, limit, side='left')) - 1 for limit in thresholds]
    part_ends.append(len(sentences) - 1)

    sub_contexts = []
    start = 0
    for end in part_ends:
        if end < start:
            # An oversized sentence straddles this threshold; it is carried into the next part.
            continue
        sub_contexts.append('. '.join(sentences[start:end + 1]))
        start = end + 1  # next part begins after the sentence that closed this one

    return sub_contexts

def initialize_message(system_content):
  """Start a fresh message list holding only the system prompt.

  A new list is built on every call, so each API request begins without the
  messages of earlier requests.
  """
  system_message = {"role": "system", "content": system_content}
  return [system_message]

# Module-level prompt text shared by all requests.

# System message that redefines the role of the LLM: the default is a helpful assistant,
# here it is turned into a "stock research analyst".
# initialize_message() puts this text first in the message list, so it opens every request.
# clean_text() joins the lines below into a single line before the text is used.
system_content = clean_text(
    '''You are an intelligent stock research analyst who gives opinions on the performance of public companies in the United States.   
    You are particularly good at distilling information from earnings transcripts to the most relevant pieces. 
    Earnings transcripts generally have two sections. One section entails prepared remarks by the company's management team, which covers the last quarter or year's performance, and usually includes guidance for the upcoming quarter or remainder of the year.
    The second section is a question and answer session between the company's management team and Wall Street sell-side analysts. The Q&A segment has more dialogue.
    You will be provided part of the transcript which may include one or both sections, delimited by triple quotes.  Only use the data from the transcript provided to briefly summarize the content.
    '''
)


@retry(wait=wait_random_exponential(multiplier=1, max=40), stop=stop_after_attempt(3))
def chat_completion_request(messages, model, timeout=180):
    """POST a chat completion request to the OpenAI API and return the raw HTTP response.

    Args:
        messages: list of chat message dicts ({"role": ..., "content": ...}).
        model: base model name; '-16k' is appended before the request is sent.
        timeout: seconds to wait for the server before the attempt is abandoned.

    Returns:
        The `requests` response object. The HTTP status is not checked here, so an
        error response from the API is returned as-is and must be handled by the caller.

    Raises:
        Any exception raised by `requests.post` is logged and re-raised so that the
        retry decorator can make another attempt (up to three in total, with random
        exponential back-off); tenacity raises RetryError once the attempts are used up.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + openai.api_key,
    }
    json_data = {
        "model": model + '-16k',
        "messages": messages,
    }

    try:
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=json_data,
            timeout=timeout,  # without a timeout a stalled connection would block forever
        )
    except Exception as e:
        print("Unable to generate ChatCompletion response")
        print(f"Exception: {e}")
        # Re-raise: there is no response to return, and the retry decorator only acts on exceptions.
        raise

    return response


`EarningsTranscriptData.txt` is a comma-delimited text file with the following columns: `symbol`, `quarter`, `year`, `date`, `content`.
Running `earnings_extraction.py` generates the data in the appropriate format.

In [3]:
# Transcripts (one row per symbol / quarter / year) and the cross-reference table keyed by symbol.
df = pd.read_csv('EarningsTranscriptData.txt')
xref = pd.read_csv('company_xref.csv')

# Left join keeps every transcript row; the registrant name column is exposed as `company_name`.
df = (
    df
    .merge(xref, how='left', on='symbol')
    .rename(columns={'registrantName': 'company_name'})
)

In [None]:
# fiscal years and quarters to extract
quarters = [1,2,3,4]
years = [2018, 2019, 2020, 2021, 2022]
holdings = ['WMT', 'EQIX', 'CMG', 'MCHP', 'VTRS', 'RMD', 'PEG', 'PEP', 'CI',
            'HON', 'BALL', 'CPB', 'MRO', 'NVDA', 'PARA', 'MTCH', 'ETSY', 'EMN',
            'WBD', 'CINF', 'LDOS', 'CE', 'SBAC', 'NOW', 'MDLZ']

# Summaries saved by earlier runs, used to skip work that is already done.
# On the very first run the file does not exist yet, so start from an empty table with the same columns.
try:
  completed = pd.read_csv('gpt_summaries.csv')
except FileNotFoundError:
  completed = pd.DataFrame(columns=['symbol', 'year', 'quarter', 'chunk', 'response'])

# one single-row DataFrame per summarized chunk, collected for the save step
all_data = []
for ticker in holdings:
  for year in years:
    for quarter in quarters:

      # pull transcript from library
      _transcript = df[(df.symbol == ticker) & (df.quarter == quarter) & (df.year == year)]['content'].values
      # not all companies in sample report in all years for all quarters; loop continues if no transcript found
      if len(_transcript) < 1:
        continue
      transcript = _transcript[0]

      company_name = xref[xref.symbol == ticker].registrantName.values[0]
      # break up transcript into parts
      l_sc = generate_sub_content(transcript, model)

      # check which chunk numbers of this transcript have already been saved...
      saved = completed[(completed.symbol == ticker) & (completed.quarter == quarter) & (completed.year == year)]
      done_chunks = set(saved['chunk'].dropna().astype(int))
      pending = [sc for sc in range(len(l_sc)) if sc not in done_chunks]
      if not pending:
        print(f'Skipping {ticker} {quarter}Q{year}.')
        continue
      elif len(pending) == len(l_sc):
        print(f'Starting {ticker} {quarter}Q{year}')
      else:
        # only run the chunks that have not been saved
        print(f'Partial detected. Starting with chunk {pending[0]} out of {len(l_sc)}.')

      for sc in pending:
        # initialize message dict with `system`
        messages = initialize_message(system_content=system_content)
        # update messages with user context and this chunk only (not the whole transcript),
        # so that each request fits the context window and covers a different part of the call
        messages.append(set_user_content(company_name, quarter, year, l_sc[sc]))
        # hit api with messages, get response
        json_resp = chat_completion_request(messages=messages, model=model)
        try:
          resp = json_resp.json()['choices'][0]['message']['content'] # we just want the content
        except (KeyError, IndexError, TypeError, requests.JSONDecodeError):
          # error payload, empty `choices`, or a body that is not JSON
          print(f'WARNING: {ticker} {quarter}Q{year} skipped due to bad response.')
          continue # move on, try again later
        # store the response with its identifying data as a one-row dataframe
        _df = pd.DataFrame([{'symbol': ticker, 'year': year, 'quarter': quarter, 'chunk': sc, 'response': resp}])
        all_data.append(_df)
        time.sleep(10) # just so we don't murder the api
      print(f'Completed {ticker} {quarter}Q{year}.')
      

Append the newly generated responses to the previously saved ones and write everything back to `gpt_summaries.csv`.

In [9]:
# Write the previously saved rows plus this run's rows back to the CSV.
# pd.concat raises ValueError on an empty list, so when the run produced no new rows
# (everything was skipped or failed) the file is simply left as it is.
if all_data:
    pd.concat([completed, *all_data]).to_csv('gpt_summaries.csv', index=False)

In [10]:

# Re-read the saved summaries from disk and display them as the cell output.
completed = pd.read_csv('gpt_summaries.csv')
completed

Unnamed: 0,symbol,year,quarter,chunk,response
0,WMT,2018,1,0,"In the Q1 2018 earnings release, Walmart repor..."
1,WMT,2018,1,1,In the transcript of Walmart's Q1 2018 earning...
2,WMT,2018,2,0,Walmart reported solid results for the second ...
3,WMT,2018,2,1,"During the second quarter of fiscal 2018, Walm..."
4,WMT,2018,3,0,In the prepared remarks section of the transcr...
...,...,...,...,...,...
959,MDLZ,2022,2,1,Mondelez International reported a strong secon...
960,MDLZ,2022,3,0,Mondelez International reported a strong quart...
961,MDLZ,2022,3,1,"Mondelez International, Inc. reported strong p..."
962,MDLZ,2022,4,0,Mondelez International had a strong performanc...


What you do with the above sub-summaries is up to you.  You could group by ticker, year, and quarter and resend to the LLM with a prompt that asks it to combine the summaries in a way that keeps all unique information. 