In [1]:
import math
import time

import numpy as np
import openai
import pandas as pd
import requests
import tiktoken
from tenacity import RetryError, retry, stop_after_attempt, wait_random_exponential

import my_secrets

# Model used to generate responses and dictate tokenization.
# NOTE: chat_completion_request appends '-16k' to this name before calling the API,
# so the value here is the base model name, not the exact model requested.
model = 'gpt-3.5-turbo' # for better performance at a higher price point use: 'gpt-4'
# The API key is read from the local `my_secrets` module rather than written in the notebook.
openai.api_key = my_secrets.OPENAI_API_KEY

OpenAI Playground (chat mode): https://platform.openai.com/playground?mode=chat

In [2]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Counts how many tokens `string` encodes to.

    `encoding_name` is first tried as a tiktoken encoding name. If tiktoken
    rejects it with a ValueError, it is treated as a model name and the
    encoding registered for that model is used instead.
    """
    try:
        tokenizer = tiktoken.get_encoding(encoding_name)
    except ValueError:
        # not an encoding name; look it up as a model name instead
        tokenizer = tiktoken.encoding_for_model(encoding_name)
    return len(tokenizer.encode(string))

def clean_text(text: str) -> str:
    """Flattens multi-line text onto one line.

    Line breaks are replaced by single spaces and leading/trailing whitespace
    is stripped. Whitespace inside each line (e.g. indentation) is kept as-is.
    """
    single_line = ' '.join(text.splitlines())
    return single_line.strip()
    

def set_user_content(company_name: str, quarter: int, year: int, transcript: str) -> dict:
    """Sets the task for the LLM and delivers the transcript.

    Args:
        company_name: name of the company, inserted into the instructions.
        quarter: fiscal quarter of the earnings release.
        year: year of the earnings release.
        transcript: text to be scored; it is wrapped in triple double-quotes.

    Returns:
        A chat message dict ``{"role": "user", "content": ...}``. The content
        is the full prompt passed through ``clean_text``, so it contains no
        line breaks.
    """
    # set the initial context
    context = f'''Your job is to return a single number between 1 and 20 for the quarter {quarter} {year} earnings release by {company_name}. 
    A score of 1 means the text had extremely poor sentiment, while a score of 20 means the text had extremely positive sentiment.  
    Please only return a single number. No additional response is required.
    Here is the transcript: """{transcript}"""
    '''

    return {"role": "user", "content": clean_text(context)}

def generate_sub_content(transcript, model):
    """Breaks up a transcript into individual parts to fit the context window of the LLM.

    The transcript is split on '. ' and consecutive sentences are grouped into
    parts whose cumulative token count stays below evenly spaced limits. Each
    sentence is placed in at most one part (adjacent parts do not overlap).

    Args:
        transcript: full text to split.
        model: name handed to ``num_tokens_from_string`` to pick the tokenizer.

    Returns:
        List of non-empty sub-transcripts in their original order. Parts that
        would contain no text are omitted, so a short transcript comes back as
        a single part and an empty transcript as an empty list. If a single
        sentence is larger than a limit it is not split; it is carried into a
        later part, which may then exceed the intended size.
    """
    # token budget: the transcript plus space for setting the context and metadata
    n = num_tokens_from_string(transcript, model) + 1000
    # calculate the number of parts to aim for (the + 1 means this is always >= 2)
    parts = math.ceil(n / 16000) + 1

    # cumulative-token limit at which each part except the last has to end
    part_size = math.ceil(n / parts)
    token_limits = [(p + 1) * part_size for p in range(parts - 1)]

    # split on sentence ends so every part finishes with a complete sentence
    l_sentences = transcript.split('. ')

    # per-sentence token counts do not change between parts, so compute them once
    sum_sentences = np.array(
        [num_tokens_from_string(s, model) for s in l_sentences]
    ).cumsum()

    sub_contexts = []
    first_idx = 0  # index of the first sentence not yet assigned to a part
    for t in range(parts):
        if t < len(token_limits):
            # the running total never decreases, so the sentences below the
            # limit form a prefix and their count gives the last fitting index
            last_idx = int((sum_sentences < token_limits[t]).sum()) - 1
        else:
            # the final part takes everything that is left
            last_idx = len(l_sentences) - 1
        # nothing new fits under this limit: keep the cursor where it is
        last_idx = max(last_idx, first_idx - 1)

        sub_context = '. '.join(l_sentences[first_idx:last_idx + 1])
        if sub_context:
            sub_contexts.append(sub_context)
        first_idx = last_idx + 1

    return sub_contexts

# Task instructions placed in front of each calibration example text in initialize_message.
calibration_init = "Your job is to return a single number between 1 and 20 for the provided text. A score of 1 means the text had extremely poor sentiment, while a score of 20 means the text had extremely positive sentiment. Please only return a single number. No additional response is required. Here is the text to analyze:"

def initialize_message(system_content):
    """Builds a fresh message list for one API request.

    We will need to clear out the message box every time we hit the API, so a
    new list is created on every call. It holds the system message followed by
    a scripted calibration exchange: the module-level ``poor_sentiment2`` text
    answered with "2", then the ``positive_sentiment`` text answered with "19",
    then a hand-off turn so the next user message can carry the real text.

    Args:
        system_content: text used for the system message.

    Returns:
        List of seven chat message dicts with "role" and "content" keys.
    """
    follow_up = "That is a good response. I have another one for you to analyze."
    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": calibration_init + f'"""{poor_sentiment2}"""'},
        {"role": "assistant", "content": "2"},
        # a separating space keeps "analyze." from running into the instructions
        {"role": "user", "content": follow_up + " " + calibration_init + f'"""{positive_sentiment}"""'},
        {"role": "assistant", "content": "19"},
        {"role": "user", "content": follow_up},
        {"role": "assistant", "content": "Thank you. Ready when you are."},
    ]

@retry(wait=wait_random_exponential(multiplier=1, max=40), stop=stop_after_attempt(3))
def chat_completion_request(messages, model):
    """POSTs a chat-completion request to the OpenAI API and returns the raw response.

    Args:
        messages: list of chat message dicts sent as the request's "messages".
        model: base model name; '-16k' is appended before it is sent.

    Returns:
        The ``requests`` response object. The HTTP status is not checked here,
        so an API error reply is returned to the caller like any other reply.

    Raises:
        Any exception raised by ``requests.post`` is logged and re-raised so
        the ``retry`` decorator sees the real failure. The decorator allows up
        to 3 attempts with randomised exponential back-off; with tenacity's
        default settings a ``RetryError`` is raised once they are used up.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + openai.api_key,
    }
    json_data = {
        "model": model + '-16k',
        "messages": messages,
    }

    # NOTE(review): no timeout is passed, so a stalled connection can block indefinitely.
    try:
        return requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=json_data,
        )
    except Exception as e:
        print("Unable to generate ChatCompletion response")
        print(f"Exception: {e}")
        # re-raise: falling through would fail on an unbound `response` and
        # hide the actual error behind an UnboundLocalError
        raise

# a few global variables

# Redefines the role of the LLM. The default is a helpful assistant; here it is turned into a
# sentiment analysis machine that scores summaries of earnings call transcripts from 1 to 20.
# This is the system message that initialize_message places at the beginning of every request.
system_content = clean_text(
    '''You are a sentiment analysis machine used to determine the sentiment of summaries of companies' earnings call transcripts.
    You will receive a summary text. You will return a single number between 1 and 20, with 1 being extremely poor sentiment and 20 being euphoric sentiment, describing the text.  
    The text to be analyzed is delimited by triple quotes.
    '''
)


# calibration
# Example summaries used to show the model what low and high scores look like.

## poor sentiment
# NOTE: this first example is not referenced by the code visible in this notebook;
# initialize_message uses poor_sentiment2 instead.
poor_sentiment = clean_text('''
In this earnings call transcript, several key points regarding the company's financial performance and strategies are highlighted:
- The company starts by announcing the withdrawal of previously communicated financial guidance, likely due to the uncertainty caused by the events in March.
- The management emphasizes their commitment to serving clients and communities, even during challenging times. 
- The company experienced significant deposit outflows in March but notes that deposits stabilized by the end of March. 
- The company's capital position is described as well-capitalized, with specific Tier 1 leverage, common equity Tier 1, and total risk-based capital ratios. 
- The company mentions the suspension of dividends on common and preferred stock for prudent capital management.
- The company reports declines in sales and profitability from customer churn. 
- The company outlines several strategies to strengthen its business, including a workforce reduction and executive compensation reduction while maintaining a commitment to client service.
Overall, the transcript indicates a challenging period marked by deposit outflows, the need for additional liquidity, and a focus on stabilizing the company's financial position. The suspension of dividends and workforce reduction reflect measures taken to address these challenges while maintaining a commitment to clients and communities.
''')

# Low-sentiment calibration example: initialize_message sends this text as the first
# example and scripts the assistant's reply to it as "2".
poor_sentiment2 = clean_text(
'''
In this earnings call transcript, several key points regarding the company's financial performance and strategies are highlighted:
- The management emphasizes their commitment to serving clients and communities, even during challenging times. 
- The company experienced modest growth and believes they will still fall within guidance for the full year. 
- The company's capital position is described as well-capitalized.
- The company reports little growth in earnings.
- The company outlines several strategies to strengthen its business, including controlling expenses and being more prudent investors of its capital.
Overall, the transcript indicates a challenging period marked by the need for additional liquidity, and a focus on stabilizing the company's financial position. 
The suspension of dividends and workforce reduction reflect measures taken to address these challenges while maintaining a commitment to clients and communities.
'''
)

## great sentiment
# High-sentiment calibration example: initialize_message sends this text as the second
# example and scripts the assistant's reply to it as "19".
positive_sentiment = clean_text('''
In the earnings call transcript, the following key points are highlighted:
- The company beat expectations and raised guidance for the next quarter and remainder of the year.
- Total ARR (Annual Recurring Revenue) growth for the company was 47% in the third quarter, with organic ARR growth of 37%.
- Organic ARR growth for the company in Q3 was 37%, demonstrating consistent growth.
- The company achieved strong new logo acquisition and reached over 57,000 total active customers by the end of Q3.
- Growth was seen in all major geographies and across all of the company's top 10 commercial industries.
- Commercial markets made up nearly 75% of total bookings.
- The company attributes its growth to the expansion of devices in global enterprises.
- The company's platform capabilities are broadening to address IT and security challenges.
- The company ended the quarter with 25 million devices on its platform, representing 34% year-over-year growth.

In summary, the company reported strong growth in ARR, revenue, and the number of devices on its platform. 
The company's performance was consistent across various geographies and industries. 
''')

`EarningsTranscriptData.txt` is a comma-delimited text file with the following columns: `symbol`, `year`, `quarter`, `chunk`, `response`. Note that the next cell loads this data from `gpt_summaries.csv`.
Running `get_gpt_summaries.ipynb` generates the data in the appropriate format. See that notebook for details on each column.


In [3]:
# Combine the summary chunks of each call into one comma-joined text per (symbol, year, quarter).
# Empty CSV fields are read by pandas as NaN (a float), which str.join rejects with a
# TypeError, so missing responses are dropped before joining. A call whose responses are
# all missing ends up with an empty string.
df = (
    pd.read_csv('gpt_summaries.csv')
    .groupby(['symbol', 'year', 'quarter'])['response']
    .apply(lambda x: ','.join(x.dropna().astype(str)))
    .reset_index()
)
xref = pd.read_csv('company_xref.csv')
# attach each symbol's registrant name as `company_name`; symbols missing from xref get NaN
df = pd.merge(df, xref, how='left', on='symbol').rename({'registrantName':'company_name'}, axis=1)


In [None]:
# initialize empty list to store responses; rows that are skipped keep None
sentiment_responses = [None] * df.shape[0]

for i in range(len(sentiment_responses)):
    ticker = df.loc[i, 'symbol']
    quarter = df.loc[i, 'quarter']
    year = df.loc[i, 'year']
    transcript = df.loc[i, 'response']
    # nothing to score when the summary is missing (not a string) or empty
    if not isinstance(transcript, str) or len(transcript) < 1:
        continue
    company_name = xref[xref.symbol == ticker].registrantName.values[0]

    # initialize message dict with `system`
    messages = initialize_message(system_content=system_content)
    # update messages with user context and transcript
    messages.append(set_user_content(company_name, quarter, year, transcript))

    try:
        # hit api with messages, get response; we just want the content
        json_resp = chat_completion_request(messages=messages, model=model)
        resp = json_resp.json()['choices'][0]['message']['content']
    except (RetryError, requests.JSONDecodeError, KeyError, IndexError, TypeError):
        # RetryError: the request itself kept failing; the others: the reply was not
        # JSON or did not have the expected choices/message/content structure
        print(f'WARNING: {ticker} {quarter}Q{year} skipped due to bad response.')
        time.sleep(5) # pause after failures too, so errors do not turn into rapid-fire requests
        continue # move on, try again later

    # store the response and checkpoint everything collected so far to disk
    sentiment_responses[i] = resp
    df['sentiment_score'] = sentiment_responses
    df.to_csv('gpt_sentiment2.csv', index=False)
    time.sleep(5) # just so we don't murder the api
    print(f'Completed {ticker} {quarter}Q{year}.')
      