In [1]:
import numpy as np
import openai
import pandas as pd
import os
import anthropic
import re
import time


In [7]:
transcripts_promises = pd.read_csv('../data/sxp1500_presentations_ceo_aggregated_promises_expanded_cleaned.csv')

In [129]:
len(transcripts_promises)

74017

In [3]:
system_message = """
You are a research assistant helping to categorize CEO promises into thematic clusters.
Your task is to read each promise and return up to three keywords (or short phrases) that capture the promise’s main topics. Essentially, we are trying to capture what is being promised, in the broadest terms.

Example keywords:
Product Release, Acquisition, Increased Profits, Cost Reduction, Market Expansion, Merger, Dividend Increase, Stock Buyback, R&D Investment, New Facilities, Innovation Pipeline, Brand Extension, Market Share Growth, Digital Transformation, Sustainability Initiative, Debt Reduction, ESG Commitment, Regulatory Compliance, Cultural Transformation, Quality Enhancement, Organizational Realignment, Marketing Campaign, Profit Margin Improvement, International Partnership, AI Implementation, Data Security Upgrade, Corporate Citizenship, New Partnerships, Customer Retention, Customer Experience, Product Launch, Supply Chain Improvement, International Partnership, AI Implementation, Data Security Upgrade, Corporate Citizenship, Pricing Strategy, Efficiency Improvement, Workforce Expansion, Vertical Integration, Horizontal Integration, Employee Training, Technology Upgrade, Customer Experience Improvement, Product Diversification, Risk Management, Inventory Management, Operational Efficiency, Talent Acquisition, New Distribution Channels, Strategic Partnerships

Note that these are just examples. You are not limited to these keywords. The point is that you should return three keywords that capture the promise's main topics.


Output Format Requirements:
Return only keywords or short noun-phrases—avoid extra words like “and,” “about,” “related,” etc.
Separate the keywords with semicolons and no additional text.

Do not provide any explanations, disclaimers, or commentary—just the keywords.

If there are fewer than 3 meaningful, distinct keywords, return only the ones that apply.

If more than 3 keywords might apply, prioritize the three most central to the promise content.
"""

In [4]:
user_prompt_template = """
I am conducting research on CEO promises in earnings calls.

Below, I will provide:
The verbatim promise text.
A short explanation of the promise’s context.

Your job: Identify and output up to three keywords or short phrases that best describe the main focus or content of the promise.

Important:
Only return a single line in the format:
keyword1; keyword2; keyword3
Nothing else—no extra words or formatting.

Promise Verbatim:
{{{promise_verbatim}}}

Promise Explanation:
{{{promise_explanation}}}
"""

In [5]:
def create_user_prompt(promise_verbatim, promise_explain, user_prompt_template):
    return user_prompt_template.format(promise_verbatim=promise_verbatim, promise_explanation=promise_explain)
    
            

In [6]:

claude_api_key = "..."
client_claude = anthropic.Anthropic(api_key=claude_api_key)

In [7]:
def get_keywords(promise_verbatim, promise_explain):
    user_prompt = create_user_prompt(promise_verbatim, promise_explain, user_prompt_template)

    message = client_claude.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=9551,
        temperature=0,
        system=system_message,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": user_prompt                }
                ]
            }
        ]
    )

    return message.content[0].text

In [8]:
# test on a single promise
promise_verbatim = transcripts_promises.iloc[110]['promise_verbatim']
promise_explain = transcripts_promises.iloc[110]['promise_explain']

keywords = get_keywords(promise_verbatim, promise_explain)
print(keywords)

Strategic Selectivity; Opportunity Assessment; Business Monitoring


In [15]:
transcripts_promises.columns

Index(['transcriptid', 'companyname', 'gvkey', 'transcript_date',
       'speaker_name', 'presentation_len', 'year', 'full_transcript_len',
       'promise_verbatim', 'promise_explain', 'promise_id',
       'promise_horizon_months'],
      dtype='object')

In [None]:
# run this on all promises; save the output in a dictionary with "promise_id" as the key and "keywords" as the value
promises_keywords = {}
counter = 0
for i, row in transcripts_promises.iterrows():
    promise_id = row['promise_id']
    promise_verbatim = row['promise_verbatim']
    promise_explain = row['promise_explain']
    keywords = get_keywords(promise_verbatim, promise_explain)
    promises_keywords[promise_id] = keywords
    print(f"Processed {counter} promises")
    counter += 1


# save the output in a csv file
promises_keywords_df = pd.DataFrame(list(promises_keywords.items()), columns=['promise_id', 'keywords'])
promises_keywords_df.to_csv('promises_keywords.csv', index=False)


In [9]:
import anthropic
from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
from anthropic.types.messages.batch_create_params import Request

In [18]:
# Prepare the batch requests
def create_batch_requests(df, system_message, user_prompt_template):
    requests = []
    id_mapping = {}  # To keep track of original IDs mapped to sanitized IDs
    
    for _, row in df.iterrows():
        original_promise_id = str(row['promise_id'])
        
        # Sanitize the ID to match the required pattern: letters, numbers, underscores, hyphens only
        # Replace periods with underscores
        sanitized_id = original_promise_id.replace('.', '_')
        
        # Remove any other invalid characters
        sanitized_id = re.sub(r'[^a-zA-Z0-9_-]', '_', sanitized_id)
        
        # Ensure it's not longer than 64 characters
        if len(sanitized_id) > 64:
            sanitized_id = sanitized_id[:64]
        
        # Store the mapping
        id_mapping[sanitized_id] = original_promise_id
        
        promise_verbatim = row['promise_verbatim']
        promise_explain = row['promise_explain']
        
        # Create the user prompt
        user_prompt = create_user_prompt(promise_verbatim, promise_explain, user_prompt_template)
        
        # Create request object with sanitized ID
        request = Request(
            custom_id=sanitized_id,
            params=MessageCreateParamsNonStreaming(
                model="claude-3-7-sonnet-20250219",
                max_tokens=9551,
                temperature=0,
                system=system_message,
                messages=[
                    {
                        "role": "user",
                        "content": user_prompt
                    }
                ]
            )
        )
        
        requests.append(request)
    
    return requests, id_mapping

In [19]:
all_requests, id_mapping = create_batch_requests(transcripts_promises, system_message, user_prompt_template)
print(f"Created {len(all_requests)} batch requests")

Created 74017 batch requests


In [33]:
# Save the ID mapping for reference
id_mapping_df = pd.DataFrame(list(id_mapping.items()), columns=['sanitized_id', 'original_id'])
id_mapping_df.to_csv('id_mapping.csv', index=False)


In [20]:

claude_api_key = "..."
client_claude = anthropic.Anthropic(api_key=claude_api_key)

In [21]:
# Split into 10 chunks
total_requests = len(all_requests)
chunk_size = total_requests // 10
if total_requests % 10 > 0:
    chunk_size += 1  # Ensure we cover all requests


In [22]:

# Store batch IDs for later retrieval
batch_ids = []

# Submit each chunk without waiting for responses
for chunk_idx in range(10):
    start_idx = chunk_idx * chunk_size
    end_idx = min((chunk_idx + 1) * chunk_size, total_requests)
    
    if start_idx >= total_requests:
        break
        
    chunk_requests = all_requests[start_idx:end_idx]
    
    print(f"Submitting chunk {chunk_idx+1}/10: requests {start_idx} to {end_idx-1} ({len(chunk_requests)} requests)")
    
    try:
        # Submit the current chunk
        message_batch = client_claude.messages.batches.create(requests=chunk_requests)
        batch_ids.append(message_batch.id)
        print(f"Chunk {chunk_idx+1} submitted with ID: {message_batch.id}")
        
    except Exception as e:
        print(f"Error submitting chunk {chunk_idx+1}: {str(e)}")
    
    # Brief pause between submissions to avoid rate limits
    if chunk_idx < 9:
        time.sleep(2)


Submitting chunk 1/10: requests 0 to 7401 (7402 requests)
Chunk 1 submitted with ID: msgbatch_01AybVxLJRSrbqLVjE7wGd8b
Submitting chunk 2/10: requests 7402 to 14803 (7402 requests)
Chunk 2 submitted with ID: msgbatch_01JXLFhnPaP5RQEm9dhR1C1u
Submitting chunk 3/10: requests 14804 to 22205 (7402 requests)
Chunk 3 submitted with ID: msgbatch_01A4E5HXsztZkrSaNf1B8YRw
Submitting chunk 4/10: requests 22206 to 29607 (7402 requests)
Chunk 4 submitted with ID: msgbatch_01LxiLzHtVL6TEhoHTtKEyUU
Submitting chunk 5/10: requests 29608 to 37009 (7402 requests)
Chunk 5 submitted with ID: msgbatch_01MbssEAxdNzVt9FiA6ynNAx
Submitting chunk 6/10: requests 37010 to 44411 (7402 requests)
Chunk 6 submitted with ID: msgbatch_01Q8fM2mpQHMbABnFB4voiwf
Submitting chunk 7/10: requests 44412 to 51813 (7402 requests)
Chunk 7 submitted with ID: msgbatch_01Wu38P2jCwSXGDPGwcN1H3L
Submitting chunk 8/10: requests 51814 to 59215 (7402 requests)
Chunk 8 submitted with ID: msgbatch_0141doYCRHNF1wem13RwCyVi
Submitting chu

In [34]:
# Save the batch IDs for later checking
with open('batch_ids.txt', 'w') as f:
    for batch_id in batch_ids:
        f.write(f"{batch_id}\n")


In [29]:
message_batch = client_claude.messages.batches.retrieve(
    "msgbatch_01LhEjUe4FVPrJuMTU61hXqB",
)
print(f"Batch {message_batch.id} processing status is {message_batch.processing_status}")

Batch msgbatch_01LhEjUe4FVPrJuMTU61hXqB processing status is ended


In [16]:
print(message_batch)

MessageBatch(id='msgbatch_01CRtUZXdTeCty1GjGntL4G9', archived_at=None, cancel_initiated_at=None, created_at=datetime.datetime(2025, 3, 10, 21, 46, 10, 389581, tzinfo=datetime.timezone.utc), ended_at=datetime.datetime(2025, 3, 10, 21, 48, 38, 461605, tzinfo=TzInfo(UTC)), expires_at=datetime.datetime(2025, 3, 11, 21, 46, 10, 389581, tzinfo=datetime.timezone.utc), processing_status='ended', request_counts=MessageBatchRequestCounts(canceled=0, errored=7399, expired=0, processing=0, succeeded=0), results_url='https://api.anthropic.com/v1/messages/batches/msgbatch_01CRtUZXdTeCty1GjGntL4G9/results', type='message_batch')


In [30]:


# Stream results file in memory-efficient chunks, processing one at a time
for result in client_claude.messages.batches.results(
    "msgbatch_01HkcTjaV5uDC8jWR4ZsDV8d",
):
    match result.result.type:
        case "succeeded":
            print(f"Success! {result.custom_id}")
        case "errored":
            if result.result.error.type == "invalid_request":
                # Request body must be fixed before re-sending request
                print(f"Validation error {result.custom_id}")
            else:
                # Request can be retried directly
                print(f"Server error {result.custom_id}")
        case "expired":
            print(f"Request expired {result.custom_id}")


Success! 10353_45212_0_05
Success! 11600_45248_0_01
Success! 25338_45259_0_01
Success! 25338_45259_0_02
Success! 146017_45297_0_01
Success! 146017_45297_0_02
Success! 10840_45301_0_01
Success! 10840_45301_0_02
Success! 141913_45373_0_01
Success! 11234_45659_0_01
Success! 11234_45659_0_02
Success! 13354_45741_0_01
Success! 12689_45769_0_01
Success! 178507_45899_0_02
Success! 30463_46109_0_01
Success! 14225_46162_0_01
Success! 162129_46203_0_01
Success! 4926_46232_0_02
Success! 4926_46232_0_03
Success! 4926_46232_0_04
Success! 14172_46250_0_02
Success! 8007_46274_0_01
Success! 3007_46294_0_01
Success! 3007_46294_0_02
Success! 3007_46294_0_03
Success! 3007_46294_0_04
Success! 3007_46294_0_05
Success! 140541_46325_0_01
Success! 1045_46333_0_01
Success! 1045_46333_0_02
Success! 1045_46333_0_04
Success! 10035_46389_0_03
Success! 10035_46389_0_08
Success! 10903_46454_0_03
Success! 176701_46466_0_01
Success! 164633_46503_0_01
Success! 63138_46516_0_04
Success! 16710_46530_0_01
Success! 8245_46

In [36]:
# %%
# Function to download all batch results using the results iterator
def download_all_batch_results():
    print("Starting download of all batch results...")
    
    # Load the batch IDs
    try:
        with open('batch_ids.txt', 'r') as f:
            batch_ids = [line.strip() for line in f.readlines()]
        print(f"Loaded {len(batch_ids)} batch IDs from batch_ids.txt")
    except FileNotFoundError:
        print("Error: batch_ids.txt not found.")
        return None
    
    # Load the ID mapping
    try:
        id_mapping_df = pd.read_csv('id_mapping.csv')
        id_mapping = dict(zip(id_mapping_df['sanitized_id'], id_mapping_df['original_id']))
        print(f"Loaded mapping for {len(id_mapping)} promise IDs")
    except FileNotFoundError:
        print("Error: id_mapping.csv not found.")
        return None
    
    # Collect all responses
    promises_keywords = {}
    total_completed = 0
    total_failed = 0
    total_expired = 0
    
    for batch_idx, batch_id in enumerate(batch_ids):
        print(f"\nProcessing batch {batch_idx+1}/{len(batch_ids)} (ID: {batch_id})")
        batch_completed = 0
        batch_failed = 0
        batch_expired = 0
        
        try:
            # Use the results iterator to process each result in the batch
            for result in client_claude.messages.batches.results(batch_id):
                sanitized_id = result.custom_id
                
                # Ensure the sanitized ID exists in our mapping
                if sanitized_id not in id_mapping:
                    print(f"Warning: Sanitized ID {sanitized_id} not found in mapping")
                    continue
                
                original_id = id_mapping[sanitized_id]
                
                match result.result.type:
                    case "succeeded":
                        # Extract the content from successful result
                        content = result.result.message.content[0].text
                        promises_keywords[original_id] = content
                        batch_completed += 1
                        print(f"Success: {original_id}")
                    
                    case "errored":
                        if result.result.error.type == "invalid_request":
                            error_msg = f"Validation error: {result.result.error.message}"
                        else:
                            error_msg = f"Server error: {result.result.error.message}"
                        
                        promises_keywords[original_id] = "ERROR: " + error_msg
                        batch_failed += 1
                        print(f"Error for {original_id}: {error_msg}")
                    
                    case "expired":
                        promises_keywords[original_id] = "ERROR: Request expired"
                        batch_expired += 1
                        print(f"Request expired: {original_id}")
            
            total_completed += batch_completed
            total_failed += batch_failed
            total_expired += batch_expired
            
            print(f"Batch {batch_idx+1} processed: {batch_completed} succeeded, {batch_failed} failed, {batch_expired} expired")
        
        except Exception as e:
            print(f"Error processing batch {batch_idx+1}: {str(e)}")
    
    print(f"\nDownload complete:")
    print(f"- Total batches: {len(batch_ids)}")
    print(f"- Total promises processed: {total_completed + total_failed + total_expired}")
    print(f"- Successful: {total_completed}")
    print(f"- Failed: {total_failed}")
    print(f"- Expired: {total_expired}")
    
    # Save the results
    if promises_keywords:
        results_df = pd.DataFrame(list(promises_keywords.items()), columns=['promise_id', 'keywords'])
        results_df.to_csv('promises_keywords_results.csv', index=False)
        print(f"Results saved to promises_keywords_results.csv")
        
        # Also create a merged dataset with original data
        try:
            merged_df = pd.merge(
                transcripts_promises, 
                results_df,
                on='promise_id',
                how='left'
            )
            merged_df.to_csv('promises_with_keywords.csv', index=False)
            print(f"Merged dataset saved to promises_with_keywords.csv")
        except Exception as e:
            print(f"Error creating merged dataset: {str(e)}")
    
    return promises_keywords



In [37]:

# Run the function to download all results
all_results = download_all_batch_results()

Starting download of all batch results...
Loaded 10 batch IDs from batch_ids.txt
Loaded mapping for 74017 promise IDs

Processing batch 1/10 (ID: msgbatch_01AybVxLJRSrbqLVjE7wGd8b)
Success: 10353_45212.0_05
Success: 11600_45248.0_01
Success: 25338_45259.0_01
Success: 25338_45259.0_02
Success: 146017_45297.0_01
Success: 146017_45297.0_02
Success: 10840_45301.0_01
Success: 10840_45301.0_02
Success: 141913_45373.0_01
Success: 11234_45659.0_01
Success: 11234_45659.0_02
Success: 13354_45741.0_01
Success: 12689_45769.0_01
Success: 178507_45899.0_02
Success: 30463_46109.0_01
Success: 14225_46162.0_01
Success: 162129_46203.0_01
Success: 4926_46232.0_02
Success: 4926_46232.0_03
Success: 4926_46232.0_04
Success: 14172_46250.0_02
Success: 8007_46274.0_01
Success: 3007_46294.0_01
Success: 3007_46294.0_02
Success: 3007_46294.0_03
Success: 3007_46294.0_04
Success: 3007_46294.0_05
Success: 140541_46325.0_01
Success: 1045_46333.0_01
Success: 1045_46333.0_02
Success: 1045_46333.0_04
Success: 10035_4638

# Explore the keywords

In [41]:
# opent the promises_with_keywords.csv file
promises_with_keywords = pd.read_csv('promises_with_keywords.csv')

# print the first 5 rows of the dataframe
promises_with_keywords.head()



Unnamed: 0,transcriptid,companyname,gvkey,transcript_date,speaker_name,presentation_len,year,full_transcript_len,promise_verbatim,promise_explain,promise_id,promise_horizon_months,keywords
0,45212.0,"Team, Inc.",10353,2010-01-06,Philip Hawk,166.0,2010,12012,We have always been and continue to be fully c...,The CEO asserts a continuous commitment to eth...,10353_45212.0_05,,Business Ethics; Corporate Integrity; Stakehol...
1,45248.0,"Worthington Industries, Inc.",11600,2010-01-06,John McConnell,217.0,2010,3332,But I assure you we will remain vigilant in se...,The CEO is committing that the company will ma...,11600_45248.0_01,,Economic Monitoring; Risk Management; Continge...
2,45259.0,Bed Bath & Beyond Inc.,25338,2010-01-06,Steven Temares,819.0,2010,5320,We always look for ways to enhance our custome...,The CEO is affirming a long‐term commitment to...,25338_45259.0_01,,Customer Experience; Market Leadership; Intern...
3,45259.0,Bed Bath & Beyond Inc.,25338,2010-01-06,Steven Temares,819.0,2010,5320,While we continue to review and prioritize our...,The CEO is committing to allocate necessary ca...,25338_45259.0_02,,Capital Investment; Store Expansion; IT Enhanc...
4,45297.0,"Acuity Brands, Inc.",146017,2010-01-06,Vernon Nagel,781.0,2010,12867,"However, as I have said before, we will defend...",The CEO is committing the company to actively ...,146017_45297.0_01,,Competitive Defense; Pricing Strategy; Market ...


In [42]:
# split the keywords into a list and each one into a new column
promises_with_keywords = promises_with_keywords.assign(keywords=promises_with_keywords['keywords'].str.split(';'))

# print the first 5 rows of the dataframe
promises_with_keywords.head()



Unnamed: 0,transcriptid,companyname,gvkey,transcript_date,speaker_name,presentation_len,year,full_transcript_len,promise_verbatim,promise_explain,promise_id,promise_horizon_months,keywords
0,45212.0,"Team, Inc.",10353,2010-01-06,Philip Hawk,166.0,2010,12012,We have always been and continue to be fully c...,The CEO asserts a continuous commitment to eth...,10353_45212.0_05,,"[Business Ethics, Corporate Integrity, Stake..."
1,45248.0,"Worthington Industries, Inc.",11600,2010-01-06,John McConnell,217.0,2010,3332,But I assure you we will remain vigilant in se...,The CEO is committing that the company will ma...,11600_45248.0_01,,"[Economic Monitoring, Risk Management, Conti..."
2,45259.0,Bed Bath & Beyond Inc.,25338,2010-01-06,Steven Temares,819.0,2010,5320,We always look for ways to enhance our custome...,The CEO is affirming a long‐term commitment to...,25338_45259.0_01,,"[Customer Experience, Market Leadership, Int..."
3,45259.0,Bed Bath & Beyond Inc.,25338,2010-01-06,Steven Temares,819.0,2010,5320,While we continue to review and prioritize our...,The CEO is committing to allocate necessary ca...,25338_45259.0_02,,"[Capital Investment, Store Expansion, IT Enh..."
4,45297.0,"Acuity Brands, Inc.",146017,2010-01-06,Vernon Nagel,781.0,2010,12867,"However, as I have said before, we will defend...",The CEO is committing the company to actively ...,146017_45297.0_01,,"[Competitive Defense, Pricing Strategy, Mark..."


In [43]:
# Convert the list column to a DataFrame
new_cols = pd.DataFrame(promises_with_keywords['keywords'].tolist(), index=promises_with_keywords.index)


In [44]:
new_cols.head()

Unnamed: 0,0,1,2,3,4
0,Business Ethics,Corporate Integrity,Stakeholder Transparency,,
1,Economic Monitoring,Risk Management,Contingency Planning,,
2,Customer Experience,Market Leadership,International Expansion,,
3,Capital Investment,Store Expansion,IT Enhancement,,
4,Competitive Defense,Pricing Strategy,Market Position,,


In [45]:
# keep only the first three columns
new_cols = new_cols.iloc[:, :3]

# rename the columns
new_cols.columns = ['keyword1', 'keyword2', 'keyword3']

# Concatenate with the original DataFrame (or assign directly)

In [47]:

# Concatenate with the original DataFrame (or assign directly)
promises_with_keywords = pd.concat([promises_with_keywords, new_cols], axis=1)

In [50]:
promises_with_keywords.head(20)

Unnamed: 0,transcriptid,companyname,gvkey,transcript_date,speaker_name,presentation_len,year,full_transcript_len,promise_verbatim,promise_explain,promise_id,promise_horizon_months,keywords,keyword1,keyword2,keyword3
0,45212.0,"Team, Inc.",10353,2010-01-06,Philip Hawk,166.0,2010,12012,We have always been and continue to be fully c...,The CEO asserts a continuous commitment to eth...,10353_45212.0_05,,"[Business Ethics, Corporate Integrity, Stake...",Business Ethics,Corporate Integrity,Stakeholder Transparency
1,45248.0,"Worthington Industries, Inc.",11600,2010-01-06,John McConnell,217.0,2010,3332,But I assure you we will remain vigilant in se...,The CEO is committing that the company will ma...,11600_45248.0_01,,"[Economic Monitoring, Risk Management, Conti...",Economic Monitoring,Risk Management,Contingency Planning
2,45259.0,Bed Bath & Beyond Inc.,25338,2010-01-06,Steven Temares,819.0,2010,5320,We always look for ways to enhance our custome...,The CEO is affirming a long‐term commitment to...,25338_45259.0_01,,"[Customer Experience, Market Leadership, Int...",Customer Experience,Market Leadership,International Expansion
3,45259.0,Bed Bath & Beyond Inc.,25338,2010-01-06,Steven Temares,819.0,2010,5320,While we continue to review and prioritize our...,The CEO is committing to allocate necessary ca...,25338_45259.0_02,,"[Capital Investment, Store Expansion, IT Enh...",Capital Investment,Store Expansion,IT Enhancement
4,45297.0,"Acuity Brands, Inc.",146017,2010-01-06,Vernon Nagel,781.0,2010,12867,"However, as I have said before, we will defend...",The CEO is committing the company to actively ...,146017_45297.0_01,,"[Competitive Defense, Pricing Strategy, Mark...",Competitive Defense,Pricing Strategy,Market Position
5,45297.0,"Acuity Brands, Inc.",146017,2010-01-06,Vernon Nagel,781.0,2010,12867,"As part of this strategy, we will continue to ...",The CEO is promising that the company will mai...,146017_45297.0_02,,"[Customer Service, Productivity Improvement, ...",Customer Service,Productivity Improvement,Product Innovation
6,45301.0,UniFirst Corporation,10840,2010-01-06,Ronald Croatti,1176.0,2010,7676,"So going forward, to ensure ongoing profitabil...","In this statement, the CEO commits the company...",10840_45301.0_01,,"[Cost Controls, Operational Efficiency, Prof...",Cost Controls,Operational Efficiency,Profitability Maintenance
7,45301.0,UniFirst Corporation,10840,2010-01-06,Ronald Croatti,1176.0,2010,7676,It remains unclear exactly how many years it w...,"Here, the CEO pledges to improve the company's...",10840_45301.0_02,,"[Earnings Maximization, Shareholder Value, C...",Earnings Maximization,Shareholder Value,Customer Loyalty
8,45373.0,Global Payments Inc.,141913,2010-01-07,Paul Garcia,346.0,2010,2994,I look forward to executing upon the many oppo...,"In this statement, the CEO expresses his inten...",141913_45373.0_01,,"[Global Expansion, Market Reach, Growth Oppo...",Global Expansion,Market Reach,Growth Opportunities
9,45659.0,WD-40 Company,11234,2010-01-11,Gary O. Ridge,2023.0,2010,13580,By doing so we will provide attractive economi...,The CEO states that through the execution of t...,11234_45659.0_01,,"[Economic Returns, Stakeholder Value, Employ...",Economic Returns,Stakeholder Value,Employee Opportunity


In [49]:
promises_with_keywords.to_csv('promises_with_keywords_v2.csv', index=False)

In [8]:
promises_with_keywords = pd.read_csv('promises_with_keywords_v2.csv')

In [9]:
import numpy as np
import openai
import os
from bertopic.backend import OpenAIBackend


  from .autonotebook import tqdm as notebook_tqdm


In [53]:

def compute_and_save_keyword_embeddings(df, 
                                        keyword_columns=["keyword1", "keyword2", "keyword3"], 
                                        openai_api_key="YOUR_OPENAI_API_KEY",
                                        batch_size=100,
                                        output_file="keyword_embeddings.npy"):
    """
    Computes OpenAI embeddings for unique keywords in the specified columns of the DataFrame,
    and saves the resulting dictionary mapping each unique keyword to its embedding.
    
    :param df: pd.DataFrame containing your keywords data
    :param keyword_columns: List of column names containing keywords
    :param openai_api_key: Your OpenAI API key
    :param batch_size: Number of keywords to process per batch
    :param output_file: File path to save the embeddings dictionary
    :return: Dictionary with unique keyword as key and embedding as value
    """
    # Set OpenAI API key
    openai.api_key = openai_api_key
    
    # Prepare your embedding model (adjust as needed)
    openai_client = openai.OpenAI(api_key=openai_api_key)
    embedding_model = OpenAIBackend(openai_client, "text-embedding-3-small")
    
    # Combine keywords from specified columns into a set to avoid duplicates
    all_keywords = set()
    for col in keyword_columns:
        # Drop any missing values and add unique keywords from each column
        all_keywords.update(df[col].dropna().unique().tolist())
    
    # Convert the set back to a list for batching
    unique_keywords = list(all_keywords)
    
    # Dictionary to store the embeddings
    embeddings_dict = {}
    
    # Process embeddings in batches
    for i in range(0, len(unique_keywords), batch_size):
        batch = unique_keywords[i:i+batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(unique_keywords)-1)//batch_size + 1}")
        
        # Get embeddings for the batch (assuming your embedding_model supports this method)
        batch_embeddings = embedding_model.embed_documents(batch)
        
        # Map each keyword in the batch to its embedding
        for keyword, embedding in zip(batch, batch_embeddings):
            embeddings_dict[keyword] = embedding
            
    # Save the dictionary to a file (using np.save for simplicity)
    np.save(output_file, embeddings_dict)
    
    # Optionally, also save the list of keywords to a separate file for reference
    keywords_file = os.path.splitext(output_file)[0] + "_keywords.txt"
    with open(keywords_file, 'w', encoding='utf-8') as f:
        for keyword in unique_keywords:
            f.write(keyword + "\n")
    
    print(f"Keyword embeddings saved to {output_file}")
    print(f"Keywords saved to {keywords_file}")
    
    return embeddings_dict


In [54]:
embeddings_dict = compute_and_save_keyword_embeddings(promises_with_keywords, 
                                        keyword_columns=["keyword1", "keyword2", "keyword3"], 
                                        openai_api_key="...",
                                        batch_size=100,
                                        output_file="keyword_embeddings.npy")

Processing batch 1/569
Processing batch 2/569
Processing batch 3/569
Processing batch 4/569
Processing batch 5/569
Processing batch 6/569
Processing batch 7/569
Processing batch 8/569
Processing batch 9/569
Processing batch 10/569
Processing batch 11/569
Processing batch 12/569
Processing batch 13/569
Processing batch 14/569
Processing batch 15/569
Processing batch 16/569
Processing batch 17/569
Processing batch 18/569
Processing batch 19/569
Processing batch 20/569
Processing batch 21/569
Processing batch 22/569
Processing batch 23/569
Processing batch 24/569
Processing batch 25/569
Processing batch 26/569
Processing batch 27/569
Processing batch 28/569
Processing batch 29/569
Processing batch 30/569
Processing batch 31/569
Processing batch 32/569
Processing batch 33/569
Processing batch 34/569
Processing batch 35/569
Processing batch 36/569
Processing batch 37/569
Processing batch 38/569
Processing batch 39/569
Processing batch 40/569
Processing batch 41/569
Processing batch 42/569
P

In [10]:
embeddings_dict = np.load('keyword_embeddings.npy', allow_pickle=True).item()

In [11]:
# Now use UMAP to reduce the dimensionality of the embeddings
import umap

# Initialize UMAP
umap_model = umap.UMAP(n_components=250, random_state=42)

# Fit UMAP on the embeddings
umap_result = umap_model.fit_transform(list(embeddings_dict.values()))

# Create a DataFrame with the UMAP results
umap_df = pd.DataFrame(umap_result, index=embeddings_dict.keys())



In [12]:
umap_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
Portfolio Stabilization,9.944758,0.060961,10.00098,0.863024,0.382711,0.680412,9.569309,9.454203,5.516013,0.468586,...,5.345944,6.608919,5.883682,6.441091,4.761364,7.465069,2.837489,3.462178,4.374538,5.858528
Product Alignment,9.933063,0.065522,9.977373,0.89227,0.37106,0.764121,9.63131,9.50183,5.705505,0.468745,...,5.316481,6.513677,5.814537,6.540111,4.966141,7.493671,2.754188,3.5117,4.382485,5.761862
Anti-Racism Training,9.9451,0.044888,9.990128,0.896691,0.312486,0.651743,9.568511,9.494914,6.107701,0.527813,...,5.45008,6.559371,6.157829,6.511379,5.029871,7.34708,2.908067,3.465641,4.501915,5.816816
Animal Health Leadership,9.989934,0.045643,9.973797,0.944881,0.289391,0.527113,9.460776,9.468323,6.988937,0.42424,...,5.423872,6.62943,6.257483,6.523567,5.117696,7.203232,3.057076,3.420207,4.559132,5.885881
Drilling Site Utilization,9.99148,0.124835,9.980763,0.884709,0.480605,0.537206,9.308726,9.313994,6.635634,0.2108,...,5.405833,6.734344,6.216671,6.34404,4.638164,7.327089,3.1731,3.526688,4.279809,6.021183


In [13]:
# how many rows in umap_df
len(umap_df)

56825

In [14]:
umap_df.iloc[0]

0       9.944758
1       0.060961
2      10.000980
3       0.863024
4       0.382711
         ...    
245     7.465069
246     2.837489
247     3.462178
248     4.374538
249     5.858528
Name:  Portfolio Stabilization, Length: 250, dtype: float32

In [15]:
umap_df.loc[' Anti-Racism Training']

0      9.945100
1      0.044888
2      9.990128
3      0.896691
4      0.312486
         ...   
245    7.347080
246    2.908067
247    3.465641
248    4.501915
249    5.816816
Name:  Anti-Racism Training, Length: 250, dtype: float32

In [16]:
# function to find the most similar keywords to a given keyword
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [17]:
def find_most_similar_keywords(keyword, umap_df, n=5):
    umap_df_copy = umap_df.copy()

    # get the embedding for the given keyword from umap_df instead of embeddings_dict
    keyword_embedding = umap_df.loc[keyword].values
    
    # find the most similar keywords using cosine similarity
    umap_df_copy['similarity'] = umap_df_copy.apply(lambda x: cosine_similarity([keyword_embedding], [x.values])[0][0], axis=1)
    
    # return the top n keywords
    return umap_df_copy.sort_values(by='similarity', ascending=False).head(n)


In [18]:

# test the function
find_most_similar_keywords('Animal Health Leadership',  umap_df, n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,241,242,243,244,245,246,247,248,249,similarity
Animal Health Leadership,9.989934,0.045643,9.973797,0.944881,0.289391,0.527113,9.460776,9.468323,6.988937,0.42424,...,6.62943,6.257483,6.523567,5.117696,7.203232,3.057076,3.420207,4.559132,5.885881,1.0
Animal Health,9.990054,0.045642,9.973788,0.945009,0.289388,0.527034,9.4618,9.468354,6.987839,0.424904,...,6.62901,6.25771,6.523373,5.119055,7.203191,3.057008,3.420404,4.559151,5.885593,1.0
Animal Healthcare Industry,9.990163,0.045883,9.973629,0.94531,0.289812,0.52786,9.461359,9.468475,6.988872,0.427543,...,6.628692,6.255074,6.523554,5.118141,7.20394,3.056383,3.419679,4.558122,5.885274,1.0
Animal Health Platform,9.990031,0.045537,9.973769,0.945166,0.289455,0.527429,9.46121,9.468485,6.989216,0.425332,...,6.629092,6.257442,6.523567,5.119113,7.203032,3.057093,3.420106,4.559216,5.885839,1.0
Animal Health Solutions,9.989773,0.045604,9.973807,0.944998,0.289619,0.52837,9.462556,9.46857,6.980426,0.425131,...,6.628156,6.255199,6.523452,5.118241,7.204754,3.055346,3.420547,4.557997,5.885112,1.0


In [19]:
# Now let's do a clustering algorithm to cluster the keywords into topics
# using HDBSCAN
from hdbscan import HDBSCAN


In [20]:



# Fit HDBSCAN with soft clustering enabled
hdbscan_model = HDBSCAN(min_cluster_size=100,
                            prediction_data=True)
hdbscan_model.fit(umap_df)  # embeddings = your NxD array


In [21]:

# Outlier points will still have label -1, 
# but we can get membership scores for all clusters:
import hdbscan.prediction as hdbscan_prediction

soft_labels, strengths = hdbscan_prediction.approximate_predict(
    hdbscan_model, umap_df
)


In [29]:

# 'soft_labels[i]' is the cluster label assigned to point i 
# (or -1 if it's strongly an outlier).
# 'strengths[i]' is a membership probability for that assignment.

# If you just want a hard assignment for outliers:
for i in range(len(hdbscan_model.labels_)):
    if hdbscan_model.labels_[i] == -1:
        hdbscan_model.labels_[i] = soft_labels[i]


In [30]:

# get the cluster labels
cluster_labels = hdbscan_model.labels_



In [58]:
# what percent of cluster_labels are -1
len(cluster_labels[cluster_labels == -1]) / len(cluster_labels)


0.5476286845578531

In [31]:
# Get the cluster labels from the model (excluding noise points, which are labeled -1)
clusters = hdbscan_model.labels_
unique_clusters = np.unique(clusters[clusters != -1])  # Exclude noise

centroids = {}
for cluster in unique_clusters:
    # Select points that belong to the current cluster
    points_in_cluster = umap_df[clusters == cluster]
    # Compute the centroid as the mean of the points along each dimension
    centroids[cluster] = points_in_cluster.mean(axis=0)

print(centroids)

{0: 0      9.989385
1      0.043507
2      9.983323
3      0.689167
4      0.569603
         ...   
245    7.326698
246    2.962568
247    3.475620
248    4.404440
249    5.816343
Length: 250, dtype: float32, 1: 0      9.975521
1      0.082665
2      9.987143
3      0.906522
4      0.288826
         ...   
245    7.303524
246    2.997738
247    3.407392
248    4.371737
249    5.821356
Length: 250, dtype: float32, 2: 0      10.004759
1       0.062362
2       9.996655
3       0.915369
4       0.263282
         ...    
245     7.298195
246     2.989266
247     3.301792
248     4.506170
249     5.757980
Length: 250, dtype: float32, 3: 0      10.012821
1       0.118193
2       9.995029
3       1.027670
4       0.496714
         ...    
245     7.388079
246     2.942404
247     3.314292
248     4.245853
249     5.834552
Length: 250, dtype: float32, 4: 0      9.960823
1      0.060980
2      9.985805
3      0.921932
4      0.501537
         ...   
245    7.439792
246    2.899329
247    3.56308

In [32]:
# how many clusters are there?
len(unique_clusters)



117

In [33]:
def find_closest_keywords(centroid_embedding, umap_df, n=5):
    umap_df_copy = umap_df.copy()
    
    centroid_embedding = np.array(centroid_embedding).reshape(1, -1)
    # find the most similar keywords using cosine similarity
    umap_df_copy['similarity'] = umap_df_copy.apply(lambda x: cosine_similarity(centroid_embedding, [x.values])[0][0], axis=1)
    
    # return the top n keywords
    return umap_df_copy.sort_values(by='similarity', ascending=False).head(n)



In [34]:
find_closest_keywords(centroids[13], umap_df, n=50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,241,242,243,244,245,246,247,248,249,similarity
Facility Closure Preparation,10.000917,0.169224,9.979281,0.893913,0.426965,0.619442,9.364647,9.380095,6.349282,0.454425,...,6.654347,6.101491,6.223671,4.677057,7.418909,3.0003,3.41663,4.220142,5.910389,1.0
Manufacturing Closure,10.00063,0.169083,9.978893,0.893793,0.424765,0.614122,9.360551,9.380047,6.366021,0.442983,...,6.656155,6.100358,6.227611,4.677202,7.418594,3.001859,3.419253,4.222125,5.912386,1.0
Plant Closures,10.000777,0.168573,9.979174,0.893894,0.425485,0.616782,9.358087,9.379754,6.358941,0.44909,...,6.656121,6.103097,6.223732,4.675292,7.417872,3.002289,3.416709,4.220807,5.912496,1.0
Operational Closure,10.001079,0.169926,9.979454,0.893515,0.427939,0.620008,9.363116,9.379417,6.344651,0.454252,...,6.655048,6.101523,6.220895,4.674013,7.419889,3.00039,3.416314,4.218307,5.911335,1.0
Manufacturing Closures,10.000535,0.168539,9.978943,0.8939,0.426494,0.615568,9.360683,9.379937,6.365591,0.444903,...,6.655941,6.101776,6.227531,4.677729,7.418042,3.002715,3.41936,4.221867,5.912952,1.0
Plant Closures,10.000651,0.16842,9.979187,0.894035,0.426255,0.617257,9.357822,9.379714,6.358561,0.449553,...,6.65598,6.103711,6.224004,4.675982,7.417525,3.002932,3.416953,4.220752,5.912961,1.0
Facility Closure,10.000811,0.169455,9.979248,0.894345,0.428281,0.62093,9.366587,9.380189,6.34728,0.456088,...,6.6535,6.102472,6.22523,4.679254,7.418166,3.001058,3.417172,4.22051,5.910137,1.0
Facility Closure,10.001279,0.167851,9.979184,0.894235,0.424072,0.618707,9.365534,9.380785,6.352575,0.455344,...,6.654187,6.100745,6.224859,4.678004,7.418212,2.998981,3.415281,4.222254,5.908319,1.0
Plant Closure,10.001213,0.170957,9.979509,0.893493,0.429695,0.620798,9.362297,9.378835,6.345802,0.455835,...,6.655241,6.10346,6.219325,4.673444,7.419403,3.001765,3.416191,4.217104,5.912138,0.999999
Business Closure,10.001418,0.17054,9.979465,0.8938,0.428667,0.621013,9.365085,9.379391,6.34632,0.458956,...,6.654453,6.103009,6.220065,4.675596,7.418978,3.000793,3.415457,4.218211,5.910375,0.999999


In [35]:
# get the top 100 keywords for each cluster
top_keywords = {}
for cluster in unique_clusters:
    top_keywords[cluster] = find_closest_keywords(centroids[cluster], umap_df, n=100)

# print the top 100 keywords for each cluster
print(top_keywords)

{0:                                    0         1         2         3         4   
 Revenue Synergy Targets    9.989699  0.043148  9.983479  0.688358  0.569929  \
Cost Synergy Target         9.989035  0.043804  9.983086  0.690067  0.568932   
Innovation Synergy          9.989630  0.043358  9.983462  0.689919  0.568384   
Cost Synergy Targets        9.988908  0.043927  9.983030  0.690261  0.568979   
 Synergy Extraction         9.989616  0.043232  9.983574  0.685281  0.574495   
...                              ...       ...       ...       ...       ...   
Growth Synergies            9.991096  0.041803  9.984309  0.685423  0.571178   
 Merger Synergies           9.990876  0.042140  9.984230  0.686307  0.571531   
 Marine Business Synergies  9.991011  0.041860  9.984270  0.685245  0.571481   
Merger Synergies            9.990882  0.042158  9.984238  0.686414  0.571585   
Synergies                   9.991003  0.041888  9.984256  0.685612  0.571110   

                                   

In [36]:
test = top_keywords[13]

In [37]:
test.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,241,242,243,244,245,246,247,248,249,similarity
Facility Closure Preparation,10.000917,0.169224,9.979281,0.893913,0.426965,0.619442,9.364647,9.380095,6.349282,0.454425,...,6.654347,6.101491,6.223671,4.677057,7.418909,3.0003,3.41663,4.220142,5.910389,1.0
Manufacturing Closure,10.00063,0.169083,9.978893,0.893793,0.424765,0.614122,9.360551,9.380047,6.366021,0.442983,...,6.656155,6.100358,6.227611,4.677202,7.418594,3.001859,3.419253,4.222125,5.912386,1.0
Plant Closures,10.000777,0.168573,9.979174,0.893894,0.425485,0.616782,9.358087,9.379754,6.358941,0.44909,...,6.656121,6.103097,6.223732,4.675292,7.417872,3.002289,3.416709,4.220807,5.912496,1.0
Operational Closure,10.001079,0.169926,9.979454,0.893515,0.427939,0.620008,9.363116,9.379417,6.344651,0.454252,...,6.655048,6.101523,6.220895,4.674013,7.419889,3.00039,3.416314,4.218307,5.911335,1.0
Manufacturing Closures,10.000535,0.168539,9.978943,0.8939,0.426494,0.615568,9.360683,9.379937,6.365591,0.444903,...,6.655941,6.101776,6.227531,4.677729,7.418042,3.002715,3.41936,4.221867,5.912952,1.0
Plant Closures,10.000651,0.16842,9.979187,0.894035,0.426255,0.617257,9.357822,9.379714,6.358561,0.449553,...,6.65598,6.103711,6.224004,4.675982,7.417525,3.002932,3.416953,4.220752,5.912961,1.0
Facility Closure,10.000811,0.169455,9.979248,0.894345,0.428281,0.62093,9.366587,9.380189,6.34728,0.456088,...,6.6535,6.102472,6.22523,4.679254,7.418166,3.001058,3.417172,4.22051,5.910137,1.0
Facility Closure,10.001279,0.167851,9.979184,0.894235,0.424072,0.618707,9.365534,9.380785,6.352575,0.455344,...,6.654187,6.100745,6.224859,4.678004,7.418212,2.998981,3.415281,4.222254,5.908319,1.0
Plant Closure,10.001213,0.170957,9.979509,0.893493,0.429695,0.620798,9.362297,9.378835,6.345802,0.455835,...,6.655241,6.10346,6.219325,4.673444,7.419403,3.001765,3.416191,4.217104,5.912138,0.999999
Business Closure,10.001418,0.17054,9.979465,0.8938,0.428667,0.621013,9.365085,9.379391,6.34632,0.458956,...,6.654453,6.103009,6.220065,4.675596,7.418978,3.000793,3.415457,4.218211,5.910375,0.999999


In [38]:
list(test.index)

[' Facility Closure Preparation',
 'Manufacturing Closure',
 'Plant Closures',
 ' Operational Closure',
 'Manufacturing Closures',
 ' Plant Closures',
 ' Facility Closure',
 'Facility Closure',
 'Plant Closure',
 ' Business Closure',
 ' Facility Closings',
 ' Manufacturing Closure',
 'Facility Closures',
 'International Site Closure',
 ' Mine Closure',
 'Business Closure',
 ' Plant Closure',
 ' Facility Closures',
 'Office Closure Policy',
 ' Site Closures',
 'Site Closure',
 ' Holiday Closure',
 ' Strategic Closures',
 'Hospital Closure',
 'Office Closure',
 ' Temporary Closure Support',
 'Office Closures',
 ' Site Closure',
 ' Store Closure',
 'Mine Closure',
 ' Branch Closures',
 ' TSA Closure',
 ' Wholesale Closure',
 'Bank Closure',
 ' Location Closures',
 'Branch Closings',
 'Mill Closure',
 ' Controlled Closure',
 ' Store Closures',
 ' No Further Closures',
 'Store Closure Plan',
 'Retail Closures',
 'Store Closings',
 'Store Closure',
 'Fund Closure',
 ' Phased Closings',
 ' Co

In [39]:
def get_keywords_string(cluster_num):
    """
    Get a semicolon-separated string of keywords for a specific cluster.
    
    Args:
        cluster_num: The cluster number to get keywords for
        
    Returns:
        A string with all keywords for the cluster joined by semicolons
    """
    if cluster_num not in top_keywords:
        return ""
    
    return "; ".join(top_keywords[cluster_num].index.tolist())

get_keywords_string(13)

' Facility Closure Preparation; Manufacturing Closure; Plant Closures;  Operational Closure; Manufacturing Closures;  Plant Closures;  Facility Closure; Facility Closure; Plant Closure;  Business Closure;  Facility Closings;  Manufacturing Closure; Facility Closures; International Site Closure;  Mine Closure; Business Closure;  Plant Closure;  Facility Closures; Office Closure Policy;  Site Closures; Site Closure;  Holiday Closure;  Strategic Closures; Hospital Closure; Office Closure;  Temporary Closure Support; Office Closures;  Site Closure;  Store Closure; Mine Closure;  Branch Closures;  TSA Closure;  Wholesale Closure; Bank Closure;  Location Closures; Branch Closings; Mill Closure;  Controlled Closure;  Store Closures;  No Further Closures; Store Closure Plan; Retail Closures; Store Closings; Store Closure; Fund Closure;  Phased Closings;  Competitor Store Closings; Branch Closure; Restaurant Closures; Store Closures;  Ride Closure; Factory Closure; Asset Closure; Branch Closure

In [40]:
cluster_description_prompt = """
Instruction:
I have used LLMs to identify CEO promises from the text of earnings calls transcripts. Afterwards, I asked the model to give three keywords for each promise. I want to create a classifciation. I have used clustering algorithm to cluster those keywords. Now below, you see keywords for one of the clusters. I want you to help me label this cluster.

Output Format:
The label for the cluster. Nothing else. No explanation, no periods at the end.

Keywords:
{keywords_string}

"""

In [41]:
from openai import OpenAI

client = OpenAI(
  api_key="...", 
)


def get_topic_description(keywords_string):
    # run each set of keywords through openai to get the topic description
    response = client.chat.completions.create(
        model="o3-mini-2025-01-31",
        reasoning_effort="high",
        seed=2025,
        messages=[
            {"role": "user", "content": cluster_description_prompt.format(keywords_string=keywords_string)}]
    )
    return response.choices[0].message.content

In [43]:
# test for the first cluster
get_topic_description(get_keywords_string(13))

Closures and Shutdowns


'Closures and Shutdowns'

In [45]:
# run for all rows in top_keywords
cluster_descriptions = {}
for i in range(len(top_keywords)):
    cluster_descriptions[i] = get_topic_description(get_keywords_string(i))
    print(f"Processed cluster {i} \n")
    print(cluster_descriptions[i])

# save in a dataframe
cluster_descriptions_df = pd.DataFrame(cluster_descriptions, index=["description"])



Processed cluster 0 

Synergy
Processed cluster 1 

Regional Strategic Initiatives
Processed cluster 2 

Diversification Strategies
Processed cluster 3 

Divestitures
Processed cluster 4 

Fulfillment
Processed cluster 5 

Franchise Growth and Development
Processed cluster 6 

Timeline Commitments
Processed cluster 7 

Personalization and Customization
Processed cluster 8 

Update Commitments
Processed cluster 9 

Channel Strategy and Expansion
Processed cluster 10 

Omnichannel Strategy
Processed cluster 11 

Scheduling and Delay Management
Processed cluster 12 

Rollout Initiatives
Processed cluster 13 

Closures and Shutdowns
Processed cluster 14 

Supply Chain Transformation and Resilience
Processed cluster 15 

Continuity
Processed cluster 16 

Completion Milestones
Processed cluster 17 

Dividend Policy
Processed cluster 18 

Corporate Restructuring
Processed cluster 19 

Capacity Expansion
Processed cluster 20 

Capabilities Development and Enhancement
Processed cluster 21 

Sus

In [46]:
clusters = hdbscan_model.labels_

In [47]:
keyword2cluster = dict(zip(umap_df.index, clusters))

In [48]:
cluster_descriptions_df = pd.DataFrame(cluster_descriptions, index=["description"])


In [49]:

cluster_desc_dict = cluster_descriptions_df.loc["description"].to_dict()

In [50]:
def get_cluster_label(cluster_id):
    if cluster_id == -1:
        return "Unclustered"
    else:
        return cluster_desc_dict.get(cluster_id, "Unclustered")  # fallback


In [54]:
df = pd.read_csv("promises_with_keywords_v2.csv")


In [56]:
def assign_label(keyword):
    # Handle missing/empty keywords
    if pd.isna(keyword) or not keyword.strip():
        return "Unclustered"
    # Get the cluster ID from the dictionary (default to -1 if keyword not in dictionary)
    cluster_id = keyword2cluster.get(keyword, -1)
    # Map to the descriptive label
    return get_cluster_label(cluster_id)

df["keyword1_label"] = df["keyword1"].apply(assign_label)
df["keyword2_label"] = df["keyword2"].apply(assign_label)
df["keyword3_label"] = df["keyword3"].apply(assign_label)


In [57]:
df

Unnamed: 0,transcriptid,companyname,gvkey,transcript_date,speaker_name,presentation_len,year,full_transcript_len,promise_verbatim,promise_explain,promise_id,promise_horizon_months,keywords,keyword1,keyword2,keyword3,keyword1_label,keyword2_label,keyword3_label
0,45212.0,"Team, Inc.",10353,2010-01-06,Philip Hawk,166.0,2010,12012,We have always been and continue to be fully c...,The CEO asserts a continuous commitment to eth...,10353_45212.0_05,,"['Business Ethics', ' Corporate Integrity', ' ...",Business Ethics,Corporate Integrity,Stakeholder Transparency,Unclustered,Unclustered,Shareholder and Stakeholder Trust and Engagement
1,45248.0,"Worthington Industries, Inc.",11600,2010-01-06,John McConnell,217.0,2010,3332,But I assure you we will remain vigilant in se...,The CEO is committing that the company will ma...,11600_45248.0_01,,"['Economic Monitoring', ' Risk Management', ' ...",Economic Monitoring,Risk Management,Contingency Planning,Unclustered,Risk Management,Recovery Strategies
2,45259.0,Bed Bath & Beyond Inc.,25338,2010-01-06,Steven Temares,819.0,2010,5320,We always look for ways to enhance our custome...,The CEO is affirming a long‐term commitment to...,25338_45259.0_01,,"['Customer Experience', ' Market Leadership', ...",Customer Experience,Market Leadership,International Expansion,Experience Enhancement,Leadership,Unclustered
3,45259.0,Bed Bath & Beyond Inc.,25338,2010-01-06,Steven Temares,819.0,2010,5320,While we continue to review and prioritize our...,The CEO is committing to allocate necessary ca...,25338_45259.0_02,,"['Capital Investment', ' Store Expansion', ' I...",Capital Investment,Store Expansion,IT Enhancement,Investment,Expansion Plans,Unclustered
4,45297.0,"Acuity Brands, Inc.",146017,2010-01-06,Vernon Nagel,781.0,2010,12867,"However, as I have said before, we will defend...",The CEO is committing the company to actively ...,146017_45297.0_01,,"['Competitive Defense', ' Pricing Strategy', '...",Competitive Defense,Pricing Strategy,Market Position,Unclustered,Pricing and Rate Management,Market Positioning Strategy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74012,2875107.0,"United Rentals, Inc.",66065,2021-07-29,Matthew Flannery,1106.0,2021,6583,"This year, we've opened 19 new specialty branc...",The CEO is outlining a commitment to further e...,66065_2875107.0_01,5.0,"['Branch Expansion', ' Specialty Branches', ' ...",Branch Expansion,Specialty Branches,Growth Target,Expansion Plans,Unclustered,Long-term and multi-year targets
74013,2875128.0,"AMETEK, Inc.",1598,2021-08-03,David Zapico,1192.0,2021,7530,"Additionally, we're continuing to make key inv...","In this statement, the CEO commits to ongoing ...",1598_2875128.0_01,,"['R&D Investment', ' Organic Growth', ' Innova...",R&D Investment,Organic Growth,Innovation Pipeline,Unclustered,Unclustered,Unclustered
74014,2875370.0,Kennametal Inc.,6386,2021-05-04,Christopher Rossi,865.0,2021,6641,"For operational excellence, we continue to exe...",The CEO is communicating the company’s commitm...,6386_2875370.0_02,,"['Cost Reduction', ' Operational Efficiency', ...",Cost Reduction,Operational Efficiency,Simplification,Cost Reduction Initiatives,Efficiency,Unclustered
74015,2875419.0,"United Rentals, Inc.",66065,2021-01-28,Matthew Flannery,1247.0,2021,7315,"Finally, we said we would fulfill our responsi...",The CEO is committing to repeating the success...,66065_2875419.0_01,,"['Earnings Protection', ' Financial Performanc...",Earnings Protection,Financial Performance,Investor Returns,Profitability,Unclustered,Return Performance and Targets


In [61]:
# what percet of rows have all three keyword1_labels, keyword2_labels, and keyword3_labels as Unclustered at the same time?
len(df[(df["keyword1_label"] == "Unclustered") & (df["keyword2_label"] == "Unclustered") & (df["keyword3_label"] == "Unclustered")]) / len(df)  


0.09912587648783387

In [65]:
# choose primary keyword from keyword1_label, keyword2_label, and keyword3_label; the first non-Unclustered keyword
def choose_primary_keyword(row):
    for col in ["keyword1_label", "keyword2_label", "keyword3_label"]:
        if row[col] != "Unclustered":
            return row[col]
    return "Unclustered"
df["primary_keyword"] = df.apply(choose_primary_keyword, axis=1)

In [66]:
output_filename = "promises_with_keywords_v3_labels.csv"
df.to_csv(output_filename, index=False)
print(f"Saved labeled file to {output_filename}")


Saved labeled file to promises_with_keywords_v3_labels.csv


# Editing manually some clusters

In [65]:
## Note: this was once done on v3, on the top 50 Unclustered keywords. Now we do it for the next 100 Unclustered keywords using the v4 db
labels = pd.read_csv("promises_with_keywords_v4_labels.csv")

In [66]:
# Optional: remove extra whitespace from column names
labels.columns = labels.columns.str.strip()

# Create separate DataFrames for each keyword-label pair
df_kw1 = labels[['keyword1', 'keyword1_label']].rename(columns={'keyword1': 'keyword', 'keyword1_label': 'label'})
df_kw2 = labels[['keyword2', 'keyword2_label']].rename(columns={'keyword2': 'keyword', 'keyword2_label': 'label'})
df_kw3 = labels[['keyword3', 'keyword3_label']].rename(columns={'keyword3': 'keyword', 'keyword3_label': 'label'})

# Concatenate into one long DataFrame and drop missing values
df_keywords = pd.concat([df_kw1, df_kw2, df_kw3], ignore_index=True)
df_keywords = df_keywords.dropna(subset=['keyword', 'label'])

# Count occurrences per label-keyword pair
keyword_counts = df_keywords.groupby(['label', 'keyword']).size().reset_index(name='count')

# Calculate overall label frequency
label_counts = df_keywords.groupby('label').size().reset_index(name='total_count')
# Order labels by overall frequency (descending)
label_counts = label_counts.sort_values('total_count', ascending=False)

# Sort keyword counts by label and count descending
keyword_counts_sorted = keyword_counts.sort_values(['label', 'count'], ascending=[True, False])

# For each label (ordered by overall frequency), get the top 10 keywords
result = {}
for _, row in label_counts.iterrows():
    current_label = row['label']
    total = row['total_count']
    top_keywords = keyword_counts_sorted[keyword_counts_sorted['label'] == current_label].head(50)
    result[current_label] = (total, top_keywords[['keyword']])


# Print the results for each label
for label, (total, df) in result.items():
    print(f"Label: {label} (Total Occurrences: {total})")
    for keyword in df['keyword']:
        print(keyword)
    print("\n")

Label: Unclustered (Total Occurrences: 79172)
Debt Reduction
 Financial Stability
 Growth Investment
 Profitable Growth
 Technology Implementation
Geographic Expansion
 Inventory Management
 Portfolio Optimization
 Service Enhancement
 Balance Sheet Improvement
 Organic Growth
Cost Control
Inventory Management
 Drug Development
Operational Excellence
 Capital Deployment
 Service Quality
Price Increases
 Retail Distribution
 Operational Milestone
Price Increase
 Sustainable Growth
 Market Penetration
Employee Safety
 International Operations
 Financial Strength
 Stock Buyback
 Emissions Reduction
 Innovation Pipeline
 Corporate Governance
 Customer Retention
 Strategic Focus
 Corporate Transparency
Capital Deployment
Shareholder Returns
 Property Development
 Retail Footprint
Brand Investment
 Market Presence
 Production Increase
Profitable Growth
Real Estate Development
Customer Support
Infrastructure Investment
 Cost Control
 Customer Satisfaction
Investment Strategy
Emissions Reducti

In [67]:
import pandas as pd
import time
from openai import OpenAI
from tqdm.notebook import tqdm


In [68]:

# Initialize the OpenAI client
client = OpenAI(
    api_key="...",
)

# Assuming 'result' is already defined with your clustered data
# Extract the top 50 unclustered keywords from your result
unclustered_label = "Unclustered"
unclustered_data = result[unclustered_label][1]
top_50_unclustered = unclustered_data.head(100)

# Function to format the cluster information for the prompt
def format_cluster_info():
    formatted_text = ""
    for label, (total, df) in result.items():
        if label != "Unclustered":  # Skip unclustered data
            formatted_text += f"\n\n\n Label: {label}\n Top Keywords: \n"
            keywords = df.head(10)['keyword'].tolist()
            for keyword in keywords:
                formatted_text += f"- {keyword}\n"
            formatted_text += "\n"
    return formatted_text


# Prepare the prompt template with cluster information
cluster_info = format_cluster_info()
prompt_template = f"""
I have used clustering and some labeling methods to assign some keywords to a label (a label for a cluster of labels). I have about 127 such identified clusters using the HBDSCAN algorithm. However, some keywords were left out as unclustered. Now, I want to manually check and see if the previously unclustered label would reasonable fit under any of the clusters.

You should return just one phrase: either the name of the label, or Unclustered (in case the keyword doesn't closely fit with any other cluster.) It is totally okay to return Unclustered. That's it. Don't return any of your thinking, reasoning, any extra words or punctuations.

Here's the keyword:
{{keyword}}

Here are the cluster labels with their top 10 keywords:

{cluster_info}
"""


In [69]:
format_cluster_info()

"\n\n\n Label: Expansion Plans\n Top Keywords: \n-  Market Expansion\n- Market Expansion\n- Store Expansion\n-  International Expansion\n-  Brand Extension\n-  Geographic Expansion\n- Product Expansion\n- International Expansion\n-  Retail Expansion\n-  Product Expansion\n\n\n\n\n Label: Business Growth\n Top Keywords: \n-  Revenue Growth\n- Revenue Growth\n-  International Growth\n-  Strategic Growth\n-  Retail Growth\n-  Market Growth\n-  Geographic Growth\n-  Business Growth\n-  Sales Growth\n- Growth Strategy\n\n\n\n\n Label: Launch Announcements\n Top Keywords: \n- Product Launch\n-  Product Launch\n- Product Release\n-  Service Launch\n- Brand Launch\n- Website Launch\n-  Product Launches\n-  Market Launch\n-  Program Launch\n-  Brand Launch\n\n\n\n\n Label: Efficiency\n Top Keywords: \n-  Operational Efficiency\n-  Operational Excellence\n- Operational Efficiency\n-  Operational Improvement\n-  Productivity Improvement\n- Cost Management\n-  Cost Efficiency\n-  Capital Efficienc

In [70]:
# Process each unclustered keyword
reclassification_results = {}
errors = []

# Use regular for loop instead of tqdm to avoid the IProgress error
# The error occurs because ipywidgets is not properly installed or configured
for _, row in top_50_unclustered.iterrows():
    keyword = row['keyword']
    print("working on keyword: ", keyword)
    try:
        # Format the prompt with the current keyword
        current_prompt = prompt_template.format(keyword=keyword)
        
        # Call the OpenAI API
        response = client.chat.completions.create(
            model="o3-mini-2025-01-31",
            reasoning_effort="high",
            seed=2025,
            messages=[{"role": "user", "content": current_prompt}]
        )
        
        # Extract the suggested label
        suggested_label = response.choices[0].message.content.strip()
        print("suggested label: ", suggested_label)
        
        # Store the result
        reclassification_results[keyword] = suggested_label
        
        # Add a short delay to avoid rate limiting
        time.sleep(0.5)
        
    except Exception as e:
        print(f"Error processing keyword '{keyword}': {str(e)}")
        errors.append({"keyword": keyword, "error": str(e)})
        time.sleep(1)  # Longer delay after an error


working on keyword:  Debt Reduction
suggested label:  Cost Reduction Initiatives
working on keyword:   Financial Stability
suggested label:  Financial and Operational Discipline
working on keyword:   Growth Investment
suggested label:  Investment
working on keyword:   Profitable Growth
suggested label:  Profitability
working on keyword:   Technology Implementation
suggested label:  Emerging and advanced technologies
working on keyword:  Geographic Expansion
suggested label:  Expansion Plans
working on keyword:   Inventory Management
suggested label:  Supply Chain Transformation and Resilience
working on keyword:   Portfolio Optimization
suggested label:  Portfolio Optimization
working on keyword:   Service Enhancement
suggested label:  Improvement Commitments
working on keyword:   Balance Sheet Improvement
suggested label:  Corporate Restructuring
working on keyword:   Organic Growth
suggested label:  Business Growth
working on keyword:  Cost Control
suggested label:  Efficiency
workin

In [71]:

# Convert results to DataFrame for easier analysis
reclassification_df = pd.DataFrame(list(reclassification_results.items()), 
                                 columns=['keyword', 'suggested_label'])


In [72]:
reclassification_df

Unnamed: 0,keyword,suggested_label
0,Debt Reduction,Cost Reduction Initiatives
1,Financial Stability,Financial and Operational Discipline
2,Growth Investment,Investment
3,Profitable Growth,Profitability
4,Technology Implementation,Emerging and advanced technologies
5,Geographic Expansion,Expansion Plans
6,Inventory Management,Supply Chain Transformation and Resilience
7,Portfolio Optimization,Portfolio Optimization
8,Service Enhancement,Improvement Commitments
9,Balance Sheet Improvement,Corporate Restructuring


In [73]:
labels

Unnamed: 0,transcriptid,companyname,gvkey,transcript_date,speaker_name,presentation_len,year,full_transcript_len,promise_verbatim,promise_explain,promise_id,promise_horizon_months,keywords,keyword1,keyword2,keyword3,keyword1_label,keyword2_label,keyword3_label,primary_keyword
0,45212.0,"Team, Inc.",10353,2010-01-06,Philip Hawk,166.0,2010,12012,We have always been and continue to be fully c...,The CEO asserts a continuous commitment to eth...,10353_45212.0_05,,"['Business Ethics', ' Corporate Integrity', ' ...",Business Ethics,Corporate Integrity,Stakeholder Transparency,Unclustered,Unclustered,Shareholder and Stakeholder Trust and Engagement,Shareholder and Stakeholder Trust and Engagement
1,45248.0,"Worthington Industries, Inc.",11600,2010-01-06,John McConnell,217.0,2010,3332,But I assure you we will remain vigilant in se...,The CEO is committing that the company will ma...,11600_45248.0_01,,"['Economic Monitoring', ' Risk Management', ' ...",Economic Monitoring,Risk Management,Contingency Planning,Unclustered,Risk Management,Recovery Strategies,Risk Management
2,45259.0,Bed Bath & Beyond Inc.,25338,2010-01-06,Steven Temares,819.0,2010,5320,We always look for ways to enhance our custome...,The CEO is affirming a long‐term commitment to...,25338_45259.0_01,,"['Customer Experience', ' Market Leadership', ...",Customer Experience,Market Leadership,International Expansion,Experience Enhancement,Leadership,Expansion Plans,Experience Enhancement
3,45259.0,Bed Bath & Beyond Inc.,25338,2010-01-06,Steven Temares,819.0,2010,5320,While we continue to review and prioritize our...,The CEO is committing to allocate necessary ca...,25338_45259.0_02,,"['Capital Investment', ' Store Expansion', ' I...",Capital Investment,Store Expansion,IT Enhancement,Investment,Expansion Plans,Unclustered,Investment
4,45297.0,"Acuity Brands, Inc.",146017,2010-01-06,Vernon Nagel,781.0,2010,12867,"However, as I have said before, we will defend...",The CEO is committing the company to actively ...,146017_45297.0_01,,"['Competitive Defense', ' Pricing Strategy', '...",Competitive Defense,Pricing Strategy,Market Position,Unclustered,Pricing and Rate Management,Market Positioning Strategy,Pricing and Rate Management
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74012,2875107.0,"United Rentals, Inc.",66065,2021-07-29,Matthew Flannery,1106.0,2021,6583,"This year, we've opened 19 new specialty branc...",The CEO is outlining a commitment to further e...,66065_2875107.0_01,5.0,"['Branch Expansion', ' Specialty Branches', ' ...",Branch Expansion,Specialty Branches,Growth Target,Expansion Plans,Unclustered,Long-term and multi-year targets,Expansion Plans
74013,2875128.0,"AMETEK, Inc.",1598,2021-08-03,David Zapico,1192.0,2021,7530,"Additionally, we're continuing to make key inv...","In this statement, the CEO commits to ongoing ...",1598_2875128.0_01,,"['R&D Investment', ' Organic Growth', ' Innova...",R&D Investment,Organic Growth,Innovation Pipeline,Innovation,Unclustered,Unclustered,Innovation
74014,2875370.0,Kennametal Inc.,6386,2021-05-04,Christopher Rossi,865.0,2021,6641,"For operational excellence, we continue to exe...",The CEO is communicating the company’s commitm...,6386_2875370.0_02,,"['Cost Reduction', ' Operational Efficiency', ...",Cost Reduction,Operational Efficiency,Simplification,Cost Reduction Initiatives,Efficiency,Unclustered,Cost Reduction Initiatives
74015,2875419.0,"United Rentals, Inc.",66065,2021-01-28,Matthew Flannery,1247.0,2021,7315,"Finally, we said we would fulfill our responsi...",The CEO is committing to repeating the success...,66065_2875419.0_01,,"['Earnings Protection', ' Financial Performanc...",Earnings Protection,Financial Performance,Investor Returns,Profitability,Profitability,Return Performance and Targets,Profitability


In [74]:

# Display summary of results

reclassification_df


Unnamed: 0,keyword,suggested_label
0,Debt Reduction,Cost Reduction Initiatives
1,Financial Stability,Financial and Operational Discipline
2,Growth Investment,Investment
3,Profitable Growth,Profitability
4,Technology Implementation,Emerging and advanced technologies
5,Geographic Expansion,Expansion Plans
6,Inventory Management,Supply Chain Transformation and Resilience
7,Portfolio Optimization,Portfolio Optimization
8,Service Enhancement,Improvement Commitments
9,Balance Sheet Improvement,Corporate Restructuring


In [75]:
# make sure these labels are in the labels dataframe

# Extract all unique labels from the three label columns in the labels dataframe
unique_labels_set = set()
for col in ['keyword1_label', 'keyword2_label', 'keyword3_label']:
    unique_labels_set.update(labels[col].dropna().unique())

# Get all unique suggested labels from the reclassification dataframe
suggested_labels_set = set(reclassification_df['suggested_label'].dropna().unique())

# Check if all suggested labels exist in the unique labels set
all_suggested_labels_exist = suggested_labels_set.issubset(unique_labels_set)

print(f"Total unique labels in labels dataframe: {len(unique_labels_set)}")
print(f"Total unique suggested labels: {len(suggested_labels_set)}")


Total unique labels in labels dataframe: 118
Total unique suggested labels: 29


In [76]:

if all_suggested_labels_exist:
    print("✓ All suggested labels exist in the keyword labels.")
else:
    # Find which suggested labels don't exist
    missing_labels = suggested_labels_set - unique_labels_set
    print(f"✗ Found {len(missing_labels)} suggested labels that don't exist in the keyword labels:")
    for label in missing_labels:
        print(f"  - '{label}'")
    
    # Create a dataframe showing the keywords with missing labels
    missing_keywords_df = reclassification_df[reclassification_df['suggested_label'].isin(missing_labels)]
    print(f"\nKeywords with missing labels ({len(missing_keywords_df)} rows):")
    display(missing_keywords_df.head(10))  # Show first 10 rows with missing labels
    
    if len(missing_keywords_df) > 10:
        print(f"...and {len(missing_keywords_df) - 10} more rows")

✓ All suggested labels exist in the keyword labels.


In [77]:
# First, create a dictionary from reclassification_df for faster lookups
reclassification_dict = dict(zip(reclassification_df['keyword'], reclassification_df['suggested_label']))

# Loop through each keyword column (keyword1, keyword2, keyword3)
for i in range(1, 4):
    keyword_col = f'keyword{i}'
    label_col = f'keyword{i}_label'
    
    # Find rows where the label is "Unclustered"
    unclustered_indices = labels[labels[label_col] == "Unclustered"].index
    
    # For each unclustered keyword
    for idx in unclustered_indices:
        keyword = labels.loc[idx, keyword_col]
        
        # Check if this keyword is in the reclassification dictionary
        if keyword in reclassification_dict:
            labels.loc[idx, label_col] = reclassification_dict[keyword]


In [78]:

# Display the first few rows to confirm changes
labels.sample(6)

Unnamed: 0,transcriptid,companyname,gvkey,transcript_date,speaker_name,presentation_len,year,full_transcript_len,promise_verbatim,promise_explain,promise_id,promise_horizon_months,keywords,keyword1,keyword2,keyword3,keyword1_label,keyword2_label,keyword3_label,primary_keyword
17486,534584.0,"M/I Homes, Inc.",12615,2013-10-24,Robert Schottenstein,1258.0,2013,7456,We have also announced we’re opening in Dallas...,The CEO is promising that the company will off...,12615_534584.0_02,,"['Market Expansion', ' New Location', ' Sales ...",Market Expansion,New Location,Sales Launch,Expansion Plans,New Facility and Location Openings,Launch Announcements,Expansion Plans
20975,653000.0,Rite Aid Corporation,9155,2014-06-19,John Standley,845.0,2014,6210,"Next week, we will hold the groundbreaking -- ...",The CEO commits to hosting a grand reopening e...,9155_653000.0_02,0.5,"['Store Reopening', ' Format Evolution', ' Sto...",Store Reopening,Format Evolution,Store Remodeling,Unclustered,Business Transformation,Property Renovation and Remodeling,Business Transformation
69047,2527342.0,Viatris Inc.,7637,2022-02-28,Michael Goettler,972.0,2022,12078,And as we continue to execute against our plan...,The CEO promises an organizational transformat...,7637_2527342.0_04,,"['Operational Efficiency', ' Organizational Si...",Operational Efficiency,Organizational Simplification,Business Focus,Efficiency,Unclustered,Focus Areas,Efficiency
73856,2708926.0,Oracle Corporation,12142,2022-12-12,Safra Catz,1722.0,2022,10428,"I know you're tired of me saying it, but I wil...","Here, the CEO reaffirms his dedication to deli...",12142_2708926.0_02,,"['Shareholder Value', ' Capital Allocation', '...",Shareholder Value,Capital Allocation,Corporate Strategy,Shareholder and Stakeholder Trust and Engagement,Financial and Operational Discipline,Corporate Strategic Initiatives,Shareholder and Stakeholder Trust and Engagement
25325,793245.0,Invacare Corporation,6158,2015-04-23,Matthew Monaghan,436.0,2015,5466,"Over the next 90 to 100 days, I will be gettin...",The CEO promises to spend the next 90 to 100 d...,6158_793245.0_02,3.5,"['Business Assessment', ' Regulatory Complianc...",Business Assessment,Regulatory Compliance,Financial Improvement,Unclustered,Regulatory and Legal Processes,Unclustered,Regulatory and Legal Processes
45631,1592019.0,Mastercard Incorporated,160225,2018-10-30,Ajay Banga,1931.0,2018,12259,"In Germany, for example, we will work with Pay...",The CEO commits to collaborating with PayPal i...,160225_1592019.0_07,,"['Digital Payments', ' PayPal Partnership', ' ...",Digital Payments,PayPal Partnership,Contactless Technology,Integrated Digital Payment Solutions,Partnerships,Integrated Digital Payment Solutions,Integrated Digital Payment Solutions


In [79]:
# choose primary keyword from keyword1_label, keyword2_label, and keyword3_label; the first non-Unclustered keyword
def choose_primary_keyword(row):
    for col in ["keyword1_label", "keyword2_label", "keyword3_label"]:
        if row[col] != "Unclustered":
            return row[col]
    return "Unclustered"
labels["primary_keyword"] = labels.apply(choose_primary_keyword, axis=1)

In [80]:
output_filename = "promises_with_keywords_v5_labels.csv"
labels.to_csv(output_filename, index=False)
print(f"Saved labeled file to {output_filename}")

Saved labeled file to promises_with_keywords_v5_labels.csv
