In [None]:
import pandas as pd
import numpy as np


In [None]:

# Preparing for visualization
import matplotlib.pyplot as plt
# Use a serif face for all figure text, with Times New Roman as the serif font.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
})

import seaborn as sns
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [None]:

# Load the promise-level table from a pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
promises = pd.read_pickle("../data/sxp1500_presentations_ceo_aggregated_promises_expanded.pkl")

# Some cleaning up of the data

In [None]:
# Build a readable identifier "<gvkey>_<transcriptid>_<NN>", where NN is the
# 1-based, zero-padded position of the row within its (gvkey, transcriptid) group.
promises['promise_id'] = (
    promises['gvkey'].astype(str)
    + '_'
    + promises['transcriptid'].astype(str)
    + '_'
    + (promises.groupby(['gvkey', 'transcriptid']).cumcount() + 1).map('{:02d}'.format)
)

In [None]:
# Strip the literal word 'months' and the literal '+' sign from the horizon text.
# regex=False is passed explicitly: '+' is a regex metacharacter and the default
# of `regex` differs between pandas versions, so relying on it is fragile.
promises['4-promise-horizon'] = (
    promises['4-promise-horizon']
    .str.replace('months', '', regex=False)
    .str.replace('+', '', regex=False)
)
# Any value mentioning 'unclear' (in any letter case) is collapsed to the single
# token 'unclear'; missing values are left untouched (na=False).
promises.loc[promises['4-promise-horizon'].str.contains('unclear', case=False, na=False), '4-promise-horizon'] = 'unclear'

def process_value(value):
    """Convert a cleaned promise-horizon entry into a number of months.

    Parameters
    ----------
    value : str, number or missing
        A single horizon entry such as ``'12'``, ``'6-12'`` or ``'unclear'``.

    Returns
    -------
    float
        * ``NaN`` for missing values, for ``'unclear'`` and for anything that
          cannot be parsed;
        * the midpoint for a two-sided range ``'a-b'``;
        * the value itself, as a float, for a plain number (string or numeric).
    """
    if pd.isna(value):
        return np.nan
    # Already-numeric entries have no text to scan; the substring test below
    # would raise TypeError on them, so convert them directly.
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool):
        return float(value)
    if value == 'unclear':
        return np.nan
    text = str(value)
    if '-' in text:
        try:
            # Exactly two parts are expected; anything else ('1-2-3', '-5')
            # fails the unpacking or the float conversion and becomes NaN.
            low, high = text.split('-')
            return (float(low) + float(high)) / 2
        except ValueError:
            return np.nan
    try:
        return float(text)
    except ValueError:
        return np.nan

# Numeric horizon in months, derived element-wise from the cleaned text column.
promises['promise_horizon_months'] = promises['4-promise-horizon'].map(process_value)


# Select promises to keep

In [None]:
# Keep rows that are flagged as a promise, are not financial guidance, and carry
# a strong commitment.
# NOTE(review): the commitment degree is matched case-sensitively, unlike the two
# yes/no flags, which accept both 'yes'/'Yes' and 'no'/'No' — confirm this is intended.
promises = promises[
    promises['7-is-promise'].isin(['yes', 'Yes'])
    & promises['8-financial-guidance'].isin(['no', 'No'])
    & promises['5-commitment-degree'].eq('strong-commitment')
]


# Select columns

In [None]:
# List the column names currently present in `promises`.
promises.columns

In [None]:
# Narrow the table to the identifying, length, text and horizon columns.
promises = promises.loc[:, [
    'transcriptid',
    'companyname',
    'gvkey',
    'mostimportantdateutc',
    'transcriptpersonname',
    'word_count',
    'year',
    'transcript_text_len',
    '1-promise-verbatim',
    '2-promise-explain',
    'promise_id',
    'promise_horizon_months',
]]

In [None]:
#rename columns to make more intuitive
promises = promises.rename(columns={'1-promise-verbatim': 'promise_verbatim',
                                    '2-promise-explain': 'promise_explain',
                                    'mostimportantdateutc': 'transcript_date',
                                    'transcriptpersonname': 'speaker_name',
                                    'transcript_text_len': 'full_transcript_len',
                                    'word_count': 'presentation_len'})

In [None]:
# Write the current `promises` table to CSV, omitting the row index.
promises.to_csv('../data/sxp1500_presentations_ceo_aggregated_promises_expanded_cleaned.csv', index=False)

In [None]:
# drop promise_verbatim and promise_explain columns
promises_notext = promises.drop(columns=['promise_verbatim', 'promise_explain'])
promises_notext.to_csv('../data/sxp1500_presentations_ceo_aggregated_promises_expanded_cleaned_notext.csv', index=False)

In [None]:
# Number of distinct promise identifiers in the table.
promises['promise_id'].nunique()

### Adding transcripts as baseline

In [None]:
# Load the transcript table from a pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
transcripts = pd.read_pickle("../data/sxp1500_presentations_ceo_aggregated.pkl")

In [None]:
# One row per transcript: when a transcriptid repeats, the first occurrence wins.
transcripts = transcripts.drop_duplicates(subset='transcriptid')

In [None]:
# Keep only the identifying columns of each transcript.
transcripts = transcripts.loc[:, [
    'transcriptid',
    'mostimportantdateutc',
    'gvkey',
    'companyname',
    'transcriptpersonname',
]]

In [None]:
# Rename the date and speaker columns to the same names used in the promise table.
transcripts = transcripts.rename(
    {
        'transcriptpersonname': 'speaker_name',
        'mostimportantdateutc': 'transcript_date',
    },
    axis='columns',
)

In [None]:
# Count the (non-missing) promise ids per transcript; the result has the columns
# 'transcriptid' and 'promise_id' (the latter holding the count).
promises_agg = promises.groupby('transcriptid')['promise_id'].count().reset_index()

In [None]:
# Left join: every transcript is kept, and transcripts that have no row in
# `promises_agg` get a missing count.
transcripts = transcripts.merge(promises_agg, how='left', on='transcriptid')

In [None]:
# Transcripts that have no row in `promises_agg` come out of the left merge with
# a missing count, and only that column should default to zero. Filling the whole
# frame would also overwrite missing dates, names and gvkeys with 0 (a 0 date
# later parses as 1970-01-01 and would create a spurious year group).
transcripts['promise_id'] = transcripts['promise_id'].fillna(0)

In [None]:
# Store the per-transcript promise count as an integer column.
transcripts = transcripts.assign(promise_count=transcripts['promise_id'].astype(int))

In [None]:
# The merged 'promise_id' column is superseded by 'promise_count'; remove it.
transcripts = transcripts.drop('promise_id', axis='columns')

In [None]:
# Grouping data by CEO and calculating the average number of promises per quarter
transcripts['transcript_date'] = pd.to_datetime(transcripts['transcript_date'])

transcripts['year'] = transcripts['transcript_date'].dt.to_period('Y')

In [None]:
# Write the current `transcripts` table to CSV, omitting the row index.
transcripts.to_csv('../data/sxp1500_presentations_ceo_aggregated_promises_expanded_cleaned_transcriptlevel.csv', index=False)

In [None]:
# Show the transcript-level table as the cell output.
transcripts

# Descriptives and graphs

In [None]:
# Independent copy of the promise table; changes to `data` do not affect `promises`.
# NOTE(review): `data` is not referenced again in the cells visible here — confirm it is used later.
data = promises.copy()

In [None]:
# Average number of promises per transcript in each year:
# total promise count in the year divided by the number of distinct transcripts.
# (Only this per-transcript average is computed here; no per-1000-words
# normalisation is performed in this cell.)
#
# The two aggregates are taken column-wise instead of via `groupby(...).apply`
# on the whole frame, which is deprecated in recent pandas because it also
# operates on the grouping column. The resulting Series is indexed by year.
promises_by_year = transcripts.groupby('year')
avg_promises_per_transcript = (
    promises_by_year['promise_count'].sum()
    / promises_by_year['transcriptid'].nunique()
)


# Y-axis limits for the bar chart: from 10% below the smallest yearly average
# to 10% above the largest one, so that differences between years are visible.
min_val_transcript, max_val_transcript = (
    avg_promises_per_transcript.min() * 0.9,
    avg_promises_per_transcript.max() * 1.1,
)


# Bar chart of the yearly average number of promises per transcript, drawn on an
# explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
avg_promises_per_transcript.plot(kind='bar', ax=ax)
ax.set_title('Average Number of Promises per Transcript by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Average Promises per Transcript')
# The y-axis is deliberately zoomed to the data range instead of starting at zero.
ax.set_ylim([min_val_transcript, max_val_transcript])

# Tighten the layout BEFORE saving; otherwise the exported PNG is written with
# the un-tightened layout and can differ from the figure shown below.
fig.tight_layout()
fig.savefig("../data/figures/number_promises_overyears_v10.png", dpi=900)

plt.show()

