In [None]:
import pandas as pd
import numpy as np
import os

# Broken Promises Count DB

In [None]:
# %% [markdown]
# # Load and prepare data (unchanged)

# %%
import pandas as pd
# Load the transcript-level CEO promises file.
promises = pd.read_csv("../data/sxp1500_presentations_ceo_aggregated_promises_expanded_cleaned_transcriptlevel_horizon_specificity.csv")

# promise_id is "<gvkey>_<transcriptid>_<NN>", where NN is the 1-based,
# zero-padded position of the promise within its (gvkey, transcriptid) group.
position_in_transcript = promises.groupby(['gvkey', 'transcriptid']).cumcount() + 1
promises['promise_id'] = (
    promises['gvkey'].astype(str)
    + '_' + promises['transcriptid'].astype(str)
    + '_' + position_in_transcript.map('{:02d}'.format)
)

# Keep only the columns used downstream, ordered by firm and event date.
selected_columns = [
    'gvkey', 'mostimportantdateutc', 'companyname', 'exec_fullname', 'execid',
    'promise_id', '1-promise-verbatim', '2-promise-explain',
    '3-promise-horizon-v2', 'specificity_score',
]
promises_select = promises[selected_columns].sort_values(by=['gvkey', 'mostimportantdateutc'])

# Keyword labels, keyed by promise_id.
labels = pd.read_csv("promises_with_keywords_v5_labels.csv")
def revert_promise_id(promise_id):
    """Strip float artefacts from an underscore-delimited promise id.

    Each '_'-separated piece that ends in '.0' is converted via
    float -> int -> str (e.g. '1234.0' -> '1234'); all other pieces are
    kept unchanged. The pieces are re-joined with '_'.
    """
    return '_'.join(
        str(int(float(piece))) if piece.endswith('.0') else piece
        for piece in promise_id.split('_')
    )

# Normalise ids that were round-tripped through floats (e.g. "1234.0_...").
labels['promise_id'] = labels['promise_id'].map(revert_promise_id)

# Only the id and its primary keyword are needed from the labels file.
labels = labels.loc[:, ['promise_id', 'primary_keyword']]

# Left-join promise attributes onto the labels, so every labelled promise is kept.
promises_select = labels.merge(promises_select, on='promise_id', how='left')

# %%
promises_select.head(5)


In [None]:
# Load the results file for the 10% promise sample; it is joined to the promise
# attributes on `promise_id` and supplies `status_code` in the cells below.
promises_select_10percent_results = pd.read_csv("promises_select_10percent_results.csv")


In [None]:
# Attach promise attributes (firm, executive, date, ...) to each result row.
# Left join: every result row is kept even if its promise_id has no match.
promises_select_10percent_results_merged = promises_select_10percent_results.merge(
    promises_select,
    on='promise_id',
    how='left',
)

# %%

promises_select_10percent_results_merged.head()


In [None]:
# Frequency of each status_code in the merged results.
promises_select_10percent_results_merged.status_code.value_counts()

In [None]:
# Build the per-executive, per-year broken-promise history table (`result`).
#
# Output columns, one row per (execid, year) that has at least one promise:
#   promises_in_year            promises made in that year
#   no_broken_promises          broken promises in all *prior* years
#   no_promises_prior           promises in all *prior* years
#   no_broken_promises_roll{w}  broken promises in the previous w calendar years
#   no_promises_prior_roll{w}   promises in the previous w calendar years
df = promises_select_10percent_results_merged.copy()

df['date'] = pd.to_datetime(df['mostimportantdateutc'], errors='coerce')
df['year'] = df['date'].dt.year
# A promise counts as broken when it was delayed or not delivered.
df['is_broken'] = df['status_code'].isin(['DELAYED', 'NOT_DELIVERED']).astype(int)

# per exec-year: broken count and total promises
# (groupby drops rows whose execid or year is missing)
by_year = (df.groupby(['execid', 'year'])
             .agg(broken_count=('is_broken', 'sum'),
                  promises_in_year=('execid', 'size'))
             .reset_index())
by_year = by_year.sort_values(['execid', 'year'])
# `year` is float when any date failed to parse; missing years were dropped by
# the groupby, so the cast is safe and keeps later merges on integer keys.
by_year['year'] = by_year['year'].astype(int)

# cumulative *prior* years: running total within each exec minus the current
# year. (A plain .shift(1) on the grouped cumsum shifts across exec boundaries,
# leaking the previous executive's lifetime total into the next one's first year.)
exec_groups = by_year.groupby('execid')
by_year['no_broken_promises'] = (
    exec_groups['broken_count'].cumsum() - by_year['broken_count']
).astype(int)
by_year['no_promises_prior'] = (
    exec_groups['promises_in_year'].cumsum() - by_year['promises_in_year']
).astype(int)

# --- Rolling windows over prior calendar years (2, 3, 5) ---
# We compute on a full year grid per execid so missing years contribute 0,
# then merge back to the original rows (execid, year).
ROLL_WINDOWS = (2, 3, 5)
roll_cols = ([f'no_broken_promises_roll{w}' for w in ROLL_WINDOWS]
             + [f'no_promises_prior_roll{w}' for w in ROLL_WINDOWS])

roll_list = []
for eid, g in by_year.groupby('execid', sort=False):
    # full calendar grid for this exec
    yr_min, yr_max = int(g['year'].min()), int(g['year'].max())
    idx = pd.RangeIndex(yr_min, yr_max + 1, 1, name='year')

    tmp = (g.set_index('year')[['broken_count', 'promises_in_year']]
             .reindex(idx, fill_value=0))

    # exclude current year using shift(1); window counts previous N years only.
    # Shifting here is safe because `tmp` holds a single executive.
    for w in ROLL_WINDOWS:
        tmp[f'no_broken_promises_roll{w}'] = (
            tmp['broken_count'].rolling(window=w, min_periods=1).sum()
            .shift(1, fill_value=0)
        ).astype(int)
        tmp[f'no_promises_prior_roll{w}'] = (
            tmp['promises_in_year'].rolling(window=w, min_periods=1).sum()
            .shift(1, fill_value=0)
        ).astype(int)

    tmp['execid'] = eid
    roll_list.append(tmp.reset_index()[['execid', 'year'] + roll_cols])

if roll_list:
    roll_df = pd.concat(roll_list, ignore_index=True)
else:
    # No exec-years at all: keep the key dtypes so the merge below still works.
    roll_df = by_year[['execid', 'year']].assign(**{c: 0 for c in roll_cols})

# merge rolling features back to the compact per-year table
result = (by_year.merge(roll_df, on=['execid', 'year'], how='left')
                 [['execid', 'year',
                   'no_broken_promises', 'no_promises_prior', 'promises_in_year']
                  + roll_cols])

# result is your final table

# result is your final table


# Execucomp

In [None]:
# CEO data: ExecuComp annual compensation rows flagged as CEO (ceoann='CEO').
# Set the flag to False to re-download from WRDS; otherwise the cached CSV is used.

previously_downloaded_execucomp = True
if not previously_downloaded_execucomp:
    # Imported here because the client is only needed when re-downloading.
    import wrds

    wrds_username = os.getenv('WRDS_USERNAME')
    wrds_password = os.getenv('WRDS_PASSWORD')
    db_wrds = wrds.Connection(wrds_username=wrds_username, wrds_password=wrds_password)

    query = """
            SELECT *
            FROM  comp_execucomp.anncomp
            WHERE ceoann='CEO'
        """

    try:
        execucomp = db_wrds.raw_sql(query)
    finally:
        # Release the connection even if the query fails.
        db_wrds.close()

    execucomp['gvkey'] = execucomp['gvkey'].astype(int)
    execucomp.to_csv('../data/execucomp.csv')

else:
    execucomp = pd.read_csv('../data/execucomp.csv')
    

In [None]:
# CEO duality flag: whether the title column contains "chmn" or "chairman"
# (case-insensitive). No `na=` is passed, so rows with a missing title stay missing.
execucomp['ceo_dual'] = execucomp['title'].str.contains('chmn|chairman', case=False)

In [None]:
# Copy `execdir` under a ceo_-prefixed name (presumably an "executive is a director" flag — confirm).
execucomp['ceo_director'] = execucomp['execdir']

In [None]:
# Ratio of `opt_unex_exer_est_val` to salary, and its natural log.
# np.log returns -inf for a zero ratio and NaN for a negative one; a zero salary gives inf/NaN.
execucomp['ceo_options_compensation'] = execucomp['opt_unex_exer_est_val']/execucomp['salary']
execucomp['ceo_options_compensation_log'] = np.log(execucomp['ceo_options_compensation'])

In [None]:
# Ratio of `option_awards_blk_value` to salary, and its natural log
# (same -inf / NaN caveats as any log of a ratio that can be zero or negative).
execucomp['ceo_option_awards'] = execucomp['option_awards_blk_value']/execucomp['salary']
execucomp['ceo_option_awards_log'] = np.log(execucomp['ceo_option_awards'])

In [None]:
# Copy `gender` under a ceo_-prefixed name; the dummy in the next cell reads this column.
execucomp['ceo_gender'] = execucomp['gender']

In [None]:
# Gender dummy: 1 if ceo_gender == 'MALE', 0 for any other recorded value.
# Rows with a missing gender stay missing instead of being silently coded 0.
ceo_gender_values = execucomp['ceo_gender']
execucomp['ceo_gender_dummy'] = (
    ceo_gender_values.eq('MALE').astype(float).where(ceo_gender_values.notna())
)

In [None]:
# Copy `age` under a ceo_-prefixed name.
execucomp['ceo_age'] = execucomp['age']

In [None]:
# Tenure = position of the year within a run of consecutive years for the same
# (gvkey, execid); a gap in the data starts a new run and resets the count to 1.
execucomp = execucomp.sort_values(by=['gvkey', 'execid', 'year'])

# A new run starts wherever the year-over-year difference is not exactly 1
# (this includes the first row of every firm-executive pair, where diff is NaN).
year_step = execucomp.groupby(['gvkey', 'execid'])['year'].diff()
execucomp = execucomp.assign(year_group=year_step.ne(1).cumsum())

# 1-based counter inside each run.
run_keys = ['gvkey', 'execid', 'year_group']
execucomp['ceo_tenure'] = execucomp.groupby(run_keys).cumcount() + 1

# The run identifier was only a helper.
execucomp = execucomp.drop(columns=['year_group'])

In [None]:
# Level variables and their natural logs. A missing component makes the sum
# missing; np.log gives -inf for 0 and NaN for negative values.
ceo_level_variables = {
    'ceo_total_compensation':
        execucomp['salary'] + execucomp['bonus'] + execucomp['othcomp'],
    'ceo_total_deferred_compensation':
        execucomp['defer_balance_tot'] + execucomp['defer_contrib_co_tot'] + execucomp['defer_contrib_exec_tot'],
    'ceo_total_shares_owned':
        execucomp['shrown_tot'],
}
for level_name, level_values in ceo_level_variables.items():
    execucomp[level_name] = level_values
    execucomp[f'{level_name}_log'] = np.log(execucomp[level_name])



# Compustat Quarterly

In [None]:
# Compustat quarterly fundamentals (fundq), fiscal years 2003 onward.
# Set the flag to False to re-download from WRDS; otherwise the cached pickle is used.
previously_downloaded_compustat_quarterly = True
if not previously_downloaded_compustat_quarterly:
    # Imported here because the client is only needed when re-downloading.
    import wrds

    # Read credentials here so this cell does not depend on another download
    # branch having run first.
    db = wrds.Connection(wrds_username=os.getenv('WRDS_USERNAME'),
                         wrds_password=os.getenv('WRDS_PASSWORD'))
    # Each column is listed once: repeating actq/atq in the SELECT would yield
    # duplicate column labels and break the ratio computations downstream.
    query = """
    SELECT gvkey, datadate, indfmt, consol, popsrc, datafmt, fyr, actq, atq, ibq, niq, niy, epsfi12, oeps12, epsfxy, mkvaltq, prccq, prchq, prclq, saleq, cshoq, lctq, xoprq, xrdq, intanq, txdbq, dpq, aqpq, dlttq, dlcq, seqq
    FROM comp_na_daily_all.fundq
    WHERE fyearq >= 2003
    """
    try:
        compustat = db.raw_sql(query)
    finally:
        # Release the connection even if the query fails.
        db.close()
    compustat.to_pickle('../data/compustat_q.pkl')

else:
    # Load the cached quarterly data from the pickle file
    compustat = pd.read_pickle('../data/compustat_q.pkl')


In [None]:

# Standard Compustat screen: standardised format, domestic population source,
# consolidated statements, industrial format.
standard_rows = (
    (compustat.datafmt == 'STD')
    & (compustat.popsrc == 'D')
    & (compustat.consol == 'C')
    & (compustat.indfmt == 'INDL')
)
# Explicit copy so the column assignments below modify an independent frame
# rather than a slice of the unfiltered data (avoids SettingWithCopyWarning).
compustat = compustat[standard_rows].copy()

compustat['datadate'] = pd.to_datetime(compustat['datadate'], format='%Y%m%d')
compustat['month'] = compustat['datadate'].dt.month
compustat['year'] = compustat['datadate'].dt.year
# One row per firm-quarter end date (first occurrence wins).
compustat = compustat.drop_duplicates(['gvkey', 'datadate'])
compustat['gvkey'] = compustat['gvkey'].astype(int)
compustat



In [None]:
from scipy.stats.mstats import winsorize


def _winsorize_ignoring_nan(values, limits=(0.025, 0.025)):
    """Winsorize a numeric Series on its non-missing values only.

    scipy's `winsorize` (default nan_policy='propagate') sorts NaN as the
    largest values, so NaNs either get overwritten with the upper cut-off or
    consume the upper tail and leave real extremes unclipped. Here the cut-offs
    are computed and applied on observed values only; NaNs are returned as NaN.

    Parameters
    ----------
    values : pd.Series of numeric data.
    limits : (lower, upper) fractions to clip from each tail.

    Returns
    -------
    pd.Series of floats with the same index as `values`.
    """
    clipped = values.astype(float).copy()
    observed = clipped.notna()
    if observed.any():
        clipped.loc[observed] = np.asarray(
            winsorize(clipped[observed].to_numpy(), limits=list(limits))
        )
    return clipped


# EPS: quarterly net income per share outstanding
compustat['EPS'] = np.where(compustat['cshoq'] != 0, compustat['niq'] / compustat['cshoq'], np.nan)

# roa
compustat['roa'] = np.where(compustat['atq'] != 0, compustat['niq'] / compustat['atq'], np.nan)

# roe
# NOTE(review): the denominator is prccq * cshoq (market value of equity, see
# `mve` below), not book equity (seqq) — confirm this is the intended definition.
compustat['roe'] = np.where((compustat['prccq']* compustat['cshoq']) != 0, compustat['niq'] / (compustat['prccq']* compustat['cshoq']), np.nan)


# r&d share of total expense (missing or undefined ratios are set to 0)
compustat['rd_f'] = compustat['xrdq'] / compustat['xoprq'].where(compustat['xoprq'] != 0, np.nan)
compustat['rd_f'] = compustat['rd_f'].fillna(0)

# recognized intangible assets as part of total assets
compustat['intang_f'] = compustat['intanq'] / compustat['atq'].where(compustat['atq'] != 0, np.nan)
compustat['intang_f'] = compustat['intang_f'].fillna(0)

# depreciation as part of total assets
compustat['dpt_f'] = compustat['dpq'] / compustat['atq'].where(compustat['atq'] != 0, np.nan)
compustat['dpt_f'] = compustat['dpt_f'].fillna(0)

# Total merger
compustat['mergers'] = compustat['aqpq'].fillna(0)

# Leverage: (long-term debt + debt in current liabilities) / shareholders' equity,
# undefined when equity is zero
compustat['leverage'] = (compustat['dlttq'] + compustat['dlcq']) / compustat['seqq']
compustat.loc[compustat['seqq'] == 0, 'leverage'] = np.nan



# 2.1: Cash flow (CF) = ibq + dpq
compustat['cf'] = compustat['ibq'] + compustat['dpq']


# 2.2: Market Value of Equity (MVE) = cshoq * prccq
compustat['mve'] = compustat['cshoq'] * compustat['prccq']

# 2.3: Book Debt
#      A common approximation is atq - seqq - txdbq.
compustat['book_debt'] = compustat['atq'] - compustat['seqq']
compustat['book_debt'] = compustat['book_debt'] - compustat['txdbq'].fillna(0)

# 2.4: Tobin's Q = (MVE + Book Debt) / atq
compustat['tobin_q'] = (compustat['mve'] + compustat['book_debt']) / compustat['atq']

# earnings_volatility: rolling 6-quarter std of (un-winsorized) roa within firm
compustat.sort_values(['gvkey', 'datadate'], inplace=True)
compustat['earnings_volatility'] = compustat.groupby(['gvkey'])['roa'].transform(lambda x: x.rolling(6).std())

# Winsorize at 2.5% in each tail (5% in total), ignoring missing values.
WINSOR_LIMITS = (0.025, 0.025)
winsorized_columns = ['EPS', 'roa', 'roe', 'rd_f', 'intang_f', 'dpt_f',
                      'mergers', 'leverage', 'earnings_volatility', 'tobin_q']
for ratio_name in winsorized_columns:
    compustat[ratio_name] = _winsorize_ignoring_nan(compustat[ratio_name], WINSOR_LIMITS)


# Replace NaN values with 0 (applies to every column, including ratios that
# were deliberately set to NaN above)
compustat = compustat.fillna(0)
compustat['roe'].describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99])


# Compustat Annual

In [None]:
# Compustat annual fundamentals (funda), fiscal years 2000 onward.
# Set the flag to False to re-download from WRDS; otherwise the cached pickle is used.
previously_downloaded_compustat_annual = True

if not previously_downloaded_compustat_annual:
    # Imported here because the client is only needed when re-downloading.
    import wrds

    # Read credentials here so this cell does not depend on another download
    # branch having run first.
    db = wrds.Connection(wrds_username=os.getenv('WRDS_USERNAME'),
                         wrds_password=os.getenv('WRDS_PASSWORD'))
    query = """
    SELECT gvkey, datadate, datafmt, popsrc, consol, indfmt, cusip, cik,  sich,  naicsh, sale, ppent, emp, xrd, xad
    FROM comp_na_daily_all.funda
    WHERE fyear >= 2000
    """
    try:
        compustata = db.raw_sql(query)
    finally:
        # Release the connection even if the query fails.
        db.close()
    compustata.to_pickle('../data/compustat_a.pkl')

else:
    # Load the cached annual data from the pickle file
    compustata = pd.read_pickle('../data/compustat_a.pkl')

# Standard Compustat screen, then one row per firm / fiscal-year-end date.
standard_rows = (
    (compustata.datafmt == 'STD')
    & (compustata.popsrc == 'D')
    & (compustata.consol == 'C')
    & (compustata.indfmt == 'INDL')
)
# Explicit copy so the assignments below do not write into a slice.
compustata = compustata[standard_rows].drop_duplicates(['gvkey', 'datadate']).copy()

compustata['datadate'] = pd.to_datetime(compustata['datadate'], format='%Y%m%d')

compustata['month'] = compustata['datadate'].dt.month
compustata['year'] = compustata['datadate'].dt.year

# Keep only rows with a historical SIC code.
compustata['sich'] = compustata['sich'].replace('', np.nan)
compustata = compustata.loc[compustata['sich'].notna()].copy()

# SIC codes are 4-digit with possible *leading* zeros (e.g. 0100). Stored as
# integers the leading zero is simply dropped, so the integer value is already
# the correct 4-digit code; padding on the right would turn 100 into 1000 and
# move the firm to a different industry.
compustata['sich4'] = compustata['sich'].astype(int)
compustata['sich3'] = compustata['sich4'] // 10
compustata['sich2'] = compustata['sich4'] // 100

# Uniqueness Var

In [None]:
# Number of business/operating segments per firm-year, merged onto compustata.
# Set the flag to False to re-download from WRDS; otherwise the cached CSV is used.
previously_downloaded_segments = True
if not previously_downloaded_segments:
    # Imported here because the client is only needed when re-downloading.
    import wrds

    wrds_username = os.getenv('WRDS_USERNAME')
    wrds_password = os.getenv('WRDS_PASSWORD')
    db = wrds.Connection(wrds_username=wrds_username, wrds_password=wrds_password)

    query = """
        SELECT gvkey, srcdate, datadate, sid, sales, stype, sics1
        FROM comp_segments_hist_daily.wrds_segmerged
    """

    try:
        segments = db.raw_sql(query)
    finally:
        # Release the connection even if the query fails.
        db.close()
    segments.to_csv('../data/compustat_segments.csv')

else:
    segments = pd.read_csv('../data/compustat_segments.csv')

# No de-duplication is needed here: segments are counted with nunique(sid),
# which is insensitive to repeated rows. (The earlier drop_duplicates calls
# discarded their result and therefore never had any effect.)
segments = segments[['gvkey', 'datadate', 'sid', 'sales', 'stype']].copy()
segments['gvkey'] = segments['gvkey'].astype(int)

is_business_segment = (segments['stype'] == 'BUSSEG') | (segments['stype'] == 'OPSEG')
segments_n = (segments[is_business_segment]
              .groupby(['gvkey', 'datadate'])['sid']
              .nunique()
              .reset_index()
              .rename(columns={'sid': 'n_segments'}))

# Calendar year is taken from the first four characters of datadate.
# NOTE(review): a firm with two datadates in one calendar year yields two rows
# here and duplicates the matching annual row in the merge below — confirm that
# the later drop_duplicates on (gvkey, datadate) is the intended resolution.
segments_n['year'] = segments_n['datadate'].astype(str).str.slice(0,4).astype(int)
segments_n = segments_n[['gvkey', 'year', 'n_segments']]

compustata['gvkey'] = compustata['gvkey'].astype(int)
compustata = compustata.merge(segments_n, on=['gvkey', 'year'], how='left',
                                  suffixes=('_df1', ''))
# Firms without segment data are treated as single-segment. Assign the result
# back: fillna(inplace=True) on a column selection may act on a temporary copy.
compustata['n_segments'] = compustata['n_segments'].fillna(1)
## Strategy Uniqueness using Litov et al. (2012) method
# Prepare one row per (GVKEY, year, segment_sic) with total segment sales.
seg_ind = 'sics1'

segments = pd.read_csv('../data/compustat_segments.csv')
segments['year'] = segments['datadate'].str.slice(0,4).astype(int)

# Business / operating segments only, restricted to the columns needed here.
segment_types = ['BUSSEG', 'OPSEG']
segments = segments[segments['stype'].isin(segment_types)]
segments = segments[['gvkey', 'year', 'sales', seg_ind ]]

# Require positive sales and a usable (non-missing, non-empty, non-zero) SIC code.
usable_rows = (
    (segments['sales'] > 0)
    & segments[seg_ind].notnull()
    & (segments[seg_ind] != '')
    & (segments[seg_ind] != 0)
)
segments = segments[usable_rows]

# Integer keys, then the column names used by the steps below.
segments = segments.astype({seg_ind: int, 'year': int, 'gvkey': int})
segments = segments.rename(columns={
    'gvkey': 'GVKEY',
    seg_ind: 'segment_sic',
    'sales': 'segment_sale',
})

# Collapse multiple segments that share the same SIC within a firm-year.
segment_keys = ['GVKEY', 'year', 'segment_sic']
segments = segments.groupby(segment_keys)['segment_sale'].sum().reset_index(name='segment_sale')
segments['segment_sic'] = segments['segment_sic'].astype(int)

# Strategy uniqueness: squared distance between a firm's segment sales mix and
# the sales mix of all firms sharing its primary SIC in the same year.

# Step 1: primary SIC = SIC of the segment with the largest sales in the firm-year.
# The first assignment fills only the idxmax rows (index alignment leaves the
# others NaN); transform('max') then spreads that value to every row of the group.
idx = segments.groupby(['GVKEY', 'year'])['segment_sale'].idxmax()
segments['primary_sic'] = segments.loc[idx, 'segment_sic']
segments['primary_sic'] = segments.groupby(['GVKEY', 'year'])['primary_sic'].transform('max')

# Step 2: each segment's share of the firm-year's total segment sales.
total_sales = segments.groupby(['GVKEY', 'year'])['segment_sale'].transform('sum')
segments['norm_sale'] = segments['segment_sale'] / total_sales

# Step 3: firm-year x segment-SIC matrix of sales shares (0 where the firm has no such segment).
firm_year_matrix = segments.pivot_table(index=['GVKEY', 'year', 'primary_sic'],
                                        columns='segment_sic',
                                        values='norm_sale').fillna(0)

# Step 4: same layout with sales in levels, summed over firms within primary SIC and year.
actual_sales_matrix = segments.pivot_table(index=['GVKEY', 'year', 'primary_sic'],
                                           columns='segment_sic',
                                           values='segment_sale').fillna(0)

industry_year_sales = actual_sales_matrix.groupby(['primary_sic', 'year']).sum()

# Step 5: convert industry-year sales to shares (each row divided by its row total).
total_industry_sales = industry_year_sales.sum(axis=1)
norm_industry_year_sales = industry_year_sales.div(total_industry_sales, axis=0)

# Step 6: firm mix minus industry mix.
# NOTE(review): the two frames have different MultiIndexes ((GVKEY, year, primary_sic)
# vs (primary_sic, year)); this relies on pandas aligning them on the shared level
# names — verify the alignment on a small sample.
diff_matrix = firm_year_matrix.subtract(norm_industry_year_sales, axis=1)

# step 7: sum of squared differences
squared_diff_matrix = diff_matrix ** 2
sum_squared_diff = squared_diff_matrix.sum(axis=1)

# One row per (gvkey, year, primary_sic) with the uniqueness score; keep the
# first row if a firm-year appears more than once.
uniqueness = sum_squared_diff.reset_index(name='strategy_unique')
uniqueness = uniqueness.rename(columns={'GVKEY': 'gvkey'})
uniqueness.drop_duplicates(['gvkey', 'year'], inplace=True)


# BoardEx


In [None]:
# Load the BoardEx organization summary (analytics) file from Stata format.
boardex = pd.read_stata("../data/NA - BoardEx - Organization Summary - Analytics.dta")

In [None]:
# Inspect the available BoardEx columns before selecting a subset.
boardex.columns

In [None]:
# Board-level variables kept for the analysis, keyed by company and report date.
boardex_keep_columns = [
    'CompanyID', 'AnnualReportDate',
    'Succession', 'Attrition', 'GenderRatio', 'NationalityMix', 'NumberDirectors',
    'STDEVTimeBrd', 'STDEVTimeInCo', 'STDEVTotNoLstdBrd', 'STDEVTotCurrNoLstdBrd',
    'STDEVNoQuals', 'STDEVAge', 'NetworkSize',
]

# One row per (CompanyID, AnnualReportDate); the first occurrence is kept.
boardex = boardex[boardex_keep_columns].drop_duplicates(['CompanyID', 'AnnualReportDate'])


In [None]:
# BoardEx CompanyID -> Compustat gvkey link table.
boardex_map = pd.read_stata("../data/NA - BoardEx_linking_map.dta")

# Keep preferred links only, reduce to the two identifier columns, de-duplicate,
# and rename to match the column names used in `boardex` and Compustat.
is_preferred_link = boardex_map['preferred'] == 1
boardex_map = (
    boardex_map.loc[is_preferred_link, ['companyid', 'GVKEY']]
    .drop_duplicates(['companyid', 'GVKEY'])
    .rename(columns={'companyid': 'CompanyID', 'GVKEY': 'gvkey'})
)



In [None]:
# Attach gvkey to every BoardEx row via CompanyID (left join keeps unlinked rows).
boardex = boardex.merge(boardex_map, on='CompanyID', how='left')


In [None]:
# Keep only BoardEx rows that were linked to a gvkey by the merge above.
boardex = boardex[boardex['gvkey'].notna()]

In [None]:
# Convert AnnualReportDate to datetime and derive the calendar year of the report;
# year_b is the BoardEx-side key for the merge with Compustat.
boardex['AnnualReportDate'] = pd.to_datetime(boardex['AnnualReportDate'])

boardex['year_b'] = boardex['AnnualReportDate'].dt.year


In [None]:
# Keep one BoardEx row per (gvkey, year_b); the first occurrence wins when a firm
# has several reports (or several linked CompanyIDs) in the same year.
boardex = boardex.drop_duplicates(['gvkey', 'year_b'])


# Merges

## Merge Compustata with uniqueness

In [None]:
# Cast gvkey to int on both sides so the merge keys have the same dtype, then
# left-join the strategy uniqueness score onto the annual data by firm-year.
uniqueness['gvkey'] = uniqueness['gvkey'].astype(int)
compustata['gvkey'] = compustata['gvkey'].astype(int)
compustata = compustata.merge(uniqueness, on=['gvkey', 'year'], how='left')

## Merge compustat with compustata

In [None]:
# Ensure gvkey is an integer before merging with the quarterly data.
compustata['gvkey'] = compustata['gvkey'].astype(int)

In [None]:
# Inner-join annual and quarterly Compustat rows that share gvkey and datadate.
# Overlapping column names get the suffixes _a (annual) and _q (quarterly).
compustata_merged = compustata.merge(compustat, on=['gvkey', 'datadate'], how='inner', suffixes=('_a', '_q'))

In [None]:
# Keep one row per (gvkey, datadate); the first occurrence wins.
compustata_merged.drop_duplicates(['gvkey','datadate'], inplace=True)


In [None]:
# Inspect column names after the merge (shows which columns received _a / _q suffixes).
compustata_merged.columns

In [None]:
# Fiscal year: year_a (calendar year of datadate) when fyr is 6 or larger,
# otherwise year_a - 1.
compustata_merged['fyear'] = np.where(compustata_merged['fyr'] >= 6, compustata_merged['year_a'], compustata_merged['year_a'] - 1)

## Merge boardex and Compustata

In [None]:
# Left-join BoardEx onto compustata_merged by firm and calendar year
# (year_b for boardex, year_a for compustata_merged). Keys are cast to int first
# so both sides of the merge have matching dtypes.
boardex['year_b'] = boardex['year_b'].astype(int)
boardex['gvkey'] = boardex['gvkey'].astype(int)
compustata_merged = compustata_merged.merge(boardex, left_on=['gvkey', 'year_a'], right_on=['gvkey', 'year_b'], how='left')


## Merge execucomp with compustata_merged

In [None]:
# Inner-join execucomp with compustata_merged on gvkey and execucomp year = fyear.
# Overlapping column names get the suffixes _e (execucomp) and _c (compustat).
execucomp_merged = execucomp.merge(compustata_merged, left_on=['gvkey', 'year'], right_on=['gvkey', 'fyear'], how='inner', suffixes=('_e', '_c'))

## Merge ceo_dismissal with execucomp_merged

In [None]:
# Load the CEO dismissal database (Excel file dated 9 Nov 2023) from ../data.

dismissal_data = pd.read_excel("../data/CEO Dismissal Database Posted to Web 9Nov23.xlsx")

In [None]:
# Left-join dismissal_data onto execucomp_merged: [gvkey, year, co_per_rol] on the
# left matches [gvkey, fyear, co_per_rol] on the right. Every CEO-year is kept;
# overlapping column names get the suffixes _e (left) and _d (dismissal data).
ceo_dismissal = execucomp_merged.merge(dismissal_data, left_on=['gvkey', 'year', 'co_per_rol'], right_on=['gvkey', 'fyear', 'co_per_rol'], how='left', suffixes=('_e', '_d'))


In [None]:
# involuntary_dismissal is True when departure_code is 3 or 4 and False otherwise
# (including CEO-years with no matching dismissal record, where the code is missing).
ceo_dismissal['involuntary_dismissal'] = ceo_dismissal['departure_code'].isin([3, 4])



In [None]:
# Count of involuntary vs. other CEO-years in the full merged sample.
ceo_dismissal['involuntary_dismissal'].value_counts()

In [None]:
# Left-join the promise-history features (`result`) onto ceo_dismissal by execid and year.
# CEO-years without a matching row in `result` get NaN in all promise columns.
ceo_dismissal = ceo_dismissal.merge(result, on=['execid', 'year'], how='left')


In [None]:
# Limit to executives that appear in `result` (i.e. have at least one promise).
# Explicit copy: later cells assign columns on this frame, and writing into a
# boolean-mask slice triggers SettingWithCopyWarning.
ceo_dismissal_results = ceo_dismissal[ceo_dismissal['execid'].isin(result['execid'])].copy()

In [None]:
# Number of distinct executives in the promise-history table.
result['execid'].nunique()

In [None]:
# Number of those executives that survive the merges into the regression sample.
ceo_dismissal_results['execid'].nunique()

In [None]:
# Replace no_broken_promises with 0 if missing (CEO-years with no matching row in `result`).
ceo_dismissal_results['no_broken_promises'] = ceo_dismissal_results['no_broken_promises'].fillna(0)



In [None]:
# Summary statistics of the prior broken-promise count after filling missing values.
ceo_dismissal_results['no_broken_promises'].describe()

In [None]:
# Replace no_promises_prior with 0 if missing (same treatment as no_broken_promises).
# NOTE(review): the *_roll2/3/5 columns merged from `result` are not filled and
# remain NaN for CEO-years without a match — confirm whether that is intended.

ceo_dismissal_results['no_promises_prior'] = ceo_dismissal_results['no_promises_prior'].fillna(0)



In [None]:
# Summary statistics of the prior promise count after filling missing values.
ceo_dismissal_results['no_promises_prior'].describe()

In [None]:
# Involuntary-dismissal counts within the promise-executive subsample.
ceo_dismissal_results['involuntary_dismissal'].value_counts()

In [None]:
# Show every row whose (gvkey, year) occurs more than once (keep=False marks all copies).
ceo_dismissal_results[ceo_dismissal_results.duplicated(subset=['gvkey', 'year'], keep=False)]

In [None]:
# Keep one row per (gvkey, year); the first row in the current order is retained.
ceo_dismissal_results = ceo_dismissal_results.drop_duplicates(subset=['gvkey', 'year'], keep='first')


In [None]:
# Full list of columns in the regression sample (as a list, so none are elided in the output).
list(ceo_dismissal_results.columns)

In [None]:
# Save to Stata: clean infinities, stringify object columns, and make column
# names Stata-compatible and unique before writing.
ceo_dismissal_results_clean = ceo_dismissal_results.copy()

# Replace infinity values with NaN for all numeric columns
numeric_cols = ceo_dismissal_results_clean.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    ceo_dismissal_results_clean[col] = ceo_dismissal_results_clean[col].replace([np.inf, -np.inf], np.nan)

# Handle object columns that might cause issues
MAX_STATA_STR_LEN = 2000  # Stata fixed-width strings max out at 2045 chars; leave some buffer
object_cols = ceo_dismissal_results_clean.select_dtypes(include=['object']).columns
for col in object_cols:
    # Drop columns that carry no information; convert the rest to plain strings
    if ceo_dismissal_results_clean[col].isna().all():
        ceo_dismissal_results_clean = ceo_dismissal_results_clean.drop(columns=[col])
        print(f"Dropped column {col} - all null values")
    else:
        # Convert to string and blank out the textual forms of missing values
        text_values = ceo_dismissal_results_clean[col].astype(str)
        text_values = text_values.replace('nan', '')
        text_values = text_values.replace('None', '')
        # Strip non-ASCII characters that cause encoding issues
        text_values = text_values.str.encode('ascii', errors='ignore').str.decode('ascii')
        # Truncate strings that are too long for Stata
        ceo_dismissal_results_clean[col] = text_values.str[:MAX_STATA_STR_LEN]

# Fix column names for Stata compatibility (max 32 chars, alphanumeric +
# underscore only, not starting with a digit) and keep them unique: two
# different source names can collapse to the same name after sanitising or
# truncation, which would otherwise create duplicate columns.
MAX_STATA_NAME_LEN = 32
column_mapping = {}
used_names = set()
for col in ceo_dismissal_results_clean.columns:
    # Replace spaces and special characters
    new_col = ''.join(c if c.isalnum() or c == '_' else '_' for c in str(col))
    # Ensure it is non-empty and doesn't start with a number
    if not new_col or new_col[0].isdigit():
        new_col = 'var_' + new_col
    # Truncate to the Stata limit
    new_col = new_col[:MAX_STATA_NAME_LEN]
    # Disambiguate collisions with a numeric suffix, staying within the limit
    base_name, collision_no = new_col, 1
    while new_col in used_names:
        suffix = f'_{collision_no}'
        new_col = base_name[:MAX_STATA_NAME_LEN - len(suffix)] + suffix
        collision_no += 1
    used_names.add(new_col)
    column_mapping[col] = new_col

ceo_dismissal_results_clean = ceo_dismissal_results_clean.rename(columns=column_mapping)

# Save to stata with version 117 (Stata 13+) to handle longer strings.
# NOTE(review): inputs are read from '../data/' but the output goes to
# '../../data/' — confirm the target directory is intentional.
try:
    ceo_dismissal_results_clean.to_stata('../../data/ceo_dismissal_reg_data.dta', 
                                          write_index=False,
                                          version=117)
    print("Successfully saved to Stata format (version 117)")
except Exception as e:
    # Deliberate best-effort fallback: report the problem and keep the data as CSV
    print(f"Error saving to Stata: {e}")
    ceo_dismissal_results_clean.to_csv('../../data/ceo_dismissal_reg_data.csv', index=False)
    print("Saved as CSV instead")

In [None]:
# Number of unique firms (gvkey) in the exported sample.
# NOTE(review): the original comment said "unique execid" while the code counts
# gvkey — confirm which one was meant.
ceo_dismissal_results_clean['gvkey'].nunique()

# Some Random Tests

In [None]:
# keep if the no_promises_prior_roll5 is above 50
# NOTE(review): `ceo_ranking_merged` is not defined anywhere in the visible
# notebook, so this line raises NameError on a fresh Restart & Run All —
# confirm which frame was intended.
batch_all_50 = ceo_ranking_merged[ceo_ranking_merged['no_promises_prior_roll5'] > 50]
# All sampled promises for the firm with gvkey 25434 (the year column mentioned
# in the original note is not created here).

high_promises = promises_select_10percent_results_merged[promises_select_10percent_results_merged['gvkey'] == 25434]

#