In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load data from parquet file
df_tmp = pd.read_parquet('./data/Results/df_train_new.parquet')

In [6]:
ratio_chars = ['CAPEI', 'bm',
       'evm', 'pe_exi', 'pe_inc', 'ps', 'pcf',
       'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe',
       'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity',
       'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap',
       'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt',
       'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at',
       'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct',
       'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets',
       'debt_capital', 'de_ratio', 'intcov', 'intcov_ratio', 'cash_ratio',
       'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn',
       'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc',
       'rd_sale', 'adv_sale', 'staff_sale', 'accrual', 'ptb', 'PEG_trailing',
       'divyield']

per_share_chars = ['dividend_p','BE_p','Liability_p','cur_liability_p','LT_debt_p',
                  'cash_p', 'total_asset_p', 'tot_debt_p', 'accrual_p', 'EBIT_p', 
                   'cur_asset_p', 'pbda_p', 'ocf_p', 'inventory_p', 'receivables_p',
                   'Cur_debt_p', 'interest_p', 'fcf_ocf_p', 'evm_p',
                   'sales_p', 'invcap_p', 'c_equity_p', 'rd_p', 'opmad_p', 'gpm_p','ptpm_p'
                  ]

macro_chars = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']

fundamental_chars = ['ret', 'prc',
                    'EPS_true_l1_q1','EPS_true_l1_q2','EPS_true_l1_q3',
                    'EPS_true_l1_y1','EPS_true_l1_y2',
                    ]

analyst_chars = ['EPS_ana_q1','EPS_ana_q2','EPS_ana_q3','EPS_ana_y1','EPS_ana_y2']

targets = ['EPS_true_q1', 'EPS_true_q2', 'EPS_true_q3', 'EPS_true_y1', 'EPS_true_y2']

In [7]:
# List to store reorganized data
model_data_all = []

# Define the predictor columns and label columns (based on your original loop)
for q in ['q1', 'q2', 'q3', 'y1', 'y2']:
    if q in ['q1', 'q2', 'q3']:
        months_back = 12
    else:
        months_back = 24 if q == 'y2' else 12

    # Define the predictor columns
    x_cols = ratio_chars + ['ret', 'prc', f'EPS_true_l1_{q}', f'EPS_ana_{q}'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']
    y_col = f'EPS_true_{q}'
    ann_date_col = f'ANNDATS_{q}'

    # Reorganize the whole dataset into predictor (X) and label (Y) pairs
    for t in df_tmp['YearMonth'].unique():
        df_data = df_tmp[df_tmp['YearMonth'] == t].copy()

        # Ensure there's no leakage of future information
        df_data = df_data[df_data[ann_date_col] <= t]  # Exclude future announcements

        # Drop NA and infinite values
        df_data = df_data.replace([np.inf, -np.inf], np.nan).dropna(subset=x_cols + [y_col])

        # Skip if the data is empty after cleaning
        if df_data.empty:
            continue

        # Create a DataFrame for the model
        df_model = df_data[x_cols].copy()
        df_model['label'] = df_data[y_col].values
        df_model['permno'] = df_data['permno'].values
        df_model['YearMonth'] = df_data['YearMonth'].values
        df_model['horizon'] = q  # Add the forecast horizon
        df_model['forecast_time'] = t  # Include the forecast time

        # Append to the list of model data
        model_data_all.append(df_model)

# Combine all model rows into one DataFrame
model_data_df = pd.concat(model_data_all, ignore_index=True)

model_data_df['prediction'] = np.nan  # placeholder

# Optional: Reorder columns for clarity
cols_first = ['permno', 'YearMonth', 'forecast_time', 'horizon', 'label', 'prediction']
other_cols = [col for col in model_data_df.columns if col not in cols_first]
model_data_df = model_data_df[cols_first + other_cols]

model_data_df.head()

Unnamed: 0,permno,YearMonth,forecast_time,horizon,label,prediction,CAPEI,bm,evm,pe_exi,...,INDPROD,UNEMP,EPS_true_l1_q2,EPS_ana_q2,EPS_true_l1_q3,EPS_ana_q3,EPS_true_l1_y1,EPS_ana_y1,EPS_true_l1_y2,EPS_ana_y2
0,10079.0,1989-03-31,1989-03-31,q1,-0.35,,21.085184,1.020966,-4.142299,-3.298611,...,0.00355,5.4,,,,,,,,
1,10094.0,1989-03-31,1989-03-31,q1,1.13,,8.323051,1.035691,5.501297,0.268518,...,0.00355,5.4,,,,,,,,
2,10104.0,1989-03-31,1989-03-31,q1,0.36,,72.320234,0.161515,10.875978,26.162766,...,0.00355,5.4,,,,,,,,
3,10171.0,1989-03-31,1989-03-31,q1,0.05,,16.07111,0.623432,6.882616,12.142323,...,0.00355,5.4,,,,,,,,
4,10179.0,1989-03-31,1989-03-31,q1,0.35,,11.771166,0.945508,9.975825,7.949438,...,0.00355,5.4,,,,,,,,


In [8]:
# Save or return the final model_data_df
model_data_df.to_csv('./data/Results/model_data_df.csv', index=False)

In [9]:
model_data_df.columns

Index(['permno', 'YearMonth', 'forecast_time', 'horizon', 'label',
       'prediction', 'CAPEI', 'bm', 'evm', 'pe_exi', 'pe_inc', 'ps', 'pcf',
       'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe',
       'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity',
       'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap',
       'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt',
       'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at',
       'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct',
       'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets',
       'debt_capital', 'de_ratio', 'intcov', 'intcov_ratio', 'cash_ratio',
       'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn',
       'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc',
       'rd_sale', 'adv_sale', 'staff_sale', 'accrual', 'ptb', 'PEG_trailing',
       'divyield', 'ret', 'prc', 'EPS_true_l1_q

In [23]:
def generate_gpt_prompt_from_index(index, df):
    # Mapping of variable names to natural language descriptions
    variable_map = {
        'CAPEI': 'Shiller P/E ratio',
        'bm': 'Book-to-Market Ratio',
        'evm': 'Enterprise Value Multiple',
        'pe_exi': 'Price-to-Earnings (excl. extraordinary items)',
        'pe_inc': 'Price-to-Earnings (incl. extraordinary items)',
        'ps': 'Price-to-Sales Ratio',
        'pcf': 'Price-to-Cash Flow Ratio',
        'dpr': 'Dividend Payout Ratio',
        'npm': 'Net Profit Margin',
        'opmbd': 'Operating Margin Before Depreciation',
        'opmad': 'Operating Margin After Depreciation',
        'gpm': 'Gross Profit Margin',
        'ptpm': 'Pretax Profit Margin',
        'cfm': 'Cash Flow Margin',
        'roa': 'Return on Assets',
        'roe': 'Return on Equity',
        'roce': 'Return on Capital Employed',
        'efftax': 'Effective Tax Rate',
        'aftret_eq': 'After-Tax Return on Equity',
        'aftret_invcapx': 'After-Tax Return on Invested Capital',
        'aftret_equity': 'After-Tax Return on Equity (again)',
        'pretret_noa': 'Pretax Return on Net Operating Assets',
        'pretret_earnat': 'Pretax Return on Earnings Assets',
        'GProf': 'Gross Profitability',
        'equity_invcap': 'Equity to Invested Capital',
        'debt_invcap': 'Debt to Invested Capital',
        'totdebt_invcap': 'Total Debt to Invested Capital',
        'capital_ratio': 'Capital Ratio',
        'int_debt': 'Interest-Bearing Debt',
        'int_totdebt': 'Interest-Bearing to Total Debt',
        'cash_lt': 'Cash to Long-Term Assets',
        'invt_act': 'Inventory to Current Assets',
        'rect_act': 'Receivables to Current Assets',
        'debt_at': 'Debt to Total Assets',
        'debt_ebitda': 'Debt to EBITDA',
        'short_debt': 'Short-Term Debt',
        'curr_debt': 'Current Debt',
        'lt_debt': 'Long-Term Debt',
        'profit_lct': 'Profit to Liabilities',
        'ocf_lct': 'Operating Cash Flow to Liabilities',
        'cash_debt': 'Cash to Debt',
        'fcf_ocf': 'Free Cash Flow to Operating Cash Flow',
        'lt_ppent': 'Long-Term Assets to PP&E',
        'dltt_be': 'Long-Term Debt to Book Equity',
        'debt_assets': 'Debt to Total Assets',
        'debt_capital': 'Debt to Capital',
        'de_ratio': 'Debt-to-Equity Ratio',
        'intcov': 'Interest Coverage',
        'intcov_ratio': 'Interest Coverage Ratio',
        'cash_ratio': 'Cash Ratio',
        'quick_ratio': 'Quick Ratio',
        'curr_ratio': 'Current Ratio',
        'cash_conversion': 'Cash Conversion Cycle',
        'inv_turn': 'Inventory Turnover',
        'at_turn': 'Asset Turnover',
        'rect_turn': 'Receivables Turnover',
        'pay_turn': 'Payables Turnover',
        'sale_invcap': 'Sales to Invested Capital',
        'sale_equity': 'Sales to Equity',
        'sale_nwc': 'Sales to Net Working Capital',
        'rd_sale': 'R&D to Sales',
        'adv_sale': 'Advertising to Sales',
        'staff_sale': 'Staff Expense to Sales',
        'accrual': 'Accruals',
        'ptb': 'Price-to-Book Ratio',
        'PEG_trailing': 'PEG Ratio (Trailing)',
        'divyield': 'Dividend Yield',
        'ret': 'Stock Return',
        'prc': 'Price',
        'EPS_true_l1_q1': 'Lagged True EPS for q1',
        'EPS_ana_q1': 'Analyst Forecast EPS for q1',
        'RGDP': 'Real GDP',
        'RCON': 'Consumer Spending',
        'INDPROD': 'Industrial Production',
        'UNEMP': 'Unemployment Rate',
        'EPS_true_l1_q2': 'Lagged True EPS for q2',
        'EPS_ana_q2': 'Analyst Forecast EPS for q2',
        'EPS_true_l1_q3': 'Lagged True EPS for q3',
        'EPS_ana_q3': 'Analyst Forecast EPS for q3',
        'EPS_true_l1_y1': 'Lagged True EPS for y1',
        'EPS_ana_y1': 'Analyst Forecast EPS for y1',
        'EPS_true_l1_y2': 'Lagged True EPS for y2',
        'EPS_ana_y2': 'Analyst Forecast EPS for y2',
    }

    row = df.loc[index]
    # Format values with their descriptions
    formatted_pairs = [f"{variable_map[col]}: {row[col]}" for col in df.columns if col in variable_map]
    # Combine all into a string with spacing, not newlines
    joined_description = "  ".join(formatted_pairs)

    prompt = (
        "I'm analyzing a firm and would like you to predict its upcoming earnings per share. "
        "Here are the financial and market characteristics for the firm: "
        f"{joined_description} "
        "Can you forecast the firm’s future earnings for quarters q1, q2, q3, y1, and y2 based on these characteristics? "
        "Please do not search the web."
    )
    return prompt


In [24]:
generate_gpt_prompt_from_index(0, model_data_df)

"I'm analyzing a firm and would like you to predict its upcoming earnings per share. Here are the financial and market characteristics for the firm: Shiller P/E ratio: 21.085184337073507  Book-to-Market Ratio: 1.0209656925031767  Enterprise Value Multiple: -4.142299448867115  Price-to-Earnings (excl. extraordinary items): -3.298611111111111  Price-to-Earnings (incl. extraordinary items): -3.298611111111111  Price-to-Sales Ratio: 0.8162163599617063  Price-to-Cash Flow Ratio: -3.327515177797053  Dividend Payout Ratio: 0.0  Net Profit Margin: -0.2407190724391022  Operating Margin Before Depreciation: -0.34740985001595576  Operating Margin After Depreciation: -0.4003829379853207  Gross Profit Margin: 0.2285927029039464  Pretax Profit Margin: -0.4064461227528986  Cash Flow Margin: -0.17998085310073397  Return on Assets: -0.21659460754194718  Return on Equity: -0.23476321385964002  Return on Capital Employed: -0.3485687012194894  Effective Tax Rate: 0.3603448275862069  After-Tax Return on Eq

### Y1 only

In [25]:
def generate_gpt_prompt_from_index_y1(index, df):
    # Mapping of variable names to natural language descriptions
    variable_map = {
        'CAPEI': 'Shiller P/E ratio',
        'bm': 'Book-to-Market Ratio',
        'evm': 'Enterprise Value Multiple',
        'pe_exi': 'Price-to-Earnings (excl. extraordinary items)',
        'pe_inc': 'Price-to-Earnings (incl. extraordinary items)',
        'ps': 'Price-to-Sales Ratio',
        'pcf': 'Price-to-Cash Flow Ratio',
        'dpr': 'Dividend Payout Ratio',
        'npm': 'Net Profit Margin',
        'opmbd': 'Operating Margin Before Depreciation',
        'opmad': 'Operating Margin After Depreciation',
        'gpm': 'Gross Profit Margin',
        'ptpm': 'Pretax Profit Margin',
        'cfm': 'Cash Flow Margin',
        'roa': 'Return on Assets',
        'roe': 'Return on Equity',
        'roce': 'Return on Capital Employed',
        'efftax': 'Effective Tax Rate',
        'aftret_eq': 'After-Tax Return on Equity',
        'aftret_invcapx': 'After-Tax Return on Invested Capital',
        'aftret_equity': 'After-Tax Return on Equity (again)',
        'pretret_noa': 'Pretax Return on Net Operating Assets',
        'pretret_earnat': 'Pretax Return on Earnings Assets',
        'GProf': 'Gross Profitability',
        'equity_invcap': 'Equity to Invested Capital',
        'debt_invcap': 'Debt to Invested Capital',
        'totdebt_invcap': 'Total Debt to Invested Capital',
        'capital_ratio': 'Capital Ratio',
        'int_debt': 'Interest-Bearing Debt',
        'int_totdebt': 'Interest-Bearing to Total Debt',
        'cash_lt': 'Cash to Long-Term Assets',
        'invt_act': 'Inventory to Current Assets',
        'rect_act': 'Receivables to Current Assets',
        'debt_at': 'Debt to Total Assets',
        'debt_ebitda': 'Debt to EBITDA',
        'short_debt': 'Short-Term Debt',
        'curr_debt': 'Current Debt',
        'lt_debt': 'Long-Term Debt',
        'profit_lct': 'Profit to Liabilities',
        'ocf_lct': 'Operating Cash Flow to Liabilities',
        'cash_debt': 'Cash to Debt',
        'fcf_ocf': 'Free Cash Flow to Operating Cash Flow',
        'lt_ppent': 'Long-Term Assets to PP&E',
        'dltt_be': 'Long-Term Debt to Book Equity',
        'debt_assets': 'Debt to Total Assets',
        'debt_capital': 'Debt to Capital',
        'de_ratio': 'Debt-to-Equity Ratio',
        'intcov': 'Interest Coverage',
        'intcov_ratio': 'Interest Coverage Ratio',
        'cash_ratio': 'Cash Ratio',
        'quick_ratio': 'Quick Ratio',
        'curr_ratio': 'Current Ratio',
        'cash_conversion': 'Cash Conversion Cycle',
        'inv_turn': 'Inventory Turnover',
        'at_turn': 'Asset Turnover',
        'rect_turn': 'Receivables Turnover',
        'pay_turn': 'Payables Turnover',
        'sale_invcap': 'Sales to Invested Capital',
        'sale_equity': 'Sales to Equity',
        'sale_nwc': 'Sales to Net Working Capital',
        'rd_sale': 'R&D to Sales',
        'adv_sale': 'Advertising to Sales',
        'staff_sale': 'Staff Expense to Sales',
        'accrual': 'Accruals',
        'ptb': 'Price-to-Book Ratio',
        'PEG_trailing': 'PEG Ratio (Trailing)',
        'divyield': 'Dividend Yield',
        'ret': 'Stock Return',
        'prc': 'Price',
        'EPS_true_l1_q1': 'Lagged True EPS for q1',
        'EPS_ana_q1': 'Analyst Forecast EPS for q1',
        'RGDP': 'Real GDP',
        'RCON': 'Consumer Spending',
        'INDPROD': 'Industrial Production',
        'UNEMP': 'Unemployment Rate',
        'EPS_true_l1_q2': 'Lagged True EPS for q2',
        'EPS_ana_q2': 'Analyst Forecast EPS for q2',
        'EPS_true_l1_q3': 'Lagged True EPS for q3',
        'EPS_ana_q3': 'Analyst Forecast EPS for q3',
        'EPS_true_l1_y1': 'Lagged True EPS for y1',
        'EPS_ana_y1': 'Analyst Forecast EPS for y1',
        'EPS_true_l1_y2': 'Lagged True EPS for y2',
        'EPS_ana_y2': 'Analyst Forecast EPS for y2',
    }

    row = df.loc[index]

    # Filter only non-NA columns in the variable map
    formatted_pairs = [
        f"{variable_map[col]}: {row[col]}"
        for col in df.columns
        if col in variable_map and pd.notnull(row[col])
    ]
    joined_description = "  ".join(formatted_pairs)

    prompt = (
        "I'm analyzing a firm and would like you to predict its upcoming earnings per share. "
        "Here are the financial and market characteristics for the firm: "
        f"{joined_description} "
        "Can you forecast the firm’s EPS next year based on these characteristics? "
        "Please do not serach the web."
    )
    return prompt

In [26]:
df_y1 = model_data_df[model_data_df['horizon'] == 'y1'].copy()
df_y1 = df_y1.reset_index(drop=True)
df_y1.head

<bound method NDFrame.head of         permno  YearMonth forecast_time horizon   label  prediction  \
0      10061.0 1989-03-31    1989-03-31      y1 -1.5600         NaN   
1      10094.0 1989-03-31    1989-03-31      y1  0.9000         NaN   
2      10154.0 1989-03-31    1989-03-31      y1  0.5000         NaN   
3      10171.0 1989-03-31    1989-03-31      y1 -0.1200         NaN   
4      10182.0 1989-03-31    1989-03-31      y1  1.3900         NaN   
...        ...        ...           ...     ...     ...         ...   
59183  89456.0 1986-08-31    1986-08-31      y1  0.7995         NaN   
59184  90107.0 1986-08-31    1986-08-31      y1  0.7000         NaN   
59185  90799.0 1986-08-31    1986-08-31      y1  0.3600         NaN   
59186  91986.0 1986-08-31    1986-08-31      y1  0.5300         NaN   
59187  92217.0 1986-08-31    1986-08-31      y1  0.3000         NaN   

           CAPEI        bm        evm     pe_exi  ...   INDPROD  UNEMP  \
0      11.771166  0.945508   9.975825   7.9

In [27]:
generate_gpt_prompt_from_index_y1(0, df_y1)

"I'm analyzing a firm and would like you to predict its upcoming earnings per share. Here are the financial and market characteristics for the firm: Shiller P/E ratio: 11.771165985889342  Book-to-Market Ratio: 0.9455078663679781  Enterprise Value Multiple: 9.975825292919113  Price-to-Earnings (excl. extraordinary items): 7.94943820224719  Price-to-Earnings (incl. extraordinary items): 8.012820512820513  Price-to-Sales Ratio: 0.7074598743893928  Price-to-Cash Flow Ratio: 1.539838112909351  Dividend Payout Ratio: 0.290822303564008  Net Profit Margin: 0.08894823860576437  Operating Margin Before Depreciation: 0.22874678080690078  Operating Margin After Depreciation: 0.2283206553540082  Gross Profit Margin: 0.4470658907038776  Pretax Profit Margin: 0.1241031907877857  Cash Flow Margin: 0.0858921186713642  Return on Assets: 0.02427693307569247  Return on Equity: 0.15971459018871495  Return on Capital Employed: 0.09732588872764134  Effective Tax Rate: 0.25489394462874493  After-Tax Return on