# Import the libraries

In [1]:
import pandas as pd
from linearmodels import PanelOLS
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import warnings
# Silence every warning category for the remainder of the notebook
warnings.filterwarnings('ignore')

# Render figure text in a serif face, with Times New Roman as the serif font
matplotlib.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
})

# Prepare the data

In [2]:
# Run configuration: these values are spliced into the input file name,
# and ind_col is also used to build column names in later cells
ind_col = 'naicsh5'
nlp_model = 'mpnet'
doc2vec_vector_size = 768 # 768 for mpnet, 1536 for ada

# Read the pickled regression variables and map +/-inf to NaN in one chained expression
data = (
    pd.read_pickle(f'../data/all_reg_vars_{nlp_model}_{str(doc2vec_vector_size)}_{ind_col}.pkl')
    .replace([np.inf, -np.inf], np.nan)
)


### Create lead variables for the dependent variables (DVs)

In [3]:
# Sort data by GVKEY and year so that, within each firm, shift(-1) returns
# the firm's next available observation
reg_data = data.sort_values(by=['GVKEY', 'year'])
# Lead the dependent variables to that next observation
reg_data['numrec_mean_0_lead'] = reg_data.groupby('GVKEY')['numrec_mean_0'].shift(-1)
reg_data['meanrec_mean_descend_lead'] = reg_data.groupby('GVKEY')['meanrec_mean_descend'].shift(-1)
# Year of the next observation and its distance from the current year
reg_data['year_lead'] = reg_data.groupby('GVKEY')['year'].shift(-1)
reg_data['year_diff'] = reg_data['year_lead'] - reg_data['year']
# Keep rows whose next observation is at most two years ahead. The comparison
# is False for NaN, so each firm's last observation (no lead) is dropped too.
# .copy() makes the filtered frame independent, so the column assignments that
# follow operate on a real frame rather than on a slice of the sorted one.
reg_data = reg_data[reg_data['year_diff'] <= 2].copy()
# Column of ones used as the regression intercept
reg_data['constant'] = 1

In [4]:
# Limit the data to non-exemplar firms and industries with at least 10 firms.
# .copy() detaches the result from the unfiltered frame, so the columns added
# in later cells are written to an independent frame rather than to a slice.
reg_data = reg_data[
    (reg_data[f'exemplar_{ind_col}'] == 0) & (reg_data[f'no_firms_{ind_col}'] > 9)
].copy()

In [5]:
# Recompute each firm's next observation year on the filtered sample.
# NOTE(review): only 'year_lead' is refreshed here; 'year_diff' and the lead
# dependent variables still come from the pre-filter panel, and 'year_lead' is
# not referenced by any later cell in this notebook. Confirm whether this cell
# is still needed.
reg_data['year_lead'] = reg_data.groupby('GVKEY')['year'].shift(-1)


### Rename the variables to make them more readable

In [6]:
# Map raw column names to the human-readable labels used in the regression
# output. Keys containing ind_col follow the industry definition chosen above.
rename_dict = {
    # Dependent variables (leads)
    'numrec_mean_0_lead': 'Analyst Coverage',
    'meanrec_mean_descend_lead': 'Analyst Recoms',
    # Main independent variable
    f'exemplar_sim_{ind_col}': 'Exemplar Similarity',
    # Firm-level variables
    'sale_wins_1': 'Total Sales',
    'n_emp': 'Firm Size',
    'EPS_wins_1': 'EPS',
    'slack_avail_wins_1': 'Available Slack',
    'rd_f_wins_1': 'R&D Expenditure',
    'adv_f_wins_1': 'Advertising Expenditure',
    'dpt_f_wins_1': 'Depreciation Ratio',
    'intang_f_wins_1': 'Intangible Assets Ratio',
    'n_segments': 'No. Segments',
    f'market_share_{ind_col}': 'Market Share',
    'mergers_wins_1': 'Mergers (expenditure)',
    'leverage_wins_1': 'Financial Leverage',
    'is_spx500': 'S&P500 Dummy',
    f'sim_mean_{ind_col}': 'Firm Typicality',
    # Text-based measures
    'strong_weak_modal_ratio': 'Strong-Weak Modals Ratio',
    'positive_negative_ratio': 'Positive-Negative Words Ratio',
    'N_Litigious': 'Litigious Words Ratio',  # label spelling corrected (was 'Litigous')
    # Industry / category-level variables
    f'n_analysts_{ind_col}': 'No. Analysts in Industry',
    f'no_firms_{ind_col}': 'No. Firms in Industry',
    f'category_coherence_{ind_col}': 'Category Coherence',
    f'numrec_avg_year_{ind_col}': 'Average Coverage (Year-Ind)',
    f'meanrec_avg_year_{ind_col}': 'Average Recoms (Year-Ind)',
    f'ind_sim_all_{ind_col}_distinct': 'Category Distinctiveness',
    f'ind_vecs_change_{ind_col}': 'Category Instability',
    f'hhi_{ind_col}': 'Industry HHI',
    # Exemplar-level variables
    f'EPS_wins_1_ex_{ind_col}': 'Exemplar EPS',
    f'exemplar_{ind_col}_typicality': 'Exemplar Typicality',
    # Intercept column
    'constant': 'Constant'
}

# rename columns (names absent from the frame are ignored by DataFrame.rename)
reg_data = reg_data.rename(columns=rename_dict)


### Define the list of independent/control variables


In [7]:
# Main independent variable
iv = ['Exemplar Similarity']

# Moderators: each is interacted with the independent variable in a later cell
moderators = [
    'Category Coherence',
    'Category Distinctiveness',
    'Exemplar Typicality',
]

# Control variables, in the order they enter every regression
controls = [
    # firm-level
    'Total Sales',
    'Firm Size',
    'Market Share',
    'EPS',
    'Available Slack',
    'R&D Expenditure',
    'Advertising Expenditure',
    'Intangible Assets Ratio',
    'Depreciation Ratio',
    'Firm Typicality',
    'No. Segments',
    'Mergers (expenditure)',
    'Financial Leverage',
    'S&P500 Dummy',
    # industry-level
    'No. Analysts in Industry',
    'No. Firms in Industry',
    'Average Coverage (Year-Ind)',
    'Average Recoms (Year-Ind)',
    'Category Instability',
    'Industry HHI',
    'Exemplar EPS',
    # intercept
    'Constant',
]

# Dependent variables
dv = [
    'Analyst Coverage',
    'Analyst Recoms',
]

In [8]:
# Set index to GVKEY and year, giving the frame an (entity, time) MultiIndex
# for the panel regressions below.
# NOTE(review): this cell is not re-runnable on its own; once 'GVKEY' and
# 'year' have moved into the index, a second set_index on them fails.
reg_data = reg_data.set_index(['GVKEY', 'year'])

# Regression Models

## Effect of Exemplar similarity on Analyst Coverage

In [9]:
# Baseline specification for Analyst Coverage: independent variable,
# moderators and controls, without any interaction term
regressors = [*iv, *moderators, *controls]
# Entity fixed effects, standard errors clustered by entity
model1 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res1 = model1.fit(cov_type='clustered', cluster_entity=True)
res1


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1168
Estimator:,PanelOLS,R-squared (Between):,0.2070
No. Observations:,53369,R-squared (Within):,0.1168
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1765
Time:,15:37:13,Log-likelihood,-1.207e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,238.08
Entities:,8326,P-value,0.0000
Avg Obs:,6.4099,Distribution:,"F(25,45018)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,2.4428,0.5330,4.5834,0.0000,1.3982,3.4874
Category Coherence,19.430,2.1996,8.8333,0.0000,15.119,23.741
Category Distinctiveness,-0.5177,1.7057,-0.3035,0.7615,-3.8609,2.8254
Exemplar Typicality,-1.0441,0.5976,-1.7473,0.0806,-2.2154,0.1271
Total Sales,0.0002,5.208e-05,3.0553,0.0022,5.705e-05,0.0003
Firm Size,0.0012,0.0101,0.1150,0.9085,-0.0187,0.0210
Market Share,1.0619,2.4254,0.4378,0.6615,-3.6918,5.8157
EPS,0.0817,0.0144,5.6763,0.0000,0.0535,0.1099
Available Slack,0.0115,0.0084,1.3632,0.1728,-0.0050,0.0280


## Effect of Exemplar Similarity on Analyst Recommendations

In [10]:
# Baseline specification for Analyst Recoms: same regressors as the
# Analyst Coverage baseline, without any interaction term
regressors = [*iv, *moderators, *controls]
# Entity fixed effects, standard errors clustered by entity
model2 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res2 = model2.fit(cov_type='clustered', cluster_entity=True)
res2

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0490
Estimator:,PanelOLS,R-squared (Between):,-0.0008
No. Observations:,34911,R-squared (Within):,0.0490
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0276
Time:,15:37:14,Log-likelihood,-1.501e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,58.445
Entities:,6558,P-value,0.0000
Avg Obs:,5.3234,Distribution:,"F(25,28328)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-0.2507,0.1048,-2.3931,0.0167,-0.4561,-0.0454
Category Coherence,-1.9349,0.4046,-4.7821,0.0000,-2.7279,-1.1418
Category Distinctiveness,-0.2818,0.2645,-1.0655,0.2866,-0.8002,0.2366
Exemplar Typicality,-0.0546,0.1229,-0.4441,0.6570,-0.2956,0.1864
Total Sales,-7.375e-06,2.78e-06,-2.6527,0.0080,-1.282e-05,-1.926e-06
Firm Size,-0.0021,0.0013,-1.6483,0.0993,-0.0047,0.0004
Market Share,-0.5986,0.2391,-2.5034,0.0123,-1.0673,-0.1299
EPS,0.0147,0.0020,7.2630,0.0000,0.0107,0.0187
Available Slack,-0.0034,0.0018,-1.9165,0.0553,-0.0069,7.74e-05


## Moderation Effect of Category Coherence

In [11]:
# Regressions with interaction terms - analyst coverage and category coherence
interaction_term = 'Exemplar Similarity x Category Coherence'
# Element-wise product of the independent variable and the moderator
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Coherence'])
# The interaction enters right after the moderators, ahead of the controls
regressors = [*iv, *moderators, interaction_term, *controls]

model3 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res3 = model3.fit(cov_type='clustered', cluster_entity=True)
res3


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1204
Estimator:,PanelOLS,R-squared (Between):,0.1972
No. Observations:,53369,R-squared (Within):,0.1204
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1674
Time:,15:37:15,Log-likelihood,-1.206e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,237.04
Entities:,8326,P-value,0.0000
Avg Obs:,6.4099,Distribution:,"F(26,45017)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-56.227,7.3244,-7.6766,0.0000,-70.583,-41.871
Category Coherence,-37.144,7.3637,-5.0442,0.0000,-51.577,-22.711
Category Distinctiveness,-0.8450,1.6894,-0.5002,0.6169,-4.1564,2.4663
Exemplar Typicality,-1.2049,0.5879,-2.0495,0.0404,-2.3572,-0.0526
Exemplar Similarity x Category Coherence,67.740,8.4978,7.9714,0.0000,51.084,84.395
Total Sales,0.0002,5.163e-05,3.0183,0.0025,5.464e-05,0.0003
Firm Size,0.0015,0.0101,0.1452,0.8846,-0.0184,0.0214
Market Share,0.8937,2.3987,0.3726,0.7095,-3.8078,5.5951
EPS,0.0826,0.0143,5.7778,0.0000,0.0546,0.1107


In [12]:
# Regressions with interaction terms - analyst recommendations and category coherence
interaction_term = 'Exemplar Similarity x Category Coherence'
# Element-wise product of the independent variable and the moderator
# (the same column the preceding cell creates, assigned again here)
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Coherence'])
regressors = [*iv, *moderators, interaction_term, *controls]

model4 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res4 = model4.fit(cov_type='clustered', cluster_entity=True)
res4

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0493
Estimator:,PanelOLS,R-squared (Between):,0.0033
No. Observations:,34911,R-squared (Within):,0.0493
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0321
Time:,15:37:15,Log-likelihood,-1.5e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,56.499
Entities:,6558,P-value,0.0000
Avg Obs:,5.3234,Distribution:,"F(26,28327)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,2.5905,1.4193,1.8251,0.0680,-0.1915,5.3725
Category Coherence,0.8570,1.4106,0.6075,0.5435,-1.9079,3.6219
Category Distinctiveness,-0.2783,0.2646,-1.0517,0.2930,-0.7969,0.2404
Exemplar Typicality,-0.0484,0.1227,-0.3948,0.6930,-0.2888,0.1920
Exemplar Similarity x Category Coherence,-3.2610,1.6173,-2.0163,0.0438,-6.4309,-0.0910
Total Sales,-7.295e-06,2.77e-06,-2.6331,0.0085,-1.272e-05,-1.865e-06
Firm Size,-0.0021,0.0013,-1.6467,0.0996,-0.0047,0.0004
Market Share,-0.5840,0.2381,-2.4522,0.0142,-1.0507,-0.1172
EPS,0.0147,0.0020,7.2623,0.0000,0.0107,0.0186


## Moderation Effect of Category Distinctiveness

In [13]:
# Regressions with interaction terms - analyst coverage and category distinctiveness
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
# Element-wise product of the independent variable and the moderator
reg_data[interaction_term] = reg_data['Exemplar Similarity'] * reg_data['Category Distinctiveness']
# The interaction enters right after the moderators, ahead of the controls
regressors = iv + moderators + [interaction_term] + controls

# Entity fixed effects, standard errors clustered by entity
model5 = PanelOLS(reg_data['Analyst Coverage'], reg_data[regressors], entity_effects=True)
res5 = model5.fit(cov_type='clustered', cluster_entity=True)
res5


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1190
Estimator:,PanelOLS,R-squared (Between):,0.2113
No. Observations:,53369,R-squared (Within):,0.1190
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1804
Time:,15:37:16,Log-likelihood,-1.206e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,233.86
Entities:,8326,P-value,0.0000
Avg Obs:,6.4099,Distribution:,"F(26,45017)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,36.610,4.8542,7.5419,0.0000,27.096,46.124
Category Coherence,18.981,2.1913,8.6620,0.0000,14.686,23.276
Category Distinctiveness,-35.993,5.1126,-7.0399,0.0000,-46.013,-25.972
Exemplar Typicality,-0.8322,0.5989,-1.3896,0.1647,-2.0061,0.3416
Exemplar Similarity x Category Distinctiveness,41.112,5.8057,7.0814,0.0000,29.733,52.492
Total Sales,0.0002,5.229e-05,3.0035,0.0027,5.457e-05,0.0003
Firm Size,0.0014,0.0102,0.1337,0.8937,-0.0185,0.0213
Market Share,1.0899,2.4240,0.4496,0.6530,-3.6612,5.8409
EPS,0.0823,0.0143,5.7411,0.0000,0.0542,0.1105


In [14]:
# Regressions with interaction terms - analyst recommendations and category distinctiveness
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
# Element-wise product of the independent variable and the moderator
# (the same column the preceding cell creates, assigned again here)
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Distinctiveness'])
regressors = [*iv, *moderators, interaction_term, *controls]

model6 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res6 = model6.fit(cov_type='clustered', cluster_entity=True)
res6

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0494
Estimator:,PanelOLS,R-squared (Between):,-0.0082
No. Observations:,34911,R-squared (Within):,0.0494
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0233
Time:,15:37:16,Log-likelihood,-1.5e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,56.611
Entities:,6558,P-value,0.0000
Avg Obs:,5.3234,Distribution:,"F(26,28327)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-2.6669,1.0757,-2.4792,0.0132,-4.7753,-0.5585
Category Coherence,-1.8995,0.4046,-4.6951,0.0000,-2.6925,-1.1065
Category Distinctiveness,2.2349,1.1174,2.0001,0.0455,0.0448,4.4251
Exemplar Typicality,-0.0680,0.1231,-0.5526,0.5806,-0.3094,0.1733
Exemplar Similarity x Category Distinctiveness,-2.8991,1.2884,-2.2502,0.0244,-5.4243,-0.3738
Total Sales,-7.243e-06,2.779e-06,-2.6067,0.0091,-1.269e-05,-1.797e-06
Firm Size,-0.0021,0.0013,-1.6535,0.0982,-0.0047,0.0004
Market Share,-0.5996,0.2384,-2.5152,0.0119,-1.0669,-0.1324
EPS,0.0146,0.0020,7.2400,0.0000,0.0107,0.0186


## Moderation Effect of Exemplar Typicality

In [15]:
# Regressions with interaction terms - analyst coverage and exemplar typicality
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
# Element-wise product of the independent variable and the moderator
reg_data[interaction_term] = reg_data['Exemplar Similarity'] * reg_data['Exemplar Typicality']
# The interaction enters right after the moderators, ahead of the controls
regressors = iv + moderators + [interaction_term] + controls

# Entity fixed effects, standard errors clustered by entity
model7 = PanelOLS(reg_data['Analyst Coverage'], reg_data[regressors], entity_effects=True)
res7 = model7.fit(cov_type='clustered', cluster_entity=True)
res7

0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1180
Estimator:,PanelOLS,R-squared (Between):,0.2051
No. Observations:,53369,R-squared (Within):,0.1180
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1747
Time:,15:37:16,Log-likelihood,-1.206e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,231.54
Entities:,8326,P-value,0.0000
Avg Obs:,6.4099,Distribution:,"F(26,45017)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-15.522,3.2398,-4.7910,0.0000,-21.872,-9.1719
Category Coherence,19.376,2.1803,8.8865,0.0000,15.102,23.649
Category Distinctiveness,-0.7341,1.7019,-0.4314,0.6662,-4.0698,2.6016
Exemplar Typicality,-17.917,3.0685,-5.8390,0.0000,-23.931,-11.903
Exemplar Similarity x Exemplar Typicality,20.996,3.8799,5.4116,0.0000,13.392,28.601
Total Sales,0.0002,5.197e-05,3.0292,0.0025,5.557e-05,0.0003
Firm Size,0.0012,0.0102,0.1161,0.9076,-0.0188,0.0212
Market Share,1.0609,2.4170,0.4389,0.6607,-3.6764,5.7982
EPS,0.0820,0.0144,5.7093,0.0000,0.0538,0.1101


In [16]:
# Regressions with interaction terms - analyst recommendations and exemplar typicality
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
# Element-wise product of the independent variable and the moderator
# (the same column the preceding cell creates, assigned again here)
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Exemplar Typicality'])
regressors = [*iv, *moderators, interaction_term, *controls]

model8 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res8 = model8.fit(cov_type='clustered', cluster_entity=True)
res8

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0493
Estimator:,PanelOLS,R-squared (Between):,0.0017
No. Observations:,34911,R-squared (Within):,0.0493
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0303
Time:,15:37:17,Log-likelihood,-1.5e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,56.506
Entities:,6558,P-value,0.0000
Avg Obs:,5.3234,Distribution:,"F(26,28327)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,1.2643,0.6732,1.8779,0.0604,-0.0553,2.5838
Category Coherence,-1.9206,0.4023,-4.7743,0.0000,-2.7091,-1.1321
Category Distinctiveness,-0.2671,0.2649,-1.0084,0.3133,-0.7862,0.2521
Exemplar Typicality,1.3840,0.6443,2.1479,0.0317,0.1210,2.6469
Exemplar Similarity x Exemplar Typicality,-1.7631,0.7736,-2.2790,0.0227,-3.2794,-0.2467
Total Sales,-7.321e-06,2.774e-06,-2.6394,0.0083,-1.276e-05,-1.884e-06
Firm Size,-0.0021,0.0013,-1.6408,0.1009,-0.0047,0.0004
Market Share,-0.5896,0.2391,-2.4655,0.0137,-1.0583,-0.1209
EPS,0.0147,0.0020,7.2868,0.0000,0.0107,0.0187


# Output regression results

In [17]:
def create_table(*results):
    """Collect fitted regression results into one summary table.

    Each positional argument becomes one row labelled ``Model<i>`` (1-based,
    in call order). For every coefficient the row holds the estimate formatted
    to three decimals and, in a ``'<name> p-val'`` column, the p-value
    formatted as ``'(0.000)``. The leading single quote is kept from the
    original output format (presumably so spreadsheet software treats the
    parenthesised value as text rather than a negative number).

    Parameters
    ----------
    *results
        Fitted result objects exposing ``params``, ``pvalues``, ``nobs``,
        ``rsquared`` and ``entity_info`` (as linearmodels panel results do).

    Returns
    -------
    pandas.DataFrame
        One row per model. Columns appear in order of first occurrence across
        the models; a model that lacks a term has NaN in that term's columns.
        An empty DataFrame is returned when no results are passed.
    """
    rows = []
    for i, result in enumerate(results, start=1):
        coefficients = result.params
        pvalues = result.pvalues

        # One single-row frame per model: estimate and p-value side by side
        row = {}
        for key in coefficients.keys():
            row[key] = ["{:.3f}".format(coefficients[key])]
            row[key + ' p-val'] = ["'({:.3f})".format(pvalues[key])]  # leading single quote is intentional
        row['Observations'] = [result.nobs]
        row['R-squared'] = ["{:.3f}".format(result.rsquared)]
        # Use the public accessor rather than the private `_entity_info` attribute
        row['Entities'] = [result.entity_info['total']]

        # Label the row with the model's position in the call
        rows.append(pd.DataFrame(row, index=['Model' + str(i)]))

    # pd.concat raises on an empty list, so handle the no-argument call explicitly
    if not rows:
        return pd.DataFrame()
    # Concatenate once instead of growing the table inside the loop
    return pd.concat(rows, axis=0)

In [18]:

# Now create the table with your results:
# odd-numbered models use Analyst Coverage as the dependent variable,
# even-numbered models use Analyst Recoms
table = create_table(res1, res2, res3, res4, res5, res6, res7, res8)


In [19]:
# Transpose so terms are rows and models are columns, then write to CSV.
# The file name follows ind_col (set in the data-preparation cell) instead of a
# hard-coded 'naicsh5', so a run on another industry definition does not
# overwrite these results; for ind_col = 'naicsh5' the path is unchanged.
table.T.to_csv(f'../data/tables/main_regression_results_{ind_col}.csv')