# Import the libraries

In [39]:
import pandas as pd
from linearmodels import PanelOLS
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import warnings
# Silence every warning category for the remainder of the session.
# NOTE(review): this blanket filter also hides any warning raised by later cells.
warnings.filterwarnings(action='ignore')

# Render figure text in a serif face, with Times New Roman as the serif font
matplotlib.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
})

# Prepare the data

In [40]:
# Settings that select which pre-built regression dataset is loaded
ind_col = 'naicsh6'
nlp_model = 'mpnet'
doc2vec_vector_size = 768 # 768 for mpnet, 1536 for ada

# NOTE(review): read_pickle executes pickled code; only load files from a trusted source
data_path = f'../data/all_reg_vars_{nlp_model}_{doc2vec_vector_size}_{ind_col}.pkl'
# Treat +/- infinity as missing values
data = pd.read_pickle(data_path).replace([np.inf, -np.inf], np.nan)


### Create lead variables for the DVs

In [41]:
# Sort data by GVKEY and year so that shift(-1) within a firm picks up the
# firm's next available observation
reg_data = data.sort_values(by=['GVKEY', 'year'])
# Lead dependent variables by one observation within each firm
reg_data['numrec_mean_0_lead'] = reg_data.groupby('GVKEY')['numrec_mean_0'].shift(-1)
reg_data['meanrec_mean_descend_lead'] = reg_data.groupby('GVKEY')['meanrec_mean_descend'].shift(-1)
# Year of the observation the lead values come from, and its gap to the current year
reg_data['year_lead'] = reg_data.groupby('GVKEY')['year'].shift(-1)
reg_data['year_diff'] = reg_data['year_lead'] - reg_data['year']
# Keep rows whose next observation is at most two years ahead. A firm's last
# observation has no lead (NaN gap), fails the comparison and is dropped.
# .copy() makes the subset an independent frame, so the column assignment
# below does not write into a slice of the unfiltered data.
reg_data = reg_data[reg_data['year_diff'] <= 2].copy()
reg_data['constant'] = 1

In [42]:
# Limit the data to non-exemplar firms in industries with more than 4 firms
# (i.e. at least 5, assuming integer firm counts).
# .copy() detaches the subset so later column assignments operate on an
# independent frame rather than on a slice.
reg_data = reg_data[(reg_data[f'exemplar_{ind_col}'] == 0) & (reg_data[f'no_firms_{ind_col}'] > 4)].copy()

In [43]:
# Refresh each firm's next-observation year now that rows have been filtered out
# (year_diff is left as it was computed before the filter)
next_obs_year = reg_data.groupby('GVKEY')['year'].shift(-1)
reg_data['year_lead'] = next_obs_year


### Rename the variables to make them more readable

In [44]:
# Map raw column names to the human-readable labels used in the regression
# tables. Keys built with ind_col depend on the industry classification chosen
# when the data was loaded.
rename_dict = {
    'numrec_mean_0_lead': 'Analyst Coverage',
    'meanrec_mean_descend_lead': 'Analyst Recoms',
    f'exemplar_sim_{ind_col}': 'Exemplar Similarity',
    'sale_wins_1': 'Total Sales',
    'n_emp': 'Firm Size',
    'EPS_wins_1': 'EPS',
    'slack_avail_wins_1': 'Available Slack',
    'rd_f_wins_1': 'R&D Expenditure',
    'adv_f_wins_1': 'Advertising Expenditure',
    'dpt_f_wins_1': 'Depreciation Ratio',
    'intang_f_wins_1': 'Intangible Assets Ratio',
    'n_segments': 'No. Segments',
    f'market_share_{ind_col}': 'Market Share',
    'mergers_wins_1': 'Mergers (expenditure)',
    'leverage_wins_1': 'Financial Leverage',
    'is_spx500': 'S&P500 Dummy',
    f'sim_mean_{ind_col}': 'Firm Typicality',
    'strong_weak_modal_ratio': 'Strong-Weak Modals Ratio',
    'positive_negative_ratio': 'Positive-Negative Words Ratio',
    # Label spelling corrected ("Litigious", previously "Litigous")
    'N_Litigious': 'Litigious Words Ratio',
    f'n_analysts_{ind_col}': 'No. Analysts in Industry',
    f'no_firms_{ind_col}': 'No. Firms in Industry',
    f'category_coherence_{ind_col}': 'Category Coherence',
    f'numrec_avg_year_{ind_col}': 'Average Coverage (Year-Ind)',
    f'meanrec_avg_year_{ind_col}': 'Average Recoms (Year-Ind)',
    f'ind_sim_all_{ind_col}_distinct': 'Category Distinctiveness',
    f'ind_vecs_change_{ind_col}': 'Category Instability',
    f'hhi_{ind_col}': 'Industry HHI',
    f'EPS_wins_1_ex_{ind_col}': 'Exemplar EPS',
    f'exemplar_{ind_col}_typicality': 'Exemplar Typicality',
    'constant': 'Constant'
}

# Apply the labels; columns without an entry in the mapping keep their names
reg_data = reg_data.rename(columns=rename_dict)


### Define the list of independent/control variables


In [45]:
# Independent variable of interest
iv = ['Exemplar Similarity']

# Variables that are later interacted with the independent variable
moderators = [
    'Category Coherence',
    'Category Distinctiveness',
    'Exemplar Typicality',
]

# Control variables, assembled in the order they appear in the result tables
firm_controls = [
    'Total Sales', 'Firm Size', 'Market Share', 'EPS',
    'Available Slack', 'R&D Expenditure', 'Advertising Expenditure',
    'Intangible Assets Ratio', 'Depreciation Ratio', 'Firm Typicality',
    'No. Segments', 'Mergers (expenditure)', 'Financial Leverage',
    'S&P500 Dummy',
]
context_controls = [
    'No. Analysts in Industry', 'No. Firms in Industry',
    'Average Coverage (Year-Ind)', 'Average Recoms (Year-Ind)',
    'Category Instability', 'Industry HHI', 'Exemplar EPS',
]
controls = firm_controls + context_controls + ['Constant']

# Dependent variables (one-observation leads created earlier)
dv = ['Analyst Coverage', 'Analyst Recoms']

In [46]:
# Move GVKEY and year into a two-level row index, in that order.
# NOTE(review): re-running this cell fails because both columns are consumed by the index.
panel_index = ['GVKEY', 'year']
reg_data = reg_data.set_index(panel_index)

# Regression Models

## Effect of Exemplar similarity on Analyst Coverage

In [47]:
# Baseline specification for Analyst Coverage: IV, moderators and controls,
# with entity effects and standard errors clustered by entity
regressors = [*iv, *moderators, *controls]
model1 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res1 = model1.fit(cov_type='clustered', cluster_entity=True)
res1


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1286
Estimator:,PanelOLS,R-squared (Between):,0.2322
No. Observations:,51216,R-squared (Within):,0.1286
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.2040
Time:,16:12:47,Log-likelihood,-1.164e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,253.03
Entities:,8324,P-value,0.0000
Avg Obs:,6.1528,Distribution:,"F(25,42867)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,2.4100,0.5333,4.5188,0.0000,1.3647,3.4554
Category Coherence,19.447,2.1695,8.9636,0.0000,15.195,23.699
Category Distinctiveness,-3.1592,1.8251,-1.7310,0.0835,-6.7365,0.4180
Exemplar Typicality,-1.4335,0.6665,-2.1508,0.0315,-2.7399,-0.1271
Total Sales,0.0002,4.681e-05,3.3225,0.0009,6.377e-05,0.0002
Firm Size,0.0061,0.0071,0.8555,0.3923,-0.0078,0.0199
Market Share,1.2175,1.4697,0.8284,0.4074,-1.6632,4.0983
EPS,0.0790,0.0150,5.2826,0.0000,0.0497,0.1083
Available Slack,0.0063,0.0086,0.7274,0.4670,-0.0107,0.0232


## Effect of Exemplar Similarity on Analyst Recommendations

In [48]:
# Baseline specification for Analyst Recoms: IV, moderators and controls,
# with entity effects and standard errors clustered by entity
regressors = [*iv, *moderators, *controls]
model2 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res2 = model2.fit(cov_type='clustered', cluster_entity=True)
res2

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0554
Estimator:,PanelOLS,R-squared (Between):,0.0351
No. Observations:,33992,R-squared (Within):,0.0554
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0393
Time:,16:12:47,Log-likelihood,-1.411e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,64.256
Entities:,6598,P-value,0.0000
Avg Obs:,5.1519,Distribution:,"F(25,27369)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-0.2154,0.1066,-2.0206,0.0433,-0.4243,-0.0065
Category Coherence,-1.1758,0.3757,-3.1298,0.0018,-1.9121,-0.4394
Category Distinctiveness,-0.2844,0.2474,-1.1496,0.2503,-0.7693,0.2005
Exemplar Typicality,-0.0854,0.1380,-0.6187,0.5361,-0.3560,0.1852
Total Sales,-5.985e-06,3.884e-06,-1.5409,0.1233,-1.36e-05,1.628e-06
Firm Size,-0.0013,0.0012,-1.1030,0.2701,-0.0037,0.0010
Market Share,-0.1544,0.1141,-1.3534,0.1759,-0.3781,0.0692
EPS,0.0147,0.0020,7.5004,0.0000,0.0109,0.0186
Available Slack,-0.0028,0.0018,-1.5175,0.1291,-0.0064,0.0008


## Moderation Effect of Category Coherence

In [49]:
# Analyst Coverage with the Exemplar Similarity x Category Coherence interaction
interaction_term = 'Exemplar Similarity x Category Coherence'
# Product of the two (uncentered) columns
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Coherence'])
regressors = [*iv, *moderators, interaction_term, *controls]

model3 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res3 = model3.fit(cov_type='clustered', cluster_entity=True)
res3


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1314
Estimator:,PanelOLS,R-squared (Between):,0.2270
No. Observations:,51216,R-squared (Within):,0.1314
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1985
Time:,16:12:48,Log-likelihood,-1.163e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,249.45
Entities:,8324,P-value,0.0000
Avg Obs:,6.1528,Distribution:,"F(26,42866)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-49.283,6.9747,-7.0660,0.0000,-62.954,-35.613
Category Coherence,-30.223,6.8702,-4.3992,0.0000,-43.689,-16.758
Category Distinctiveness,-3.2337,1.8090,-1.7875,0.0739,-6.7794,0.3120
Exemplar Typicality,-1.8007,0.6558,-2.7459,0.0060,-3.0860,-0.5154
Exemplar Similarity x Category Coherence,59.287,8.0544,7.3608,0.0000,43.500,75.074
Total Sales,0.0002,4.65e-05,3.2798,0.0010,6.137e-05,0.0002
Firm Size,0.0063,0.0071,0.8917,0.3726,-0.0076,0.0203
Market Share,1.1661,1.4709,0.7928,0.4279,-1.7169,4.0491
EPS,0.0797,0.0149,5.3518,0.0000,0.0505,0.1089


In [50]:
# Analyst Recoms with the Exemplar Similarity x Category Coherence interaction
interaction_term = 'Exemplar Similarity x Category Coherence'
# Product of the two (uncentered) columns
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Coherence'])
regressors = [*iv, *moderators, interaction_term, *controls]

model4 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res4 = model4.fit(cov_type='clustered', cluster_entity=True)
res4

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0557
Estimator:,PanelOLS,R-squared (Between):,0.0380
No. Observations:,33992,R-squared (Within):,0.0557
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0430
Time:,16:12:48,Log-likelihood,-1.411e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,62.037
Entities:,6598,P-value,0.0000
Avg Obs:,5.1519,Distribution:,"F(26,27368)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,2.3267,1.4273,1.6301,0.1031,-0.4709,5.1244
Category Coherence,1.2971,1.4101,0.9199,0.3577,-1.4667,4.0609
Category Distinctiveness,-0.2943,0.2482,-1.1858,0.2357,-0.7808,0.1922
Exemplar Typicality,-0.0717,0.1373,-0.5220,0.6016,-0.3408,0.1975
Exemplar Similarity x Category Coherence,-2.8945,1.6112,-1.7965,0.0724,-6.0526,0.2635
Total Sales,-5.872e-06,3.879e-06,-1.5137,0.1301,-1.348e-05,1.732e-06
Firm Size,-0.0013,0.0012,-1.1042,0.2695,-0.0037,0.0010
Market Share,-0.1539,0.1141,-1.3485,0.1775,-0.3777,0.0698
EPS,0.0147,0.0020,7.4978,0.0000,0.0109,0.0185


## Moderation Effect of Category Distinctiveness

In [51]:
# Regressions with interaction terms - analyst coverage and category distinctiveness
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
# Product of the two (uncentered) columns
reg_data[interaction_term] = reg_data['Exemplar Similarity'] * reg_data['Category Distinctiveness']
regressors = iv + moderators + [interaction_term] + controls

# Entity effects with standard errors clustered by entity
model5 = PanelOLS(reg_data['Analyst Coverage'], reg_data[regressors], entity_effects=True)
res5 = model5.fit(cov_type='clustered', cluster_entity=True)
res5


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1306
Estimator:,PanelOLS,R-squared (Between):,0.2359
No. Observations:,51216,R-squared (Within):,0.1306
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.2068
Time:,16:12:48,Log-likelihood,-1.163e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,247.76
Entities:,8324,P-value,0.0000
Avg Obs:,6.1528,Distribution:,"F(26,42866)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,34.376,5.0579,6.7964,0.0000,24.462,44.289
Category Coherence,19.240,2.1595,8.9098,0.0000,15.008,23.473
Category Distinctiveness,-36.713,5.5175,-6.6540,0.0000,-47.528,-25.899
Exemplar Typicality,-1.0551,0.6679,-1.5798,0.1142,-2.3642,0.2540
Exemplar Similarity x Category Distinctiveness,39.149,6.1477,6.3680,0.0000,27.099,51.198
Total Sales,0.0002,4.697e-05,3.2656,0.0011,6.132e-05,0.0002
Firm Size,0.0063,0.0071,0.8794,0.3792,-0.0077,0.0202
Market Share,1.2383,1.4771,0.8383,0.4019,-1.6569,4.1334
EPS,0.0796,0.0149,5.3369,0.0000,0.0503,0.1088


In [52]:
# Analyst Recoms with the Exemplar Similarity x Category Distinctiveness interaction
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
# Product of the two (uncentered) columns
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Distinctiveness'])
regressors = [*iv, *moderators, interaction_term, *controls]

model6 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res6 = model6.fit(cov_type='clustered', cluster_entity=True)
res6

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0557
Estimator:,PanelOLS,R-squared (Between):,0.0266
No. Observations:,33992,R-squared (Within):,0.0557
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0336
Time:,16:12:49,Log-likelihood,-1.411e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,62.113
Entities:,6598,P-value,0.0000
Avg Obs:,5.1519,Distribution:,"F(26,27368)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-2.2743,1.0070,-2.2584,0.0239,-4.2481,-0.3005
Category Coherence,-1.1669,0.3751,-3.1110,0.0019,-1.9022,-0.4317
Category Distinctiveness,1.8976,1.0760,1.7635,0.0778,-0.2115,4.0067
Exemplar Typicality,-0.1057,0.1390,-0.7608,0.4468,-0.3781,0.1666
Exemplar Similarity x Category Distinctiveness,-2.5182,1.2321,-2.0437,0.0410,-4.9332,-0.1031
Total Sales,-5.852e-06,3.872e-06,-1.5111,0.1308,-1.344e-05,1.738e-06
Firm Size,-0.0013,0.0012,-1.1087,0.2675,-0.0037,0.0010
Market Share,-0.1558,0.1136,-1.3713,0.1703,-0.3785,0.0669
EPS,0.0147,0.0020,7.4697,0.0000,0.0108,0.0185


## Moderation Effect of Exemplar Typicality

In [53]:
# Regressions with interaction terms - analyst coverage and exemplar typicality
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
# Product of the two (uncentered) columns
reg_data[interaction_term] = reg_data['Exemplar Similarity'] * reg_data['Exemplar Typicality']
regressors = iv + moderators + [interaction_term] + controls

# Entity effects with standard errors clustered by entity
model7 = PanelOLS(reg_data['Analyst Coverage'], reg_data[regressors], entity_effects=True)
res7 = model7.fit(cov_type='clustered', cluster_entity=True)
res7

0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1297
Estimator:,PanelOLS,R-squared (Between):,0.2314
No. Observations:,51216,R-squared (Within):,0.1297
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.2029
Time:,16:12:49,Log-likelihood,-1.164e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,245.77
Entities:,8324,P-value,0.0000
Avg Obs:,6.1528,Distribution:,"F(26,42866)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-15.743,3.3766,-4.6624,0.0000,-22.361,-9.1250
Category Coherence,19.406,2.1568,8.9977,0.0000,15.179,23.633
Category Distinctiveness,-3.3094,1.8156,-1.8228,0.0683,-6.8679,0.2491
Exemplar Typicality,-18.429,3.2120,-5.7376,0.0000,-24.725,-12.134
Exemplar Similarity x Exemplar Typicality,21.104,4.0127,5.2592,0.0000,13.239,28.968
Total Sales,0.0002,4.675e-05,3.2920,0.0010,6.227e-05,0.0002
Firm Size,0.0062,0.0071,0.8666,0.3862,-0.0078,0.0201
Market Share,1.2263,1.4721,0.8330,0.4048,-1.6590,4.1116
EPS,0.0793,0.0149,5.3148,0.0000,0.0501,0.1086


In [54]:
# Analyst Recoms with the Exemplar Similarity x Exemplar Typicality interaction
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
# Product of the two (uncentered) columns
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Exemplar Typicality'])
regressors = [*iv, *moderators, interaction_term, *controls]

model8 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res8 = model8.fit(cov_type='clustered', cluster_entity=True)
res8

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0560
Estimator:,PanelOLS,R-squared (Between):,0.0382
No. Observations:,33992,R-squared (Within):,0.0560
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0431
Time:,16:12:50,Log-likelihood,-1.41e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,62.498
Entities:,6598,P-value,0.0000
Avg Obs:,5.1519,Distribution:,"F(26,27368)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,2.1124,0.6853,3.0827,0.0021,0.7693,3.4556
Category Coherence,-1.1686,0.3747,-3.1186,0.0018,-1.9031,-0.4341
Category Distinctiveness,-0.2722,0.2489,-1.0935,0.2742,-0.7601,0.2157
Exemplar Typicality,2.1142,0.6527,3.2389,0.0012,0.8348,3.3936
Exemplar Similarity x Exemplar Typicality,-2.6896,0.7793,-3.4512,0.0006,-4.2171,-1.1621
Total Sales,-5.849e-06,3.897e-06,-1.5009,0.1334,-1.349e-05,1.789e-06
Firm Size,-0.0013,0.0012,-1.1011,0.2708,-0.0037,0.0010
Market Share,-0.1576,0.1143,-1.3788,0.1680,-0.3816,0.0664
EPS,0.0147,0.0020,7.5218,0.0000,0.0109,0.0186


# Output regression results

In [55]:
def create_table(*results):
    """Collect fitted regression results into a single summary table.

    Parameters
    ----------
    *results
        Fitted result objects. Each must expose ``params`` and ``pvalues``
        (label-indexed, supporting ``.keys()``), ``nobs``, ``rsquared`` and
        ``entity_info`` (indexable by ``'total'``).

    Returns
    -------
    pandas.DataFrame
        One row per result, labelled ``'Model1'``, ``'Model2'``, ... in call
        order. For every coefficient there is a column with the estimate
        formatted to three decimals and a ``'<name> p-val'`` column with the
        p-value formatted as ``'(0.123)``, followed by ``'Observations'``,
        ``'R-squared'`` and ``'Entities'``. Columns are ordered by first
        appearance across results; a coefficient absent from a model is NaN.
        An empty DataFrame is returned when no results are passed.
    """
    rows = []
    for i, result in enumerate(results, start=1):
        coefficients = result.params
        pvalues = result.pvalues

        # Single-row record for this model
        record = {}
        for key in coefficients.keys():
            record[key] = ["{:.3f}".format(coefficients[key])]
            # The leading single quote is part of the emitted text and is kept as-is
            record[key + ' p-val'] = ["'({:.3f})".format(pvalues[key])]
        record['Observations'] = [result.nobs]
        record['R-squared'] = ["{:.3f}".format(result.rsquared)]
        # Public accessor instead of the private ``_entity_info`` attribute
        record['Entities'] = [result.entity_info['total']]

        rows.append(pd.DataFrame(record, index=['Model' + str(i)]))

    if not rows:
        # pd.concat raises on an empty list; keep the "no results" case a plain empty table
        return pd.DataFrame()

    # Concatenate once rather than re-copying the growing table on every iteration
    return pd.concat(rows, axis=0)

In [56]:

# Assemble the summary table from the eight fitted models, in model order
fitted_models = (res1, res2, res3, res4, res5, res6, res7, res8)
table = create_table(*fitted_models)


In [57]:
# Write the table transposed: one column per model, one row per statistic
results_path = '../data/tables/main_regression_results_naicsh_4min.csv'
table.transpose().to_csv(results_path)