# Import the libraries

In [1]:
import pandas as pd
from linearmodels import PanelOLS
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Draw all figure text in a serif typeface, with Times New Roman as the serif face.
matplotlib.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
})

# Prepare the data

In [2]:
# Run settings; together these three values select which pickle file is loaded
# and which `*_{ind_col}` columns are used further down.
ind_col = 'naicsh3'
nlp_model = 'mpnet'
doc2vec_vector_size = 768 # 768 for mpnet, 1536 for ada

# Load the regression variables and turn +/-inf into NaN so they count as missing.
data = (
    pd.read_pickle(f'../data/all_reg_vars_{nlp_model}_{str(doc2vec_vector_size)}_{ind_col}.pkl')
    .replace([np.inf, -np.inf], np.nan)
)


### Create lead variables for the DVs

In [3]:
# Largest allowed gap (in years) between an observation and the next observation
# of the same GVKEY from which its lead values are taken.
MAX_LEAD_GAP_YEARS = 2

# Sort by GVKEY and year so that "next row within a GVKEY" is the next observed year
reg_data = data.sort_values(by=['GVKEY', 'year'])

# Lead the dependent variables by one observation within each GVKEY.
# The next observation is not necessarily the next calendar year, so the year of
# the lead is kept as well and used to measure the gap.
by_gvkey = reg_data.groupby('GVKEY')
reg_data['numrec_mean_0_lead'] = by_gvkey['numrec_mean_0'].shift(-1)
reg_data['meanrec_mean_descend_lead'] = by_gvkey['meanrec_mean_descend'].shift(-1)
reg_data['year_lead'] = by_gvkey['year'].shift(-1)
reg_data['year_diff'] = reg_data['year_lead'] - reg_data['year']

# Keep rows whose lead lies within the allowed gap. Rows without a following
# observation have a NaN gap and are dropped by this comparison as well.
# .copy() makes the filtered frame independent, so the column assignment below
# is not a chained assignment on a slice of the unfiltered frame.
reg_data = reg_data[reg_data['year_diff'] <= MAX_LEAD_GAP_YEARS].copy()

# Column of ones used as the regression constant
reg_data['constant'] = 1

In [4]:
# Restrict the sample to non-exemplar firms in industries with at least 10 firms
is_non_exemplar = reg_data[f'exemplar_{ind_col}'] == 0
has_enough_firms = reg_data[f'no_firms_{ind_col}'] > 9
reg_data = reg_data.loc[is_non_exemplar & has_enough_firms]

In [5]:
# Recompute, on the filtered sample, the year of each row's next observation within its GVKEY.
# NOTE(review): the *_lead dependent variables were built before rows were filtered out,
# so this refreshed year_lead may differ from the year those lead values come from.
# year_lead is not among the regressors used below - confirm this refresh is still wanted.
reg_data = reg_data.assign(year_lead=reg_data.groupby('GVKEY')['year'].shift(-1))


### Rename the variables to make them more readable

In [6]:
# Map raw column names to the human-readable labels used in the regression tables.
# Keys containing {ind_col} follow the industry definition chosen in the settings cell.
rename_dict = {
    'numrec_mean_0_lead': 'Analyst Coverage',
    'meanrec_mean_descend_lead': 'Analyst Recoms',
    f'exemplar_sim_{ind_col}': 'Exemplar Similarity',
    'sale_wins_1': 'Total Sales',
    'n_emp': 'Firm Size',
    'EPS_wins_1': 'EPS',
    'slack_avail_wins_1': 'Available Slack',
    'rd_f_wins_1': 'R&D Expenditure',
    'adv_f_wins_1': 'Advertising Expenditure',
    'dpt_f_wins_1': 'Depreciation Ratio',
    'intang_f_wins_1': 'Intangible Assets Ratio',
    'n_segments': 'No. Segments',
    f'market_share_{ind_col}': 'Market Share',
    'mergers_wins_1': 'Mergers (expenditure)',
    'leverage_wins_1': 'Financial Leverage',
    'is_spx500': 'S&P500 Dummy',
    f'sim_mean_{ind_col}': 'Firm Typicality',
    'strong_weak_modal_ratio': 'Strong-Weak Modals Ratio',
    'positive_negative_ratio': 'Positive-Negative Words Ratio',
    # Label spelling corrected ("Litigous" -> "Litigious"); this label is not
    # referenced by the variable lists or models in this notebook.
    'N_Litigious': 'Litigious Words Ratio',
    f'n_analysts_{ind_col}': 'No. Analysts in Industry',
    f'no_firms_{ind_col}': 'No. Firms in Industry',
    f'category_coherence_{ind_col}': 'Category Coherence',
    f'numrec_avg_year_{ind_col}': 'Average Coverage (Year-Ind)',
    f'meanrec_avg_year_{ind_col}': 'Average Recoms (Year-Ind)',
    f'ind_sim_all_{ind_col}_distinct': 'Category Distinctiveness',
    f'ind_vecs_change_{ind_col}': 'Category Instability',
    f'hhi_{ind_col}': 'Industry HHI',
    f'EPS_wins_1_ex_{ind_col}': 'Exemplar EPS',
    f'exemplar_{ind_col}_typicality': 'Exemplar Typicality',
    'constant': 'Constant'
}

# Apply the labels; keys that are not columns of reg_data are silently ignored by rename
reg_data = reg_data.rename(columns=rename_dict)


### Define the list of independent/control variables


In [7]:
# Focal independent variable
iv = ['Exemplar Similarity']

# Moderators; each one is interacted with the focal variable in a later model
moderators = [
    'Category Coherence',
    'Category Distinctiveness',
    'Exemplar Typicality',
]

# Controls entered in every model; 'Constant' is the renamed column of ones
controls = [
    'Total Sales',
    'Firm Size',
    'Market Share',
    'EPS',
    'Available Slack',
    'R&D Expenditure',
    'Advertising Expenditure',
    'Intangible Assets Ratio',
    'Depreciation Ratio',
    'Firm Typicality',
    'No. Segments',
    'Mergers (expenditure)',
    'Financial Leverage',
    'S&P500 Dummy',
    'No. Analysts in Industry',
    'No. Firms in Industry',
    'Average Coverage (Year-Ind)',
    'Average Recoms (Year-Ind)',
    'Category Instability',
    'Industry HHI',
    'Exemplar EPS',
    'Constant',
]

# Dependent variables (the one-observation leads built earlier)
dv = [
    'Analyst Coverage',
    'Analyst Recoms',
]

In [8]:
# Index the rows by (GVKEY, year): the entity and time levels used by the panel models below
reg_data = reg_data.set_index(keys=['GVKEY', 'year'])

# Regression Models

## Effect of Exemplar Similarity on Analyst Coverage

In [9]:
# Baseline model for analyst coverage: focal variable, moderators and controls,
# with entity (GVKEY) effects and standard errors clustered by entity.
regressors = [*iv, *moderators, *controls]
model1 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res1 = model1.fit(cov_type='clustered', cluster_entity=True)
res1


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1551
Estimator:,PanelOLS,R-squared (Between):,0.2393
No. Observations:,55370,R-squared (Within):,0.1551
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.2313
Time:,15:37:45,Log-likelihood,-1.299e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,344.66
Entities:,8401,P-value,0.0000
Avg Obs:,6.5909,Distribution:,"F(25,46944)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,1.9689,0.4809,4.0944,0.0000,1.0264,2.9115
Category Coherence,28.536,2.9356,9.7204,0.0000,22.782,34.289
Category Distinctiveness,10.948,1.5158,7.2225,0.0000,7.9769,13.919
Exemplar Typicality,-0.6912,0.4843,-1.4271,0.1536,-1.6405,0.2581
Total Sales,0.0002,4.76e-05,5.2366,0.0000,0.0002,0.0003
Firm Size,-0.0183,0.0092,-1.9847,0.0472,-0.0364,-0.0002
Market Share,-4.5567,10.326,-0.4413,0.6590,-24.795,15.681
EPS,0.1061,0.0151,7.0052,0.0000,0.0764,0.1358
Available Slack,0.0037,0.0099,0.3713,0.7104,-0.0157,0.0231


## Effect of Exemplar Similarity on Analyst Recommendations

In [10]:
# Baseline model for analyst recommendations: same regressors as the coverage model,
# with entity (GVKEY) effects and standard errors clustered by entity.
regressors = [*iv, *moderators, *controls]
model2 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res2 = model2.fit(cov_type='clustered', cluster_entity=True)
res2

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0451
Estimator:,PanelOLS,R-squared (Between):,-0.0431
No. Observations:,38117,R-squared (Within):,0.0451
Date:,"Tue, Jun 20 2023",R-squared (Overall):,-0.0078
Time:,15:37:46,Log-likelihood,-1.557e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,59.251
Entities:,6718,P-value,0.0000
Avg Obs:,5.6739,Distribution:,"F(25,31374)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-0.0140,0.0626,-0.2243,0.8225,-0.1368,0.1087
Category Coherence,-2.1799,0.4534,-4.8075,0.0000,-3.0687,-1.2911
Category Distinctiveness,-1.3937,0.2350,-5.9314,0.0000,-1.8543,-0.9332
Exemplar Typicality,0.1626,0.0691,2.3538,0.0186,0.0272,0.2980
Total Sales,-5.456e-06,2.93e-06,-1.8625,0.0625,-1.12e-05,2.858e-07
Firm Size,-0.0007,0.0006,-1.2167,0.2237,-0.0019,0.0004
Market Share,0.7023,0.6243,1.1250,0.2606,-0.5213,1.9260
EPS,0.0161,0.0019,8.4607,0.0000,0.0123,0.0198
Available Slack,-0.0029,0.0017,-1.6571,0.0975,-0.0063,0.0005


## Moderation Effect of Category Coherence

In [11]:
# Moderation by category coherence, dependent variable: analyst coverage.
# The interaction is the raw product of the two (uncentred) columns.
interaction_term = 'Exemplar Similarity x Category Coherence'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Coherence'])
regressors = [*iv, *moderators, interaction_term, *controls]

model3 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res3 = model3.fit(cov_type='clustered', cluster_entity=True)
res3


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1554
Estimator:,PanelOLS,R-squared (Between):,0.2377
No. Observations:,55370,R-squared (Within):,0.1554
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.2296
Time:,15:37:46,Log-likelihood,-1.299e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,332.14
Entities:,8401,P-value,0.0000
Avg Obs:,6.5909,Distribution:,"F(26,46943)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-22.036,10.262,-2.1473,0.0318,-42.150,-1.9222
Category Coherence,5.7636,9.6971,0.5944,0.5523,-13.243,24.770
Category Distinctiveness,10.915,1.5159,7.2002,0.0000,7.9438,13.886
Exemplar Typicality,-0.9623,0.4900,-1.9639,0.0495,-1.9226,-0.0019
Exemplar Similarity x Category Coherence,27.619,11.817,2.3372,0.0194,4.4577,50.780
Total Sales,0.0002,4.765e-05,5.2431,0.0000,0.0002,0.0003
Firm Size,-0.0183,0.0092,-1.9802,0.0477,-0.0364,-0.0002
Market Share,-4.9870,10.302,-0.4841,0.6283,-25.180,15.206
EPS,0.1064,0.0151,7.0248,0.0000,0.0767,0.1361


In [12]:
# Moderation by category coherence, dependent variable: analyst recommendations.
# The interaction column is rebuilt here so the cell does not rely on the previous one.
interaction_term = 'Exemplar Similarity x Category Coherence'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Coherence'])
regressors = [*iv, *moderators, interaction_term, *controls]

model4 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res4 = model4.fit(cov_type='clustered', cluster_entity=True)
res4

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0451
Estimator:,PanelOLS,R-squared (Between):,-0.0456
No. Observations:,38117,R-squared (Within):,0.0451
Date:,"Tue, Jun 20 2023",R-squared (Overall):,-0.0101
Time:,15:37:47,Log-likelihood,-1.557e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,57.031
Entities:,6718,P-value,0.0000
Avg Obs:,5.6739,Distribution:,"F(26,31373)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-1.4092,1.6352,-0.8618,0.3888,-4.6143,1.7959
Category Coherence,-3.5208,1.5677,-2.2459,0.0247,-6.5936,-0.4481
Category Distinctiveness,-1.3944,0.2349,-5.9361,0.0000,-1.8548,-0.9340
Exemplar Typicality,0.1445,0.0732,1.9745,0.0483,0.0011,0.2879
Exemplar Similarity x Category Coherence,1.6051,1.8866,0.8508,0.3949,-2.0927,5.3030
Total Sales,-5.411e-06,2.922e-06,-1.8518,0.0641,-1.114e-05,3.162e-07
Firm Size,-0.0007,0.0006,-1.2100,0.2263,-0.0019,0.0004
Market Share,0.6741,0.6315,1.0676,0.2857,-0.5635,1.9118
EPS,0.0161,0.0019,8.4638,0.0000,0.0123,0.0198


## Moderation Effect of Category Distinctiveness

In [13]:
# Moderation by category distinctiveness, dependent variable: analyst coverage.
# The interaction is the raw product of the two (uncentred) columns.
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Distinctiveness'])
regressors = [*iv, *moderators, interaction_term, *controls]

model5 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res5 = model5.fit(cov_type='clustered', cluster_entity=True)
res5


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1563
Estimator:,PanelOLS,R-squared (Between):,0.2407
No. Observations:,55370,R-squared (Within):,0.1563
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.2334
Time:,15:37:47,Log-likelihood,-1.299e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,334.55
Entities:,8401,P-value,0.0000
Avg Obs:,6.5909,Distribution:,"F(26,46943)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,25.990,4.9826,5.2161,0.0000,16.224,35.756
Category Coherence,26.582,2.9680,8.9563,0.0000,20.765,32.400
Category Distinctiveness,-12.518,4.7226,-2.6506,0.0080,-21.774,-3.2612
Exemplar Typicality,-0.2856,0.4970,-0.5747,0.5655,-1.2598,0.6885
Exemplar Similarity x Category Distinctiveness,28.023,5.8560,4.7854,0.0000,16.546,39.501
Total Sales,0.0003,4.75e-05,5.2650,0.0000,0.0002,0.0003
Firm Size,-0.0181,0.0092,-1.9629,0.0497,-0.0361,-2.65e-05
Market Share,-4.5659,10.283,-0.4440,0.6570,-24.720,15.589
EPS,0.1070,0.0151,7.0741,0.0000,0.0773,0.1366


In [14]:
# Moderation by category distinctiveness, dependent variable: analyst recommendations.
# The interaction column is rebuilt here so the cell does not rely on the previous one.
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Distinctiveness'])
regressors = [*iv, *moderators, interaction_term, *controls]

model6 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res6 = model6.fit(cov_type='clustered', cluster_entity=True)
res6

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0451
Estimator:,PanelOLS,R-squared (Between):,-0.0454
No. Observations:,38117,R-squared (Within):,0.0451
Date:,"Tue, Jun 20 2023",R-squared (Overall):,-0.0092
Time:,15:37:47,Log-likelihood,-1.557e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,56.990
Entities:,6718,P-value,0.0000
Avg Obs:,5.6739,Distribution:,"F(26,31373)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-0.3821,0.7071,-0.5404,0.5889,-1.7682,1.0039
Category Coherence,-2.1479,0.4602,-4.6673,0.0000,-3.0500,-1.2459
Category Distinctiveness,-1.0317,0.7228,-1.4274,0.1535,-2.4484,0.3850
Exemplar Typicality,0.1560,0.0700,2.2274,0.0259,0.0187,0.2933
Exemplar Similarity x Category Distinctiveness,-0.4302,0.8259,-0.5209,0.6025,-2.0491,1.1887
Total Sales,-5.483e-06,2.932e-06,-1.8699,0.0615,-1.123e-05,2.644e-07
Firm Size,-0.0007,0.0006,-1.2204,0.2223,-0.0019,0.0004
Market Share,0.7043,0.6225,1.1314,0.2579,-0.5158,1.9243
EPS,0.0160,0.0019,8.4466,0.0000,0.0123,0.0198


## Moderation Effect of Exemplar Typicality

In [15]:
# Moderation by exemplar typicality, dependent variable: analyst coverage.
# The interaction is the raw product of the two (uncentred) columns.
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Exemplar Typicality'])
regressors = [*iv, *moderators, interaction_term, *controls]

model7 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res7 = model7.fit(cov_type='clustered', cluster_entity=True)
res7

0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1555
Estimator:,PanelOLS,R-squared (Between):,0.2391
No. Observations:,55370,R-squared (Within):,0.1555
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.2309
Time:,15:37:48,Log-likelihood,-1.299e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,332.42
Entities:,8401,P-value,0.0000
Avg Obs:,6.5909,Distribution:,"F(26,46943)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-9.2704,3.6373,-2.5487,0.0108,-16.400,-2.1412
Category Coherence,28.651,2.9190,9.8151,0.0000,22.929,34.372
Category Distinctiveness,10.658,1.5063,7.0756,0.0000,7.7058,13.611
Exemplar Typicality,-10.601,3.1689,-3.3452,0.0008,-16.812,-4.3895
Exemplar Similarity x Exemplar Typicality,12.908,4.1299,3.1254,0.0018,4.8130,21.002
Total Sales,0.0003,4.755e-05,5.2594,0.0000,0.0002,0.0003
Firm Size,-0.0184,0.0092,-1.9918,0.0464,-0.0365,-0.0003
Market Share,-4.8982,10.300,-0.4756,0.6344,-25.086,15.289
EPS,0.1059,0.0151,7.0011,0.0000,0.0763,0.1356


In [16]:
# Moderation by exemplar typicality, dependent variable: analyst recommendations.
# The interaction column is rebuilt here so the cell does not rely on the previous one.
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Exemplar Typicality'])
regressors = [*iv, *moderators, interaction_term, *controls]

model8 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res8 = model8.fit(cov_type='clustered', cluster_entity=True)
res8

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0451
Estimator:,PanelOLS,R-squared (Between):,-0.0432
No. Observations:,38117,R-squared (Within):,0.0451
Date:,"Tue, Jun 20 2023",R-squared (Overall):,-0.0077
Time:,15:37:48,Log-likelihood,-1.557e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,56.978
Entities:,6718,P-value,0.0000
Avg Obs:,5.6739,Distribution:,"F(26,31373)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,0.1706,0.5448,0.3132,0.7541,-0.8972,1.2385
Category Coherence,-2.1803,0.4534,-4.8086,0.0000,-3.0690,-1.2916
Category Distinctiveness,-1.3893,0.2346,-5.9226,0.0000,-1.8491,-0.9295
Exemplar Typicality,0.3270,0.4816,0.6791,0.4971,-0.6169,1.2709
Exemplar Similarity x Exemplar Typicality,-0.2125,0.6229,-0.3411,0.7330,-1.4335,1.0085
Total Sales,-5.474e-06,2.93e-06,-1.8681,0.0618,-1.122e-05,2.695e-07
Firm Size,-0.0007,0.0006,-1.2140,0.2247,-0.0019,0.0004
Market Share,0.7078,0.6240,1.1343,0.2567,-0.5152,1.9309
EPS,0.0161,0.0019,8.4621,0.0000,0.0123,0.0198


# Output regression results

In [17]:
def create_table(*results):
    """Combine fitted panel-regression results into one summary table.

    Parameters
    ----------
    *results
        Fitted result objects, in the order they should be numbered. Each must
        expose ``params`` and ``pvalues`` (label-indexed, e.g. pandas Series),
        ``nobs``, ``rsquared`` and entity statistics with a ``'total'`` entry
        (``entity_info``, falling back to the private ``_entity_info``).

    Returns
    -------
    pandas.DataFrame
        One row per result, indexed ``Model1``, ``Model2``, ... For every
        coefficient there is a column with the estimate formatted to three
        decimals and a ``'<name> p-val'`` column with the p-value in
        parentheses. Coefficients absent from a model are NaN in its row.
        ``Observations``, ``R-squared`` and ``Entities`` are always the last
        three columns, also when later models add new coefficients.
        An empty DataFrame is returned when no results are given.
    """
    summary_cols = ['Observations', 'R-squared', 'Entities']

    # Build one plain dict per model and create the frame once at the end,
    # instead of re-concatenating a growing DataFrame on every iteration.
    records = []
    for result in results:
        coefficients = result.params
        pvalues = result.pvalues

        # Prefer the public accessor; fall back to the private attribute
        # for result objects that only provide that one.
        entity_info = getattr(result, 'entity_info', None)
        if entity_info is None:
            entity_info = result._entity_info

        record = {}
        for key in coefficients.keys():
            record[key] = "{:.3f}".format(coefficients[key])
            # The leading single quote is kept from the original output format
            # (presumably so spreadsheet software treats "(0.012)" as text).
            record[key + ' p-val'] = "'({:.3f})".format(pvalues[key])
        record['Observations'] = result.nobs
        record['R-squared'] = "{:.3f}".format(result.rsquared)
        record['Entities'] = entity_info['total']
        records.append(record)

    if not records:
        return pd.DataFrame()

    model_names = ['Model' + str(i) for i in range(1, len(records) + 1)]
    table = pd.DataFrame(records, index=model_names)

    # Coefficient columns first (in order of first appearance), summary statistics last.
    coef_cols = [col for col in table.columns if col not in summary_cols]
    return table[coef_cols + summary_cols]

In [18]:

# Gather the eight fitted models in estimation order (odd: Analyst Coverage,
# even: Analyst Recoms) and summarise them in a single table.
fitted_results = [res1, res2, res3, res4, res5, res6, res7, res8]
table = create_table(*fitted_results)


In [19]:
# Write the table transposed (variables as rows, models as columns).
# The file name follows ind_col so that results for another industry definition
# are not written under the 'naicsh3' name.
table.T.to_csv(f'../data/tables/main_regression_results_{ind_col}.csv')