# Import the libraries

In [1]:
import pandas as pd
from linearmodels import PanelOLS
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides warnings emitted by pandas and the model fits in later cells.
warnings.filterwarnings('ignore')

# Use a serif typeface for figures, with Times New Roman as the serif font
matplotlib.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
})

# Prepare the data

In [2]:
# Settings that select the input file: industry column, embedding model, embedding width
ind_col = 'naicsh4'
nlp_model = 'mpnet'
doc2vec_vector_size = 768 # 768 for mpnet, 1536 for ada

# Read the regression variables and turn +/-inf into NaN in a single expression.
# NOTE(review): unpickling runs arbitrary code — only load files produced by this project.
data = (
    pd.read_pickle(f'../data/all_reg_vars_{nlp_model}_{str(doc2vec_vector_size)}_{ind_col}.pkl')
    .replace([np.inf, -np.inf], np.nan)
)


### Create lead variables for the DVs

In [3]:
# Sort by firm (GVKEY) and year so that shift(-1) inside a firm returns the next observed row
reg_data = data.sort_values(by=['GVKEY', 'year'])
# Lead the dependent variables by one observation within each firm
reg_data['numrec_mean_0_lead'] = reg_data.groupby('GVKEY')['numrec_mean_0'].shift(-1)
reg_data['meanrec_mean_descend_lead'] = reg_data.groupby('GVKEY')['meanrec_mean_descend'].shift(-1)
# Year of the next observation and its distance from the current year
reg_data['year_lead'] = reg_data.groupby('GVKEY')['year'].shift(-1)
reg_data['year_diff'] = reg_data['year_lead'] - reg_data['year']
# Keep rows whose next observation is at most two years ahead. The last row of each
# firm has a NaN year_diff, fails the comparison, and is dropped as well.
# .copy() makes the filtered frame independent, so adding a column below is not a
# chained assignment on a slice of the sorted frame.
reg_data = reg_data[reg_data['year_diff'] <= 2].copy()
# Column of ones used as the intercept regressor
reg_data['constant'] = 1

In [4]:
# Limit the data to non-exemplar firms and industries with at least 10 firms.
# .copy() detaches the filtered rows so that later column assignments act on an
# independent frame instead of a slice of the unfiltered one.
reg_data = reg_data[(reg_data[f'exemplar_{ind_col}'] == 0) & (reg_data[f'no_firms_{ind_col}'] > 9)].copy()

In [5]:
# Refresh each firm's next observed year on the filtered sample.
# NOTE(review): 'year_diff' is not recomputed here — confirm that is intended.
reg_data = reg_data.assign(year_lead=reg_data.groupby('GVKEY')['year'].shift(-1))


### Rename the variables to make them more readable

In [6]:
# Mapping from raw column names to the labels used in the regression tables.
# Keys built with ind_col follow the industry classification chosen in the config cell.
rename_dict = {
    # dependent variables (one-year leads)
    'numrec_mean_0_lead': 'Analyst Coverage',
    'meanrec_mean_descend_lead': 'Analyst Recoms',
    # main independent variable
    f'exemplar_sim_{ind_col}': 'Exemplar Similarity',
    # firm-level variables
    'sale_wins_1': 'Total Sales',
    'n_emp': 'Firm Size',
    'EPS_wins_1': 'EPS',
    'slack_avail_wins_1': 'Available Slack',
    'rd_f_wins_1': 'R&D Expenditure',
    'adv_f_wins_1': 'Advertising Expenditure',
    'dpt_f_wins_1': 'Depreciation Ratio',
    'intang_f_wins_1': 'Intangible Assets Ratio',
    'n_segments': 'No. Segments',
    f'market_share_{ind_col}': 'Market Share',
    'mergers_wins_1': 'Mergers (expenditure)',
    'leverage_wins_1': 'Financial Leverage',
    'is_spx500': 'S&P500 Dummy',
    f'sim_mean_{ind_col}': 'Firm Typicality',
    # text-based measures (label spelling kept exactly as in the original notebook)
    'strong_weak_modal_ratio': 'Strong-Weak Modals Ratio',
    'positive_negative_ratio': 'Positive-Negative Words Ratio',
    'N_Litigious': 'Litigous Words Ratio',
    # industry / category-level variables
    f'n_analysts_{ind_col}': 'No. Analysts in Industry',
    f'no_firms_{ind_col}': 'No. Firms in Industry',
    f'category_coherence_{ind_col}': 'Category Coherence',
    f'numrec_avg_year_{ind_col}': 'Average Coverage (Year-Ind)',
    f'meanrec_avg_year_{ind_col}': 'Average Recoms (Year-Ind)',
    f'ind_sim_all_{ind_col}_distinct': 'Category Distinctiveness',
    f'ind_vecs_change_{ind_col}': 'Category Instability',
    f'hhi_{ind_col}': 'Industry HHI',
    # exemplar-level variables
    f'EPS_wins_1_ex_{ind_col}': 'Exemplar EPS',
    f'exemplar_{ind_col}_typicality': 'Exemplar Typicality',
    # intercept
    'constant': 'Constant',
}

# Apply the mapping to the column labels; columns not listed keep their names
reg_data = reg_data.rename(rename_dict, axis='columns')


### Define the list of independent/control variables


In [7]:
# Main explanatory variable
iv = ['Exemplar Similarity']

# Variables interacted with the main explanatory variable in the moderation models
moderators = [
    'Category Coherence',
    'Category Distinctiveness',
    'Exemplar Typicality',
]

# Control variables; 'Constant' is the column of ones serving as the intercept
controls = [
    'Total Sales',
    'Firm Size',
    'Market Share',
    'EPS',
    'Available Slack',
    'R&D Expenditure',
    'Advertising Expenditure',
    'Intangible Assets Ratio',
    'Depreciation Ratio',
    'Firm Typicality',
    'No. Segments',
    'Mergers (expenditure)',
    'Financial Leverage',
    'S&P500 Dummy',
    'No. Analysts in Industry',
    'No. Firms in Industry',
    'Average Coverage (Year-Ind)',
    'Average Recoms (Year-Ind)',
    'Category Instability',
    'Industry HHI',
    'Exemplar EPS',
    'Constant',
]

# Dependent variables
dv = ['Analyst Coverage', 'Analyst Recoms']

In [8]:
# Index the panel by firm (GVKEY) and year, the entity/time pair passed to PanelOLS below
reg_data = reg_data.set_index(keys=['GVKEY', 'year'])

# Regression Models

## Effect of Exemplar similarity on Analyst Coverage

In [9]:
# Baseline model for Analyst Coverage: firm fixed effects, standard errors clustered by firm
regressors = [*iv, *moderators, *controls]
model1 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res1 = model1.fit(cov_type='clustered', cluster_entity=True)
res1


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1265
Estimator:,PanelOLS,R-squared (Between):,0.2111
No. Observations:,62166,R-squared (Within):,0.1265
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1806
Time:,15:37:28,Log-likelihood,-1.402e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,306.31
Entities:,9283,P-value,0.0000
Avg Obs:,6.6968,Distribution:,"F(25,52858)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,2.5036,0.4556,5.4953,0.0000,1.6106,3.3965
Category Coherence,18.108,2.2223,8.1483,0.0000,13.752,22.463
Category Distinctiveness,3.9828,1.3585,2.9318,0.0034,1.3202,6.6455
Exemplar Typicality,-1.4512,0.5251,-2.7638,0.0057,-2.4804,-0.4220
Total Sales,0.0002,4.238e-05,4.1697,0.0000,9.364e-05,0.0003
Firm Size,-0.0024,0.0089,-0.2667,0.7897,-0.0198,0.0151
Market Share,0.3499,2.2869,0.1530,0.8784,-4.1323,4.8322
EPS,0.0913,0.0127,7.2076,0.0000,0.0664,0.1161
Available Slack,0.0106,0.0079,1.3471,0.1780,-0.0048,0.0261


## Effect of Exemplar Similarity on Analyst Recommendations

In [10]:
# Baseline model for Analyst Recoms: firm fixed effects, standard errors clustered by firm
regressors = [*iv, *moderators, *controls]
model2 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res2 = model2.fit(cov_type='clustered', cluster_entity=True)
res2

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0522
Estimator:,PanelOLS,R-squared (Between):,-0.0228
No. Observations:,40657,R-squared (Within):,0.0522
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0239
Time:,15:37:29,Log-likelihood,-1.821e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,73.365
Entities:,7342,P-value,0.0000
Avg Obs:,5.5376,Distribution:,"F(25,33290)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-0.2780,0.0914,-3.0431,0.0023,-0.4571,-0.0989
Category Coherence,-1.6086,0.3755,-4.2836,0.0000,-2.3446,-0.8725
Category Distinctiveness,-0.6739,0.2465,-2.7334,0.0063,-1.1572,-0.1907
Exemplar Typicality,-0.0118,0.1121,-0.1049,0.9164,-0.2316,0.2080
Total Sales,-1.103e-05,2.428e-06,-4.5418,0.0000,-1.579e-05,-6.269e-06
Firm Size,-0.0016,0.0011,-1.4876,0.1369,-0.0038,0.0005
Market Share,-0.2130,0.2132,-0.9990,0.3178,-0.6310,0.2049
EPS,0.0165,0.0018,8.9334,0.0000,0.0129,0.0201
Available Slack,-0.0042,0.0017,-2.4079,0.0160,-0.0076,-0.0008


## Moderation Effect of Category Coherence

In [11]:
# Moderation model: Analyst Coverage with the Exemplar Similarity x Category Coherence product
interaction_term = 'Exemplar Similarity x Category Coherence'
reg_data[interaction_term] = reg_data['Exemplar Similarity'] * reg_data['Category Coherence']
# The product term is placed right after the moderators, ahead of the controls
regressors = [*iv, *moderators, interaction_term, *controls]

model3 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res3 = model3.fit(cov_type='clustered', cluster_entity=True)
res3


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1293
Estimator:,PanelOLS,R-squared (Between):,0.2033
No. Observations:,62166,R-squared (Within):,0.1293
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1740
Time:,15:37:30,Log-likelihood,-1.401e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,301.88
Entities:,9283,P-value,0.0000
Avg Obs:,6.6968,Distribution:,"F(26,52857)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-51.325,7.1455,-7.1828,0.0000,-65.330,-37.320
Category Coherence,-33.174,6.9401,-4.7801,0.0000,-46.777,-19.572
Category Distinctiveness,3.3267,1.3506,2.4632,0.0138,0.6796,5.9738
Exemplar Typicality,-1.4874,0.5193,-2.8644,0.0042,-2.5051,-0.4696
Exemplar Similarity x Category Coherence,62.240,8.3186,7.4821,0.0000,45.936,78.545
Total Sales,0.0002,4.191e-05,4.1463,0.0000,9.162e-05,0.0003
Firm Size,-0.0021,0.0089,-0.2338,0.8152,-0.0195,0.0153
Market Share,0.3240,2.2601,0.1434,0.8860,-4.1057,4.7538
EPS,0.0919,0.0126,7.3036,0.0000,0.0672,0.1166


In [12]:
# Moderation model: Analyst Recoms with the Exemplar Similarity x Category Coherence product.
# The product column is (re)built here so the cell does not depend on the coverage cell.
interaction_term = 'Exemplar Similarity x Category Coherence'
reg_data[interaction_term] = reg_data['Exemplar Similarity'] * reg_data['Category Coherence']
regressors = [*iv, *moderators, interaction_term, *controls]

model4 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res4 = model4.fit(cov_type='clustered', cluster_entity=True)
res4

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0525
Estimator:,PanelOLS,R-squared (Between):,-0.0193
No. Observations:,40657,R-squared (Within):,0.0525
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0284
Time:,15:37:30,Log-likelihood,-1.82e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,70.994
Entities:,7342,P-value,0.0000
Avg Obs:,5.5376,Distribution:,"F(26,33289)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,3.1254,1.3259,2.3573,0.0184,0.5267,5.7241
Category Coherence,1.6916,1.3082,1.2931,0.1960,-0.8724,4.2557
Category Distinctiveness,-0.6522,0.2461,-2.6498,0.0081,-1.1346,-0.1698
Exemplar Typicality,-0.0067,0.1119,-0.0596,0.9524,-0.2261,0.2127
Exemplar Similarity x Category Coherence,-3.9113,1.5152,-2.5814,0.0098,-6.8812,-0.9415
Total Sales,-1.094e-05,2.414e-06,-4.5323,0.0000,-1.567e-05,-6.209e-06
Firm Size,-0.0016,0.0011,-1.4978,0.1342,-0.0038,0.0005
Market Share,-0.2045,0.2126,-0.9617,0.3362,-0.6212,0.2123
EPS,0.0164,0.0018,8.9233,0.0000,0.0128,0.0200


## Moderation Effect of Category Distinctiveness

In [13]:
# Moderation model: Analyst Coverage with the Exemplar Similarity x Category Distinctiveness product
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
reg_data[interaction_term] = reg_data['Exemplar Similarity'] * reg_data['Category Distinctiveness']
# The product term is placed right after the moderators, ahead of the controls
regressors = [*iv, *moderators, interaction_term, *controls]

model5 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res5 = model5.fit(cov_type='clustered', cluster_entity=True)
res5


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1282
Estimator:,PanelOLS,R-squared (Between):,0.2112
No. Observations:,62166,R-squared (Within):,0.1282
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1816
Time:,15:37:31,Log-likelihood,-1.401e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,298.94
Entities:,9283,P-value,0.0000
Avg Obs:,6.6968,Distribution:,"F(26,52857)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,32.069,4.5079,7.1139,0.0000,23.233,40.904
Category Coherence,17.468,2.2217,7.8626,0.0000,13.114,21.823
Category Distinctiveness,-26.326,4.5301,-5.8114,0.0000,-35.205,-17.447
Exemplar Typicality,-1.2926,0.5267,-2.4543,0.0141,-2.3250,-0.2603
Exemplar Similarity x Category Distinctiveness,35.013,5.3342,6.5640,0.0000,24.558,45.468
Total Sales,0.0002,4.251e-05,4.1197,0.0000,9.181e-05,0.0003
Firm Size,-0.0023,0.0089,-0.2598,0.7950,-0.0198,0.0151
Market Share,0.4084,2.2939,0.1780,0.8587,-4.0877,4.9045
EPS,0.0915,0.0126,7.2363,0.0000,0.0667,0.1163


In [14]:
# Moderation model: Analyst Recoms with the Exemplar Similarity x Category Distinctiveness product.
# The product column is (re)built here so the cell does not depend on the coverage cell.
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
reg_data[interaction_term] = reg_data['Exemplar Similarity'] * reg_data['Category Distinctiveness']
regressors = [*iv, *moderators, interaction_term, *controls]

model6 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res6 = model6.fit(cov_type='clustered', cluster_entity=True)
res6

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0523
Estimator:,PanelOLS,R-squared (Between):,-0.0285
No. Observations:,40657,R-squared (Within):,0.0523
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0211
Time:,15:37:31,Log-likelihood,-1.821e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,70.681
Entities:,7342,P-value,0.0000
Avg Obs:,5.5376,Distribution:,"F(26,33289)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-1.5972,1.0297,-1.5511,0.1209,-3.6155,0.4211
Category Coherence,-1.5820,0.3764,-4.2032,0.0000,-2.3198,-0.8443
Category Distinctiveness,0.6832,1.0618,0.6435,0.5199,-1.3978,2.7643
Exemplar Typicality,-0.0190,0.1126,-0.1685,0.8662,-0.2397,0.2017
Exemplar Similarity x Category Distinctiveness,-1.5589,1.2182,-1.2796,0.2007,-3.9466,0.8289
Total Sales,-1.098e-05,2.428e-06,-4.5199,0.0000,-1.574e-05,-6.216e-06
Firm Size,-0.0016,0.0011,-1.4901,0.1362,-0.0038,0.0005
Market Share,-0.2158,0.2132,-1.0122,0.3115,-0.6337,0.2021
EPS,0.0165,0.0018,8.9268,0.0000,0.0128,0.0201


## Moderation Effect of Exemplar Typicality

In [15]:
# Moderation model: Analyst Coverage with the Exemplar Similarity x Exemplar Typicality product
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
reg_data[interaction_term] = reg_data['Exemplar Similarity'] * reg_data['Exemplar Typicality']
# The product term is placed right after the moderators, ahead of the controls
regressors = [*iv, *moderators, interaction_term, *controls]

model7 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res7 = model7.fit(cov_type='clustered', cluster_entity=True)
res7

0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1279
Estimator:,PanelOLS,R-squared (Between):,0.2084
No. Observations:,62166,R-squared (Within):,0.1279
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1785
Time:,15:37:32,Log-likelihood,-1.401e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,298.21
Entities:,9283,P-value,0.0000
Avg Obs:,6.6968,Distribution:,"F(26,52857)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-17.165,2.9708,-5.7779,0.0000,-22.988,-11.342
Category Coherence,18.455,2.2091,8.3540,0.0000,14.125,22.785
Category Distinctiveness,3.6160,1.3570,2.6646,0.0077,0.9562,6.2757
Exemplar Typicality,-19.890,2.8026,-7.0969,0.0000,-25.383,-14.397
Exemplar Similarity x Exemplar Typicality,23.021,3.5633,6.4605,0.0000,16.036,30.005
Total Sales,0.0002,4.223e-05,4.1276,0.0000,9.153e-05,0.0003
Firm Size,-0.0022,0.0089,-0.2494,0.8030,-0.0197,0.0153
Market Share,0.3418,2.2773,0.1501,0.8807,-4.1217,4.8054
EPS,0.0916,0.0126,7.2600,0.0000,0.0669,0.1163


In [16]:
# Moderation model: Analyst Recoms with the Exemplar Similarity x Exemplar Typicality product.
# The product column is (re)built here so the cell does not depend on the coverage cell.
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
reg_data[interaction_term] = reg_data['Exemplar Similarity'] * reg_data['Exemplar Typicality']
regressors = [*iv, *moderators, interaction_term, *controls]

model8 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
res8 = model8.fit(cov_type='clustered', cluster_entity=True)
res8

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0524
Estimator:,PanelOLS,R-squared (Between):,-0.0214
No. Observations:,40657,R-squared (Within):,0.0524
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0255
Time:,15:37:32,Log-likelihood,-1.821e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,70.742
Entities:,7342,P-value,0.0000
Avg Obs:,5.5376,Distribution:,"F(26,33289)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,0.8665,0.6241,1.3884,0.1650,-0.3567,2.0898
Category Coherence,-1.6170,0.3746,-4.3161,0.0000,-2.3513,-0.8827
Category Distinctiveness,-0.6589,0.2464,-2.6737,0.0075,-1.1419,-0.1759
Exemplar Typicality,1.0714,0.5925,1.8083,0.0706,-0.0899,2.2327
Exemplar Similarity x Exemplar Typicality,-1.3342,0.7185,-1.8569,0.0633,-2.7425,0.0741
Total Sales,-1.092e-05,2.422e-06,-4.5081,0.0000,-1.567e-05,-6.172e-06
Firm Size,-0.0016,0.0011,-1.4909,0.1360,-0.0038,0.0005
Market Share,-0.2055,0.2133,-0.9634,0.3353,-0.6237,0.2126
EPS,0.0165,0.0018,8.9365,0.0000,0.0129,0.0201


# Output regression results

In [17]:
def create_table(*results):
    """Collect coefficients and fit statistics of fitted models into one table.

    Parameters
    ----------
    *results
        Fitted result objects exposing ``params`` and ``pvalues`` (label-indexed,
        e.g. pandas Series), ``nobs``, ``rsquared`` and per-entity statistics
        with a ``'total'`` entry (``entity_info``, or ``_entity_info`` as a
        fallback).

    Returns
    -------
    pandas.DataFrame
        One row per result, labelled ``Model1``, ``Model2``, ... in call order.
        Each regressor contributes a column holding the coefficient formatted to
        three decimals and a ``'<name> p-val'`` column holding the p-value as
        ``'(0.000)``. A regressor missing from a model is NaN in that row.
        ``Observations``, ``R-squared`` and ``Entities`` are always the last
        three columns, also when later models introduce additional regressors.
        An empty DataFrame is returned when no results are passed.
    """
    summary_cols = ['Observations', 'R-squared', 'Entities']

    records = []
    for result in results:
        coefficients = result.params
        pvalues = result.pvalues

        # Prefer the public accessor; fall back to the private attribute the
        # original implementation relied on.
        entity_info = getattr(result, 'entity_info', None)
        if entity_info is None:
            entity_info = result._entity_info

        record = {}
        for key in coefficients.keys():
            record[key] = "{:.3f}".format(coefficients[key])
            # Leading single quote: presumably so spreadsheet software keeps
            # "(0.012)" as text instead of reading it as a negative number.
            record[key + ' p-val'] = "'({:.3f})".format(pvalues[key])
        record['Observations'] = result.nobs
        record['R-squared'] = "{:.3f}".format(result.rsquared)
        record['Entities'] = entity_info['total']
        records.append(record)

    if not records:
        return pd.DataFrame()

    # Build the frame once instead of concatenating row by row
    labels = ['Model' + str(i) for i in range(1, len(records) + 1)]
    table = pd.DataFrame(records, index=labels)

    # Regressors that first appear in a later model are appended after the
    # summary columns by the union of keys; move the summary columns to the end.
    coef_cols = [col for col in table.columns if col not in summary_cols]
    return table[coef_cols + summary_cols]

In [18]:

# Summarise the eight fitted models in one table; rows follow the model numbering
table = create_table(*[res1, res2, res3, res4, res5, res6, res7, res8])


In [19]:
# Write the table transposed (variables as rows, models as columns). The file name
# follows ind_col from the configuration cell instead of a hard-coded industry code.
table.T.to_csv(f'../data/tables/main_regression_results_{ind_col}.csv')