# Import the libraries

In [1]:
import pandas as pd
from linearmodels import PanelOLS
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import warnings

# Silence every warning category for the remainder of the notebook.
warnings.filterwarnings('ignore')

# Use a serif typeface for figures, with Times New Roman as the serif font.
matplotlib.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
})

# Prepare the data

In [2]:
# Settings that select which pre-built regression dataset is loaded
# (all three are interpolated into the pickle file name below).
ind_col = 'naicsh6'
nlp_model = 'mpnet'
doc2vec_vector_size = 768  # vector size: 768 for mpnet, 1536 for ada

# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle(f'../data/all_reg_vars_{nlp_model}_{str(doc2vec_vector_size)}_{ind_col}.pkl')
# Convert +inf / -inf to NaN so they are handled as missing values.
data = data.replace([np.inf, -np.inf], np.nan)


### Create lead variables for the DVs

In [3]:
# Sort by GVKEY and year so that shift(-1) within a GVKEY returns the next observed row
reg_data = data.sort_values(by=['GVKEY', 'year'])
# Lead the dependent variables by one observation within each GVKEY
reg_data['numrec_mean_0_lead'] = reg_data.groupby('GVKEY')['numrec_mean_0'].shift(-1)
reg_data['meanrec_mean_descend_lead'] = reg_data.groupby('GVKEY')['meanrec_mean_descend'].shift(-1)
# Year of the next observation and the gap (in years) to it
reg_data['year_lead'] = reg_data.groupby('GVKEY')['year'].shift(-1)
reg_data['year_diff'] = reg_data['year_lead'] - reg_data['year']
# Keep rows whose next observation is at most two years ahead. Rows with no next
# observation have a NaN gap, fail the comparison, and are dropped as well.
# .copy() yields an independent frame so the column assignment below does not
# write into a slice of the sorted data (chained-assignment pitfall).
reg_data = reg_data[reg_data['year_diff'] <= 2].copy()
# Column of ones, used later as the regression intercept
reg_data['constant'] = 1

In [4]:
# Limit the data to non-exemplar firms and industries with at least 10 firms.
# .copy() makes the filtered result an independent frame, so later column
# assignments on reg_data do not write into a slice of the unfiltered data.
reg_data = reg_data[(reg_data[f'exemplar_{ind_col}'] == 0) & (reg_data[f'no_firms_{ind_col}'] > 9)].copy()

In [5]:
# Recompute the next observed year per GVKEY on the rows currently in reg_data.
reg_data = reg_data.assign(year_lead=reg_data.groupby('GVKEY')['year'].shift(-1))

### Rename the variables to make them more readable

In [6]:
# Map raw column names to the human-readable labels used in the regression output.
# Every source column is listed exactly once: a dict literal silently keeps only
# the last value for a repeated key, which can hide stale or misspelled labels.
rename_dict = {
    # dependent variables (one-period leads)
    'numrec_mean_0_lead': 'Analyst Coverage',
    'meanrec_mean_descend_lead': 'Analyst Recoms',
    # main independent variable
    f'exemplar_sim_{ind_col}': 'Exemplar Similarity',
    # firm-level variables
    'sale_wins_1': 'Total Sales',
    'n_emp': 'Firm Size',
    'EPS_wins_1': 'EPS',
    'slack_avail_wins_1': 'Available Slack',
    'rd_f_wins_1': 'R&D Expenditure',
    'adv_f_wins_1': 'Advertising Expenditure',
    'dpt_f_wins_1': 'Depreciation Ratio',
    'intang_f_wins_1': 'Intangible Assets Ratio',
    'n_segments': 'No. Segments',
    f'market_share_{ind_col}': 'Market Share',
    'mergers_wins_1': 'Mergers (expenditure)',
    'leverage_wins_1': 'Financial Leverage',
    'is_spx500': 'S&P500 Dummy',
    f'sim_mean_{ind_col}': 'Firm Typicality',
    # text-based word ratios
    'strong_weak_modal_ratio': 'Strong-Weak Modals Ratio',
    'positive_negative_ratio': 'Positive-Negative Words Ratio',
    'N_Uncertainty': 'Uncertainty Words Ratio',
    'N_Litigious': 'Litigious Words Ratio',
    # industry / category-level variables
    f'n_analysts_{ind_col}': 'No. Analysts in Industry',
    f'no_firms_{ind_col}': 'No. Firms in Industry',
    f'category_coherence_{ind_col}': 'Category Coherence',
    f'numrec_avg_year_{ind_col}': 'Average Coverage (Year-Ind)',
    f'meanrec_avg_year_{ind_col}': 'Average Recoms (Year-Ind)',
    f'ind_sim_all_{ind_col}_distinct': 'Category Distinctiveness',
    f'ind_vecs_change_{ind_col}': 'Category Instability',
    f'hhi_{ind_col}': 'Industry HHI',
    f'EPS_wins_1_ex_{ind_col}': 'Exemplar EPS',
    f'exemplar_{ind_col}_typicality': 'Exemplar Typicality',
    # additional measures
    'strategy_unique': 'Strategy Uniqueness',
    'sim_10nearest': '10-Nearest Similarity',
    'firm_average_analyst_coverage': 'Analyst Average Portfolio Size',
    'firm_complexity': 'Firm Complexity',
    'constant': 'Constant',
}

# rename columns
reg_data = reg_data.rename(columns=rename_dict)


In [7]:
# Summary statistics of the 'Analyst Average Portfolio Size' column.
reg_data.loc[:, 'Analyst Average Portfolio Size'].describe()

count    41801.000000
mean        18.161131
std          9.022695
min          1.000000
25%         13.600000
50%         17.400000
75%         21.750000
max        203.000000
Name: Analyst Average Portfolio Size, dtype: float64

In [112]:
# Fill missing values of the word-ratio columns with each column's own mean,
# so that rows with a missing ratio are not lost.
for ratio_col in (
    'Positive-Negative Words Ratio',
    'Strong-Weak Modals Ratio',
    'Uncertainty Words Ratio',
    'Litigious Words Ratio',
):
    reg_data[ratio_col] = reg_data[ratio_col].fillna(reg_data[ratio_col].mean())


In [113]:
# Pairwise correlations between selected measures, printed one per line
# in the order listed.
for left_col, right_col in [
    ('Strategy Uniqueness', 'Exemplar Similarity'),
    ('Strategy Uniqueness', 'Analyst Average Portfolio Size'),
    ('Firm Typicality', 'Analyst Average Portfolio Size'),
    ('Exemplar Similarity', 'Analyst Average Portfolio Size'),
    ('Firm Typicality', 'Exemplar Similarity'),
    ('10-Nearest Similarity', 'Exemplar Similarity'),
    ('Firm Typicality', 'Firm Complexity'),
]:
    print(reg_data[left_col].corr(reg_data[right_col]))

-0.1074931704852451
-0.035294162052742196
0.11670300609786073
0.16977470020910101
0.7476796139665128
0.7882235176243836
-0.009676309736095625


### Define the list of independent/control variables


In [114]:
# Main independent variable.
iv = ['Exemplar Similarity']

# Moderators; each is interacted with the independent variable in a later model.
moderators = [
    'Category Coherence',
    'Category Distinctiveness',
    'Exemplar Typicality',
]

# Control variables; 'Constant' is the column of ones acting as the intercept.
controls = [
    'Total Sales',
    'Firm Size',
    'Market Share',
    'EPS',
    'Available Slack',
    'R&D Expenditure',
    'Advertising Expenditure',
    'Intangible Assets Ratio',
    'Depreciation Ratio',
    'No. Segments',
    'Mergers (expenditure)',
    'Financial Leverage',
    'S&P500 Dummy',
    'No. Analysts in Industry',
    'No. Firms in Industry',
    'Average Coverage (Year-Ind)',
    'Average Recoms (Year-Ind)',
    'Category Instability',
    'Industry HHI',
    'Exemplar EPS',
    'Constant',
]

# Dependent variables.
dv = ['Analyst Coverage', 'Analyst Recoms']

In [115]:
# Index the frame by (GVKEY, year): GVKEY is the first level, year the second.
reg_data = reg_data.set_index(keys=['GVKEY', 'year'])

# Regression Models

## Effect of Exemplar similarity on Analyst Coverage

In [116]:
# Baseline specification for analyst coverage: independent variable, moderators
# and controls, without any interaction term.
regressors = [*iv, *moderators, *controls]
model1 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity fixed effects with standard errors clustered by entity.
res1 = model1.fit(cov_type='clustered', cluster_entity=True)
res1


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1184
Estimator:,PanelOLS,R-squared (Between):,0.2010
No. Observations:,46786,R-squared (Within):,0.1184
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1779
Time:,16:05:15,Log-likelihood,-1.06e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,219.15
Entities:,7603,P-value,0.0000
Avg Obs:,6.1536,Distribution:,"F(24,39159)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,2.5935,0.4844,5.3539,0.0000,1.6440,3.5430
Category Coherence,21.627,2.3526,9.1926,0.0000,17.016,26.238
Category Distinctiveness,-2.6153,1.9480,-1.3426,0.1794,-6.4334,1.2027
Exemplar Typicality,-1.4134,0.6999,-2.0194,0.0435,-2.7853,-0.0416
Total Sales,0.0001,4.861e-05,2.5857,0.0097,3.042e-05,0.0002
Firm Size,0.0054,0.0074,0.7289,0.4661,-0.0091,0.0199
Market Share,3.0213,2.2850,1.3222,0.1861,-1.4574,7.5000
EPS,0.0678,0.0157,4.3254,0.0000,0.0371,0.0985
Available Slack,0.0053,0.0088,0.5979,0.5499,-0.0120,0.0226


## Effect of Exemplar Similarity on Analyst Recommendations

In [117]:
# Baseline specification for analyst recommendations: independent variable,
# moderators and controls, without any interaction term.
regressors = [*iv, *moderators, *controls]
model2 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity fixed effects with standard errors clustered by entity.
res2 = model2.fit(cov_type='clustered', cluster_entity=True)
res2

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0526
Estimator:,PanelOLS,R-squared (Between):,0.0297
No. Observations:,30688,R-squared (Within):,0.0526
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0410
Time:,16:05:16,Log-likelihood,-1.268e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,57.081
Entities:,5975,P-value,0.0000
Avg Obs:,5.1361,Distribution:,"F(24,24689)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-0.2940,0.0984,-2.9868,0.0028,-0.4870,-0.1011
Category Coherence,-1.8056,0.4235,-4.2633,0.0000,-2.6358,-0.9755
Category Distinctiveness,-0.2330,0.2694,-0.8646,0.3873,-0.7611,0.2952
Exemplar Typicality,-0.0478,0.1406,-0.3395,0.7342,-0.3234,0.2279
Total Sales,-3.81e-06,4.048e-06,-0.9412,0.3466,-1.174e-05,4.124e-06
Firm Size,-0.0014,0.0014,-0.9935,0.3205,-0.0041,0.0014
Market Share,-0.3095,0.2092,-1.4799,0.1389,-0.7195,0.1004
EPS,0.0144,0.0020,7.0462,0.0000,0.0104,0.0184
Available Slack,-0.0025,0.0019,-1.3161,0.1882,-0.0061,0.0012


## Moderation Effect of Category Coherence

In [118]:
# Interaction model - analyst coverage, moderated by category coherence
interaction_term = 'Exemplar Similarity x Category Coherence'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Coherence'])
# The interaction term is placed between the moderators and the controls.
regressors = [*iv, *moderators, interaction_term, *controls]

model3 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity fixed effects with standard errors clustered by entity.
res3 = model3.fit(cov_type='clustered', cluster_entity=True)
res3


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1214
Estimator:,PanelOLS,R-squared (Between):,0.1940
No. Observations:,46786,R-squared (Within):,0.1214
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1703
Time:,16:05:16,Log-likelihood,-1.059e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,216.42
Entities:,7603,P-value,0.0000
Avg Obs:,6.1536,Distribution:,"F(25,39158)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-48.677,7.1326,-6.8246,0.0000,-62.657,-34.697
Category Coherence,-27.885,7.1830,-3.8821,0.0001,-41.964,-13.806
Category Distinctiveness,-2.7387,1.9308,-1.4185,0.1561,-6.5231,1.0456
Exemplar Typicality,-1.4610,0.6902,-2.1168,0.0343,-2.8137,-0.1082
Exemplar Similarity x Category Coherence,58.376,8.1944,7.1238,0.0000,42.314,74.437
Total Sales,0.0001,4.826e-05,2.5496,0.0108,2.846e-05,0.0002
Firm Size,0.0057,0.0074,0.7688,0.4420,-0.0089,0.0203
Market Share,2.8565,2.2760,1.2550,0.2095,-1.6045,7.3175
EPS,0.0684,0.0156,4.3905,0.0000,0.0379,0.0989


In [119]:
# Interaction model - analyst recommendations, moderated by category coherence
interaction_term = 'Exemplar Similarity x Category Coherence'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Coherence'])
# The interaction term is placed between the moderators and the controls.
regressors = [*iv, *moderators, interaction_term, *controls]

model4 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity fixed effects with standard errors clustered by entity.
res4 = model4.fit(cov_type='clustered', cluster_entity=True)
res4

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0529
Estimator:,PanelOLS,R-squared (Between):,0.0327
No. Observations:,30688,R-squared (Within):,0.0529
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0450
Time:,16:05:16,Log-likelihood,-1.267e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,55.116
Entities:,5975,P-value,0.0000
Avg Obs:,5.1361,Distribution:,"F(25,24688)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,2.6221,1.5058,1.7414,0.0816,-0.3292,5.5735
Category Coherence,1.0545,1.5116,0.6976,0.4854,-1.9084,4.0174
Category Distinctiveness,-0.2430,0.2700,-0.9001,0.3681,-0.7721,0.2861
Exemplar Typicality,-0.0466,0.1401,-0.3322,0.7397,-0.3213,0.2281
Exemplar Similarity x Category Coherence,-3.3067,1.7013,-1.9436,0.0520,-6.6414,0.0280
Total Sales,-3.705e-06,4.044e-06,-0.9162,0.3596,-1.163e-05,4.222e-06
Firm Size,-0.0014,0.0014,-0.9970,0.3188,-0.0042,0.0014
Market Share,-0.3009,0.2085,-1.4434,0.1489,-0.7095,0.1077
EPS,0.0144,0.0020,7.0475,0.0000,0.0104,0.0184


## Moderation Effect of Category Distinctiveness

In [120]:
# Interaction model - analyst coverage, moderated by category distinctiveness
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Distinctiveness'])
# The interaction term is placed between the moderators and the controls.
regressors = [*iv, *moderators, interaction_term, *controls]

model5 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity fixed effects with standard errors clustered by entity.
res5 = model5.fit(cov_type='clustered', cluster_entity=True)
res5


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1206
Estimator:,PanelOLS,R-squared (Between):,0.2048
No. Observations:,46786,R-squared (Within):,0.1206
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1810
Time:,16:05:17,Log-likelihood,-1.059e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,214.72
Entities:,7603,P-value,0.0000
Avg Obs:,6.1536,Distribution:,"F(25,39158)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,33.855,5.0210,6.7426,0.0000,24.013,43.696
Category Coherence,21.071,2.3421,8.9969,0.0000,16.481,25.662
Category Distinctiveness,-36.030,5.6181,-6.4132,0.0000,-47.042,-25.018
Exemplar Typicality,-0.8915,0.7010,-1.2718,0.2035,-2.2654,0.4825
Exemplar Similarity x Category Distinctiveness,38.828,6.2060,6.2565,0.0000,26.664,50.992
Total Sales,0.0001,4.875e-05,2.5395,0.0111,2.825e-05,0.0002
Firm Size,0.0056,0.0074,0.7530,0.4515,-0.0090,0.0202
Market Share,2.9438,2.2788,1.2918,0.1964,-1.5227,7.4103
EPS,0.0682,0.0156,4.3673,0.0000,0.0376,0.0987


In [121]:
# Interaction model - analyst recommendations, moderated by category distinctiveness
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Distinctiveness'])
# The interaction term is placed between the moderators and the controls.
regressors = [*iv, *moderators, interaction_term, *controls]

model6 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity fixed effects with standard errors clustered by entity.
res6 = model6.fit(cov_type='clustered', cluster_entity=True)
res6

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0530
Estimator:,PanelOLS,R-squared (Between):,0.0188
No. Observations:,30688,R-squared (Within):,0.0530
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0341
Time:,16:05:17,Log-likelihood,-1.267e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,55.256
Entities:,5975,P-value,0.0000
Avg Obs:,5.1361,Distribution:,"F(25,24688)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-2.7531,1.0368,-2.6553,0.0079,-4.7853,-0.7208
Category Coherence,-1.7684,0.4229,-4.1815,0.0000,-2.5974,-0.9395
Category Distinctiveness,2.4133,1.1290,2.1376,0.0326,0.2004,4.6262
Exemplar Typicality,-0.0790,0.1420,-0.5563,0.5780,-0.3574,0.1994
Exemplar Similarity x Category Distinctiveness,-3.0431,1.2813,-2.3750,0.0176,-5.5546,-0.5316
Total Sales,-3.671e-06,4.027e-06,-0.9116,0.3620,-1.157e-05,4.222e-06
Firm Size,-0.0014,0.0014,-1.0011,0.3168,-0.0041,0.0013
Market Share,-0.3004,0.2072,-1.4503,0.1470,-0.7065,0.1056
EPS,0.0143,0.0020,7.0041,0.0000,0.0103,0.0183


## Moderation Effect of Exemplar Typicality

In [122]:
# Interaction model - analyst coverage, moderated by exemplar typicality
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Exemplar Typicality'])
# The interaction term is placed between the moderators and the controls.
regressors = [*iv, *moderators, interaction_term, *controls]

model7 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity fixed effects with standard errors clustered by entity.
res7 = model7.fit(cov_type='clustered', cluster_entity=True)
res7

0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1194
Estimator:,PanelOLS,R-squared (Between):,0.1996
No. Observations:,46786,R-squared (Within):,0.1194
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1762
Time:,16:05:17,Log-likelihood,-1.059e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,212.41
Entities:,7603,P-value,0.0000
Avg Obs:,6.1536,Distribution:,"F(25,39158)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-13.343,3.2679,-4.0830,0.0000,-19.748,-6.9378
Category Coherence,21.156,2.3421,9.0331,0.0000,16.566,25.747
Category Distinctiveness,-2.7506,1.9388,-1.4187,0.1560,-6.5507,1.0495
Exemplar Typicality,-15.759,3.0255,-5.2087,0.0000,-21.689,-9.8288
Exemplar Similarity x Exemplar Typicality,18.021,3.7872,4.7584,0.0000,10.598,25.444
Total Sales,0.0001,4.851e-05,2.5642,0.0103,2.931e-05,0.0002
Firm Size,0.0055,0.0074,0.7444,0.4567,-0.0090,0.0201
Market Share,2.9371,2.2818,1.2872,0.1980,-1.5353,7.4095
EPS,0.0680,0.0156,4.3535,0.0000,0.0374,0.0987


In [123]:
# Interaction model - analyst recommendations, moderated by exemplar typicality
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Exemplar Typicality'])
# The interaction term is placed between the moderators and the controls.
regressors = [*iv, *moderators, interaction_term, *controls]

model8 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity fixed effects with standard errors clustered by entity.
res8 = model8.fit(cov_type='clustered', cluster_entity=True)
res8

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0532
Estimator:,PanelOLS,R-squared (Between):,0.0320
No. Observations:,30688,R-squared (Within):,0.0532
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0442
Time:,16:05:18,Log-likelihood,-1.267e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,55.450
Entities:,5975,P-value,0.0000
Avg Obs:,5.1361,Distribution:,"F(25,24688)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,1.8880,0.6896,2.7378,0.0062,0.5363,3.2396
Category Coherence,-1.7399,0.4212,-4.1313,0.0000,-2.5654,-0.9144
Category Distinctiveness,-0.2208,0.2709,-0.8150,0.4151,-0.7516,0.3101
Exemplar Typicality,1.9458,0.6427,3.0277,0.0025,0.6861,3.2054
Exemplar Similarity x Exemplar Typicality,-2.4589,0.7753,-3.1716,0.0015,-3.9785,-0.9393
Total Sales,-3.723e-06,4.066e-06,-0.9155,0.3599,-1.169e-05,4.248e-06
Firm Size,-0.0014,0.0014,-1.0005,0.3171,-0.0042,0.0014
Market Share,-0.2926,0.2090,-1.4004,0.1614,-0.7022,0.1169
EPS,0.0144,0.0020,7.0614,0.0000,0.0104,0.0184


# Output regression results

In [124]:
def create_table(*results):
    """Combine coefficients, p-values and fit statistics of fitted models into one table.

    Parameters
    ----------
    *results
        Fitted result objects. Each must expose ``params`` and ``pvalues``
        (indexable by regressor name, with ``params.keys()`` giving the names),
        ``nobs``, ``rsquared`` and entity statistics with a ``'total'`` entry,
        available as ``entity_info`` or, failing that, ``_entity_info``.

    Returns
    -------
    pandas.DataFrame
        One row per result, labelled ``'Model1'``, ``'Model2'``, ... in the
        order given. For every regressor there is a coefficient column and a
        ``'<name> p-val'`` column (both formatted to three decimals as text),
        followed by ``'Observations'``, ``'R-squared'`` and ``'Entities'``.
        Regressors absent from a model are NaN in that model's row. An empty
        DataFrame is returned when no results are passed.
    """
    model_rows = []
    for i, result in enumerate(results, start=1):
        coefficients = result.params
        pvalues = result.pvalues

        # Prefer the public accessor; fall back to the private attribute for
        # result objects that only provide the latter.
        entity_info = getattr(result, 'entity_info', None)
        if entity_info is None:
            entity_info = result._entity_info

        row = {}
        for key in coefficients.keys():
            row[key] = "{:.3f}".format(coefficients[key])
            # Leading single quote is kept from the original output format
            # (presumably so spreadsheet software treats "(0.012)" as text - TODO confirm).
            row[key + ' p-val'] = "'({:.3f})".format(pvalues[key])
        row['Observations'] = result.nobs
        row['R-squared'] = "{:.3f}".format(result.rsquared)
        row['Entities'] = entity_info['total']

        # One single-row frame per model, labelled with the model number.
        model_rows.append(pd.DataFrame([row], index=['Model' + str(i)]))

    if not model_rows:
        return pd.DataFrame()

    # Concatenate once instead of growing the table inside the loop; columns
    # are aligned by name across models.
    return pd.concat(model_rows, axis=0)

In [125]:

# Gather the eight fitted models, in order, into a single summary table.
fitted_results = [res1, res2, res3, res4, res5, res6, res7, res8]
table = create_table(*fitted_results)


In [126]:
# Write the table transposed, so that each model becomes a column of the CSV.
table.transpose().to_csv('../data/tables/main_regression_results_notypicality_control.csv')