# Import the libraries

In [1]:
import pandas as pd
from linearmodels import PanelOLS
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import warnings

# Silence every warning category for the remainder of the notebook session
warnings.filterwarnings('ignore')

# Use a serif font family, resolved to Times New Roman, for all matplotlib text
matplotlib.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
})

# Prepare the data

In [2]:
# Settings that select which prepared dataset is loaded; they are also used
# further down to build the industry-specific column names.
ind_col = 'sich3'
nlp_model = 'mpnet'
doc2vec_vector_size = 768  # 768 for mpnet, 1536 for ada

# Read the pickled regression variables and turn +/- infinity into NaN
data_path = f'../data/all_reg_vars_{nlp_model}_{str(doc2vec_vector_size)}_{ind_col}.pkl'
data = pd.read_pickle(data_path).replace([np.inf, -np.inf], np.nan)


### Create lead variables for the DVs

In [3]:
# Sort by GVKEY and year so that shift(-1) inside a GVKEY group picks up the
# next available observation of that GVKEY.
reg_data = data.sort_values(by=['GVKEY', 'year'])

# Lead the two dependent variables, and the year itself, by one observation
# within each GVKEY. Creates numrec_mean_0_lead, meanrec_mean_descend_lead
# and year_lead (in that order).
for source_col in ('numrec_mean_0', 'meanrec_mean_descend', 'year'):
    reg_data[f'{source_col}_lead'] = reg_data.groupby('GVKEY')[source_col].shift(-1)

# Number of years between a row and the row its lead values were taken from
reg_data['year_diff'] = reg_data['year_lead'] - reg_data['year']

# Keep rows whose lead comes from at most two years ahead. Rows without a later
# observation have a NaN gap, fail the comparison, and are dropped as well.
# NOTE(review): the threshold allows a two-year gap although the variables are
# described as one-year leads -- confirm this is intended.
# .copy() detaches the filtered frame so the assignment below (and later column
# assignments) never write to a slice of the sorted frame.
reg_data = reg_data[reg_data['year_diff'] <= 2].copy()

# Column of ones, renamed to 'Constant' later and used as the model intercept
reg_data['constant'] = 1

In [4]:
# Restrict the sample to non-exemplar firms in industries with at least 10 firms
is_non_exemplar = reg_data[f'exemplar_{ind_col}'] == 0
in_large_industry = reg_data[f'no_firms_{ind_col}'] > 9
reg_data = reg_data[is_non_exemplar & in_large_industry]

In [5]:
# Refresh year_lead on the filtered sample: within each GVKEY it now holds the
# year of the next row that survived the filters above.
# NOTE(review): year_lead is not read again in the visible part of the notebook.
reg_data = reg_data.assign(year_lead=reg_data.groupby('GVKEY')['year'].shift(-1))


### Rename the variables to make them more readable

In [6]:
# Map raw column names (some of them templated on ind_col) to the readable
# labels that the variable lists and regression cells below refer to.
# Not every label defined here is used as a regressor further down.
rename_dict = {
    # Dependent variables: the lead columns created in the previous section
    'numrec_mean_0_lead': 'Analyst Coverage',
    'meanrec_mean_descend_lead': 'Analyst Recoms',
    # Main independent variable
    f'exemplar_sim_{ind_col}': 'Exemplar Similarity',
    'sale_wins_1': 'Total Sales',
    'n_emp': 'Firm Size',
    'EPS_wins_1': 'EPS',
    'slack_avail_wins_1': 'Available Slack',
    'rd_f_wins_1': 'R&D Expenditure',
    'adv_f_wins_1': 'Advertising Expenditure',
    'dpt_f_wins_1': 'Depreciation Ratio',
    'intang_f_wins_1': 'Intangible Assets Ratio',
    'n_segments': 'No. Segments',
    f'market_share_{ind_col}': 'Market Share',
    'mergers_wins_1': 'Mergers (expenditure)',
    'leverage_wins_1': 'Financial Leverage',
    'is_spx500': 'S&P500 Dummy',
    f'sim_mean_{ind_col}': 'Firm Typicality',
    'strong_weak_modal_ratio': 'Strong-Weak Modals Ratio',
    'positive_negative_ratio': 'Positive-Negative Words Ratio',
    # NOTE(review): 'Litigous' looks like a misspelling of 'Litigious'; left
    # unchanged here because the label becomes a runtime column name.
    'N_Litigious': 'Litigous Words Ratio',
    f'n_analysts_{ind_col}': 'No. Analysts in Industry',
    f'no_firms_{ind_col}': 'No. Firms in Industry',
    f'category_coherence_{ind_col}': 'Category Coherence',
    f'numrec_avg_year_{ind_col}': 'Average Coverage (Year-Ind)',
    f'meanrec_avg_year_{ind_col}': 'Average Recoms (Year-Ind)',
    f'ind_sim_all_{ind_col}_distinct': 'Category Distinctiveness',
    f'ind_vecs_change_{ind_col}': 'Category Instability',
    f'hhi_{ind_col}': 'Industry HHI',
    f'EPS_wins_1_ex_{ind_col}': 'Exemplar EPS',
    f'exemplar_{ind_col}_typicality': 'Exemplar Typicality',
    # Column of ones added when the lead variables were built
    'constant': 'Constant'
}

# Apply the mapping; rename() returns a new frame, which replaces reg_data
reg_data = reg_data.rename(columns=rename_dict)


### Define the list of independent/control variables


In [7]:
# Main independent variable
iv = ['Exemplar Similarity']

# Variables that are interacted with the independent variable further down
moderators = [
    'Category Coherence',
    'Category Distinctiveness',
    'Exemplar Typicality',
]

# Control variables; the list order is the column order of the design matrix
controls = [
    'Total Sales',
    'Firm Size',
    'Market Share',
    'EPS',
    'Available Slack',
    'R&D Expenditure',
    'Advertising Expenditure',
    'Intangible Assets Ratio',
    'Depreciation Ratio',
    'Firm Typicality',
    'No. Segments',
    'Mergers (expenditure)',
    'Financial Leverage',
    'S&P500 Dummy',
    'No. Analysts in Industry',
    'No. Firms in Industry',
    'Average Coverage (Year-Ind)',
    'Average Recoms (Year-Ind)',
    'Category Instability',
    'Industry HHI',
    'Exemplar EPS',
    'Constant',
]

# Dependent variables
dv = [
    'Analyst Coverage',
    'Analyst Recoms',
]

In [8]:
# Move GVKEY and year out of the columns into a two-level (GVKEY, year) index,
# which is the frame the PanelOLS calls below are given.
panel_keys = ['GVKEY', 'year']
reg_data = reg_data.set_index(panel_keys)

# Regression Models

## Effect of Exemplar similarity on Analyst Coverage

In [9]:
# Baseline model for Analyst Coverage: independent variable, moderators and
# controls, with entity effects and entity-clustered standard errors.
regressors = [*iv, *moderators, *controls]
model1 = PanelOLS(
    reg_data['Analyst Coverage'],
    reg_data[regressors],
    entity_effects=True,
)
res1 = model1.fit(cov_type='clustered', cluster_entity=True)
res1


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1227
Estimator:,PanelOLS,R-squared (Between):,0.1858
No. Observations:,55639,R-squared (Within):,0.1227
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1695
Time:,15:39:04,Log-likelihood,-1.255e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,262.88
Entities:,8615,P-value,0.0000
Avg Obs:,6.4584,Distribution:,"F(25,46999)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,3.3763,0.5167,6.5346,0.0000,2.3636,4.3890
Category Coherence,20.016,2.2979,8.7105,0.0000,15.512,24.520
Category Distinctiveness,3.4615,1.4168,2.4432,0.0146,0.6845,6.2385
Exemplar Typicality,-0.8327,0.5579,-1.4925,0.1356,-1.9261,0.2608
Total Sales,0.0002,4.188e-05,3.6354,0.0003,7.017e-05,0.0002
Firm Size,0.0006,0.0100,0.0578,0.9539,-0.0191,0.0202
Market Share,-1.4438,2.0710,-0.6972,0.4857,-5.5029,2.6153
EPS,0.0759,0.0137,5.5444,0.0000,0.0491,0.1027
Available Slack,0.0105,0.0077,1.3719,0.1701,-0.0045,0.0256


## Effect of Exemplar Similarity on Analyst Recommendations

In [10]:
# Baseline model for Analyst Recoms: same regressors as the coverage model,
# with entity effects and entity-clustered standard errors.
regressors = [*iv, *moderators, *controls]
model2 = PanelOLS(
    reg_data['Analyst Recoms'],
    reg_data[regressors],
    entity_effects=True,
)
res2 = model2.fit(cov_type='clustered', cluster_entity=True)
res2

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0467
Estimator:,PanelOLS,R-squared (Between):,-0.0084
No. Observations:,36479,R-squared (Within):,0.0467
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0247
Time:,15:39:05,Log-likelihood,-1.597e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,58.247
Entities:,6758,P-value,0.0000
Avg Obs:,5.3979,Distribution:,"F(25,29696)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-0.2429,0.0952,-2.5509,0.0108,-0.4296,-0.0563
Category Coherence,-0.9742,0.4505,-2.1627,0.0306,-1.8572,-0.0913
Category Distinctiveness,-0.5912,0.2263,-2.6128,0.0090,-1.0347,-0.1477
Exemplar Typicality,-0.0473,0.1112,-0.4257,0.6703,-0.2653,0.1706
Total Sales,-4.221e-06,3.693e-06,-1.1429,0.2531,-1.146e-05,3.018e-06
Firm Size,-0.0024,0.0010,-2.3534,0.0186,-0.0044,-0.0004
Market Share,-0.3549,0.2057,-1.7251,0.0845,-0.7581,0.0483
EPS,0.0144,0.0019,7.4290,0.0000,0.0106,0.0182
Available Slack,-0.0040,0.0017,-2.3133,0.0207,-0.0073,-0.0006


## Moderation Effect of Category Coherence

In [11]:
# Analyst Coverage with the Exemplar Similarity x Category Coherence interaction.
# The product column is stored on reg_data under the interaction label.
interaction_term = 'Exemplar Similarity x Category Coherence'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Coherence'])
regressors = [*iv, *moderators, interaction_term, *controls]

model3 = PanelOLS(
    reg_data['Analyst Coverage'],
    reg_data[regressors],
    entity_effects=True,
)
res3 = model3.fit(cov_type='clustered', cluster_entity=True)
res3


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1258
Estimator:,PanelOLS,R-squared (Between):,0.1725
No. Observations:,55639,R-squared (Within):,0.1258
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1574
Time:,15:39:05,Log-likelihood,-1.254e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,260.21
Entities:,8615,P-value,0.0000
Avg Obs:,6.4584,Distribution:,"F(26,46998)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-82.233,11.087,-7.4174,0.0000,-103.96,-60.503
Category Coherence,-61.740,10.182,-6.0635,0.0000,-81.697,-41.783
Category Distinctiveness,3.0424,1.4102,2.1574,0.0310,0.2783,5.8065
Exemplar Typicality,-1.2959,0.5532,-2.3427,0.0191,-2.3802,-0.2117
Exemplar Similarity x Category Coherence,96.989,12.645,7.6700,0.0000,72.204,121.77
Total Sales,0.0001,4.143e-05,3.5793,0.0003,6.71e-05,0.0002
Firm Size,0.0008,0.0100,0.0812,0.9353,-0.0188,0.0204
Market Share,-1.4018,2.0777,-0.6747,0.4999,-5.4742,2.6707
EPS,0.0756,0.0136,5.5425,0.0000,0.0489,0.1024


In [12]:
# Analyst Recoms with the Exemplar Similarity x Category Coherence interaction.
# The product column is rebuilt here so the cell does not rely on the previous one.
interaction_term = 'Exemplar Similarity x Category Coherence'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Coherence'])
regressors = [*iv, *moderators, interaction_term, *controls]

model4 = PanelOLS(
    reg_data['Analyst Recoms'],
    reg_data[regressors],
    entity_effects=True,
)
res4 = model4.fit(cov_type='clustered', cluster_entity=True)
res4

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0471
Estimator:,PanelOLS,R-squared (Between):,-0.0029
No. Observations:,36479,R-squared (Within):,0.0471
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0297
Time:,15:39:06,Log-likelihood,-1.597e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,56.412
Entities:,6758,P-value,0.0000
Avg Obs:,5.3979,Distribution:,"F(26,29695)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,4.3770,2.0545,2.1304,0.0331,0.3500,8.4039
Category Coherence,3.4913,2.0103,1.7367,0.0825,-0.4490,7.4316
Category Distinctiveness,-0.5743,0.2259,-2.5425,0.0110,-1.0170,-0.1315
Exemplar Typicality,-0.0184,0.1124,-0.1636,0.8700,-0.2386,0.2018
Exemplar Similarity x Category Coherence,-5.2303,2.3178,-2.2565,0.0240,-9.7733,-0.6872
Total Sales,-4.022e-06,3.683e-06,-1.0921,0.2748,-1.124e-05,3.196e-06
Firm Size,-0.0024,0.0010,-2.3508,0.0187,-0.0044,-0.0004
Market Share,-0.3535,0.2060,-1.7162,0.0861,-0.7572,0.0502
EPS,0.0144,0.0019,7.4613,0.0000,0.0106,0.0182


## Moderation Effect of Category Distinctiveness

In [13]:
# Analyst Coverage with the Exemplar Similarity x Category Distinctiveness interaction.
# The product column is stored on reg_data under the interaction label.
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Distinctiveness'])
regressors = [*iv, *moderators, interaction_term, *controls]

model5 = PanelOLS(
    reg_data['Analyst Coverage'],
    reg_data[regressors],
    entity_effects=True,
)
res5 = model5.fit(cov_type='clustered', cluster_entity=True)
res5


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1242
Estimator:,PanelOLS,R-squared (Between):,0.1847
No. Observations:,55639,R-squared (Within):,0.1242
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1699
Time:,15:39:06,Log-likelihood,-1.254e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,256.45
Entities:,8615,P-value,0.0000
Avg Obs:,6.4584,Distribution:,"F(26,46998)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,32.022,4.7184,6.7867,0.0000,22.774,41.270
Category Coherence,19.636,2.2910,8.5707,0.0000,15.145,24.126
Category Distinctiveness,-24.507,4.7082,-5.2051,0.0000,-33.735,-15.279
Exemplar Typicality,-0.5847,0.5598,-1.0445,0.2963,-1.6818,0.5125
Exemplar Similarity x Category Distinctiveness,32.706,5.3694,6.0913,0.0000,22.182,43.230
Total Sales,0.0002,4.212e-05,3.6133,0.0003,6.963e-05,0.0002
Firm Size,0.0007,0.0101,0.0673,0.9464,-0.0190,0.0204
Market Share,-1.4481,2.0725,-0.6987,0.4847,-5.5102,2.6140
EPS,0.0763,0.0137,5.5801,0.0000,0.0495,0.1030


In [14]:
# Analyst Recoms with the Exemplar Similarity x Category Distinctiveness interaction.
# The product column is rebuilt here so the cell does not rely on the previous one.
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Distinctiveness'])
regressors = [*iv, *moderators, interaction_term, *controls]

model6 = PanelOLS(
    reg_data['Analyst Recoms'],
    reg_data[regressors],
    entity_effects=True,
)
res6 = model6.fit(cov_type='clustered', cluster_entity=True)
res6

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0470
Estimator:,PanelOLS,R-squared (Between):,-0.0169
No. Observations:,36479,R-squared (Within):,0.0470
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0201
Time:,15:39:06,Log-likelihood,-1.597e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,56.306
Entities:,6758,P-value,0.0000
Avg Obs:,5.3979,Distribution:,"F(26,29695)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-2.3045,1.0924,-2.1095,0.0349,-4.4456,-0.1633
Category Coherence,-0.9488,0.4519,-2.0996,0.0358,-1.8344,-0.0631
Category Distinctiveness,1.4498,1.0892,1.3311,0.1832,-0.6851,3.5846
Exemplar Typicality,-0.0706,0.1129,-0.6256,0.5316,-0.2920,0.1507
Exemplar Similarity x Category Distinctiveness,-2.3442,1.2409,-1.8891,0.0589,-4.7764,0.0880
Total Sales,-4.197e-06,3.68e-06,-1.1404,0.2541,-1.141e-05,3.017e-06
Firm Size,-0.0024,0.0010,-2.3754,0.0175,-0.0044,-0.0004
Market Share,-0.3543,0.2064,-1.7163,0.0861,-0.7589,0.0503
EPS,0.0143,0.0019,7.3961,0.0000,0.0105,0.0181


## Moderation Effect of Exemplar Typicality

In [15]:
# Analyst Coverage with the Exemplar Similarity x Exemplar Typicality interaction.
# The product column is stored on reg_data under the interaction label.
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Exemplar Typicality'])
regressors = [*iv, *moderators, interaction_term, *controls]

model7 = PanelOLS(
    reg_data['Analyst Coverage'],
    reg_data[regressors],
    entity_effects=True,
)
res7 = model7.fit(cov_type='clustered', cluster_entity=True)
res7

0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1240
Estimator:,PanelOLS,R-squared (Between):,0.1833
No. Observations:,55639,R-squared (Within):,0.1240
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1671
Time:,15:39:07,Log-likelihood,-1.255e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,255.91
Entities:,8615,P-value,0.0000
Avg Obs:,6.4584,Distribution:,"F(26,46998)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-21.984,4.3619,-5.0399,0.0000,-30.533,-13.434
Category Coherence,19.662,2.2887,8.5908,0.0000,15.176,24.148
Category Distinctiveness,3.1881,1.4157,2.2520,0.0243,0.4133,5.9629
Exemplar Typicality,-24.080,3.9031,-6.1695,0.0000,-31.730,-16.430
Exemplar Similarity x Exemplar Typicality,28.988,5.0454,5.7454,0.0000,19.099,38.877
Total Sales,0.0002,4.177e-05,3.5912,0.0003,6.814e-05,0.0002
Firm Size,0.0007,0.0101,0.0696,0.9445,-0.0191,0.0205
Market Share,-1.4421,2.0635,-0.6989,0.4846,-5.4865,2.6023
EPS,0.0769,0.0136,5.6338,0.0000,0.0501,0.1036


In [16]:
# Analyst Recoms with the Exemplar Similarity x Exemplar Typicality interaction.
# The product column is rebuilt here so the cell does not rely on the previous one.
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Exemplar Typicality'])
regressors = [*iv, *moderators, interaction_term, *controls]

model8 = PanelOLS(
    reg_data['Analyst Recoms'],
    reg_data[regressors],
    entity_effects=True,
)
res8 = model8.fit(cov_type='clustered', cluster_entity=True)
res8

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0471
Estimator:,PanelOLS,R-squared (Between):,-0.0068
No. Observations:,36479,R-squared (Within):,0.0471
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0267
Time:,15:39:07,Log-likelihood,-1.597e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,56.419
Entities:,6758,P-value,0.0000
Avg Obs:,5.3979,Distribution:,"F(26,29695)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,1.8382,0.7835,2.3462,0.0190,0.3026,3.3739
Category Coherence,-0.9534,0.4498,-2.1196,0.0341,-1.8350,-0.0718
Category Distinctiveness,-0.5716,0.2262,-2.5266,0.0115,-1.0151,-0.1282
Exemplar Typicality,1.8844,0.7330,2.5708,0.0102,0.4477,3.3211
Exemplar Similarity x Exemplar Typicality,-2.3861,0.8897,-2.6820,0.0073,-4.1299,-0.6423
Total Sales,-4.045e-06,3.677e-06,-1.1000,0.2714,-1.125e-05,3.163e-06
Firm Size,-0.0024,0.0010,-2.3552,0.0185,-0.0044,-0.0004
Market Share,-0.3577,0.2060,-1.7363,0.0825,-0.7616,0.0461
EPS,0.0143,0.0019,7.3800,0.0000,0.0105,0.0181


# Output regression results

In [17]:
def create_table(*results):
    """Summarise fitted models as a table with one row per model.

    Parameters
    ----------
    *results
        Fitted result objects. Each must expose ``params`` and ``pvalues``
        (label-indexed, same labels), ``nobs``, ``rsquared`` and an
        ``entity_info`` mapping (or the private ``_entity_info``) that has a
        ``'total'`` entry.

    Returns
    -------
    pandas.DataFrame
        Rows are labelled ``Model1``, ``Model2``, ... in argument order.
        For every parameter there is a column with the coefficient formatted
        to three decimals and a ``'<name> p-val'`` column with the p-value as
        ``'(0.000)``, followed by ``Observations``, ``R-squared`` and
        ``Entities``. Parameters missing from a model are NaN in its row.
        An empty DataFrame is returned when no results are passed.
    """
    rows = []
    for i, result in enumerate(results, start=1):
        pvalues = result.pvalues

        # Prefer the public accessor; fall back to the private attribute for
        # result objects that only provide that one.
        entity_info = getattr(result, 'entity_info', None)
        if entity_info is None:
            entity_info = result._entity_info

        row = {}
        for name, coefficient in result.params.items():
            row[name] = ["{:.3f}".format(coefficient)]
            # The leading single quote keeps spreadsheet software from reading
            # the parenthesised p-value as a negative number.
            row[name + ' p-val'] = ["'({:.3f})".format(pvalues[name])]
        row['Observations'] = [result.nobs]
        row['R-squared'] = ["{:.3f}".format(result.rsquared)]
        row['Entities'] = [entity_info['total']]

        rows.append(pd.DataFrame(row, index=['Model' + str(i)]))

    # pd.concat raises on an empty list, so handle "no models" explicitly
    if not rows:
        return pd.DataFrame()

    # One concatenation at the end instead of growing the table inside the loop
    return pd.concat(rows, axis=0)

In [18]:

# Gather the eight fitted models in the order they were estimated and build the
# summary table (rows Model1 ... Model8)
fitted_models = [res1, res2, res3, res4, res5, res6, res7, res8]
table = create_table(*fitted_models)


In [19]:
# Write the transposed table (variables as rows, models as columns) to CSV.
# The file name follows ind_col, so switching the industry definition in the
# configuration cell no longer overwrites the sich3 results.
table.T.to_csv(f'../data/tables/main_regression_results_{ind_col}.csv')