# Import the libraries

In [1]:
import pandas as pd
from linearmodels import PanelOLS
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import warnings
# Suppress all warnings for the remainder of the session
warnings.filterwarnings(action='ignore')

# Use a serif typeface for figure text, with Times New Roman as the serif font
matplotlib.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
})

# Prepare the data

In [2]:
# Settings that select which pre-built regression dataset is loaded
ind_col = 'sich4'
nlp_model = 'mpnet'
doc2vec_vector_size = 768 # 768 for mpnet, 1536 for ada

# Load the regression variables and turn +/- infinity into NaN.
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only load trusted files.
# replace() returns a new frame, so re-running this cell always starts from the file on disk.
data = (
    pd.read_pickle(f'../data/all_reg_vars_{nlp_model}_{doc2vec_vector_size}_{ind_col}.pkl')
    .replace([np.inf, -np.inf], np.nan)
)


### Create lead variables for the DVs

In [3]:
# Sort by firm (GVKEY) and year so that shift(-1) within a firm picks the firm's next observed row
reg_data = data.sort_values(by=['GVKEY', 'year'])

# Lead the dependent variables and the year in a single grouped shift:
# every row receives the values of the same firm's next observed row (NaN for a firm's last row)
next_obs = reg_data.groupby('GVKEY')[['numrec_mean_0', 'meanrec_mean_descend', 'year']].shift(-1)
reg_data['numrec_mean_0_lead'] = next_obs['numrec_mean_0']
reg_data['meanrec_mean_descend_lead'] = next_obs['meanrec_mean_descend']
reg_data['year_lead'] = next_obs['year']

# Gap in years between a row and the firm's next observed row
reg_data['year_diff'] = reg_data['year_lead'] - reg_data['year']

# Keep rows whose next observation is at most 2 years ahead; rows without a next
# observation have a NaN gap, fail the comparison and are dropped.
# NOTE(review): a 2-year gap is accepted although the leads are described as one-year leads - confirm this is intended.
# .copy() makes the filtered frame independent, so the column assignment below is not made on a slice.
reg_data = reg_data[reg_data['year_diff'] <= 2].copy()
reg_data['constant'] = 1

In [4]:
# Restrict the sample to non-exemplar firms in industries that have at least 10 firms
is_non_exemplar = reg_data[f'exemplar_{ind_col}'] == 0
in_large_industry = reg_data[f'no_firms_{ind_col}'] > 9
reg_data = reg_data[is_non_exemplar & in_large_industry]

In [5]:
# Recompute each firm's next observed year on the rows currently in reg_data
next_year_per_firm = reg_data.groupby('GVKEY')['year'].shift(-1)
reg_data['year_lead'] = next_year_per_firm


### Rename the variables to make them more readable

In [6]:
# Mapping from raw column names to the readable labels used in the regression tables.
# Keys built with ind_col refer to the industry-classification-specific columns.
rename_dict = {
    # Dependent variables (one-period leads)
    'numrec_mean_0_lead': 'Analyst Coverage',
    'meanrec_mean_descend_lead': 'Analyst Recoms',
    # Independent variable
    f'exemplar_sim_{ind_col}': 'Exemplar Similarity',
    # Firm-level variables
    'sale_wins_1': 'Total Sales',
    'n_emp': 'Firm Size',
    'EPS_wins_1': 'EPS',
    'slack_avail_wins_1': 'Available Slack',
    'rd_f_wins_1': 'R&D Expenditure',
    'adv_f_wins_1': 'Advertising Expenditure',
    'dpt_f_wins_1': 'Depreciation Ratio',
    'intang_f_wins_1': 'Intangible Assets Ratio',
    'n_segments': 'No. Segments',
    f'market_share_{ind_col}': 'Market Share',
    'mergers_wins_1': 'Mergers (expenditure)',
    'leverage_wins_1': 'Financial Leverage',
    'is_spx500': 'S&P500 Dummy',
    f'sim_mean_{ind_col}': 'Firm Typicality',
    # Text-based measures
    'strong_weak_modal_ratio': 'Strong-Weak Modals Ratio',
    'positive_negative_ratio': 'Positive-Negative Words Ratio',
    'N_Litigious': 'Litigious Words Ratio',  # label spelling corrected (was 'Litigous')
    # Industry / category-level variables
    f'n_analysts_{ind_col}': 'No. Analysts in Industry',
    f'no_firms_{ind_col}': 'No. Firms in Industry',
    f'category_coherence_{ind_col}': 'Category Coherence',
    f'numrec_avg_year_{ind_col}': 'Average Coverage (Year-Ind)',
    f'meanrec_avg_year_{ind_col}': 'Average Recoms (Year-Ind)',
    f'ind_sim_all_{ind_col}_distinct': 'Category Distinctiveness',
    f'ind_vecs_change_{ind_col}': 'Category Instability',
    f'hhi_{ind_col}': 'Industry HHI',
    f'EPS_wins_1_ex_{ind_col}': 'Exemplar EPS',
    f'exemplar_{ind_col}_typicality': 'Exemplar Typicality',
    'constant': 'Constant'
}

# Apply the readable labels; rename() returns a new frame
reg_data = reg_data.rename(columns=rename_dict)


### Define the list of independent/control variables


In [7]:
# Independent variable of interest
iv = ['Exemplar Similarity']

# Moderators; each is interacted with the independent variable in a later model
moderators = [
    'Category Coherence',
    'Category Distinctiveness',
    'Exemplar Typicality',
]

# Control variables; 'Constant' is the all-ones column used as the intercept
controls = [
    'Total Sales',
    'Firm Size',
    'Market Share',
    'EPS',
    'Available Slack',
    'R&D Expenditure',
    'Advertising Expenditure',
    'Intangible Assets Ratio',
    'Depreciation Ratio',
    'Firm Typicality',
    'No. Segments',
    'Mergers (expenditure)',
    'Financial Leverage',
    'S&P500 Dummy',
    'No. Analysts in Industry',
    'No. Firms in Industry',
    'Average Coverage (Year-Ind)',
    'Average Recoms (Year-Ind)',
    'Category Instability',
    'Industry HHI',
    'Exemplar EPS',
    'Constant',
]

# Dependent variables
dv = ['Analyst Coverage', 'Analyst Recoms']

In [8]:
# Index the panel by GVKEY and year, the two-level index passed to PanelOLS below
panel_keys = ['GVKEY', 'year']
reg_data = reg_data.set_index(panel_keys)

# Regression Models

## Effect of Exemplar similarity on Analyst Coverage

In [9]:
# Baseline model for Analyst Coverage: independent variable, moderators and controls (no interaction)
regressors = [*iv, *moderators, *controls]
model1 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity-clustered covariance
res1 = model1.fit(cov_type='clustered', cluster_entity=True)
res1


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1229
Estimator:,PanelOLS,R-squared (Between):,0.2004
No. Observations:,50791,R-squared (Within):,0.1229
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1715
Time:,15:37:58,Log-likelihood,-1.152e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,238.15
Entities:,8293,P-value,0.0000
Avg Obs:,6.1246,Distribution:,"F(25,42473)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,3.0666,0.5617,5.4593,0.0000,1.9656,4.1676
Category Coherence,14.201,2.4877,5.7084,0.0000,9.3249,19.077
Category Distinctiveness,2.7243,1.3699,1.9887,0.0467,0.0392,5.4093
Exemplar Typicality,-1.4840,0.7043,-2.1072,0.0351,-2.8644,-0.1036
Total Sales,0.0002,5.238e-05,3.1403,0.0017,6.183e-05,0.0003
Firm Size,0.0022,0.0091,0.2440,0.8072,-0.0157,0.0201
Market Share,-0.7638,1.8530,-0.4122,0.6802,-4.3958,2.8682
EPS,0.0663,0.0142,4.6818,0.0000,0.0385,0.0940
Available Slack,0.0076,0.0082,0.9328,0.3509,-0.0084,0.0237


## Effect of Exemplar Similarity on Analyst Recommendations

In [10]:
# Baseline model for Analyst Recoms: independent variable, moderators and controls (no interaction)
regressors = [*iv, *moderators, *controls]
model2 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity-clustered covariance
res2 = model2.fit(cov_type='clustered', cluster_entity=True)
res2

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0491
Estimator:,PanelOLS,R-squared (Between):,0.0355
No. Observations:,33478,R-squared (Within):,0.0491
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0500
Time:,15:37:58,Log-likelihood,-1.4e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,55.665
Entities:,6496,P-value,0.0000
Avg Obs:,5.1536,Distribution:,"F(25,26957)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-0.2923,0.1032,-2.8320,0.0046,-0.4946,-0.0900
Category Coherence,-0.9183,0.4290,-2.1405,0.0323,-1.7593,-0.0774
Category Distinctiveness,-0.2829,0.2132,-1.3269,0.1846,-0.7009,0.1350
Exemplar Typicality,0.0866,0.1312,0.6595,0.5096,-0.1707,0.3438
Total Sales,-6.008e-06,2.941e-06,-2.0428,0.0411,-1.177e-05,-2.435e-07
Firm Size,-0.0025,0.0011,-2.1474,0.0318,-0.0047,-0.0002
Market Share,-0.3085,0.1592,-1.9375,0.0527,-0.6206,0.0036
EPS,0.0143,0.0020,7.2145,0.0000,0.0104,0.0182
Available Slack,-0.0036,0.0018,-2.0094,0.0445,-0.0070,-8.736e-05


## Moderation Effect of Category Coherence

In [11]:
# Analyst Coverage with the Exemplar Similarity x Category Coherence interaction
interaction_term = 'Exemplar Similarity x Category Coherence'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Coherence'])
# The interaction column is placed right after the moderators, ahead of the controls
regressors = [*iv, *moderators, interaction_term, *controls]

model3 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity-clustered covariance
res3 = model3.fit(cov_type='clustered', cluster_entity=True)
res3


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1254
Estimator:,PanelOLS,R-squared (Between):,0.1919
No. Observations:,50791,R-squared (Within):,0.1254
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1624
Time:,15:37:59,Log-likelihood,-1.151e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,234.24
Entities:,8293,P-value,0.0000
Avg Obs:,6.1246,Distribution:,"F(26,42472)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-65.504,12.358,-5.3005,0.0000,-89.726,-41.282
Category Coherence,-50.640,11.931,-4.2444,0.0000,-74.025,-27.255
Category Distinctiveness,2.2453,1.3672,1.6422,0.1006,-0.4345,4.9251
Exemplar Typicality,-1.7022,0.6952,-2.4487,0.0143,-3.0647,-0.3397
Exemplar Similarity x Category Coherence,77.739,14.044,5.5355,0.0000,50.213,105.26
Total Sales,0.0002,5.188e-05,3.0834,0.0020,5.828e-05,0.0003
Firm Size,0.0026,0.0091,0.2823,0.7777,-0.0153,0.0204
Market Share,-0.8122,1.8501,-0.4390,0.6607,-4.4385,2.8141
EPS,0.0675,0.0141,4.7959,0.0000,0.0399,0.0951


In [12]:
# Analyst Recoms with the Exemplar Similarity x Category Coherence interaction
interaction_term = 'Exemplar Similarity x Category Coherence'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Coherence'])
# The interaction column is placed right after the moderators, ahead of the controls
regressors = [*iv, *moderators, interaction_term, *controls]

model4 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity-clustered covariance
res4 = model4.fit(cov_type='clustered', cluster_entity=True)
res4

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0494
Estimator:,PanelOLS,R-squared (Between):,0.0395
No. Observations:,33478,R-squared (Within):,0.0494
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0540
Time:,15:37:59,Log-likelihood,-1.399e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,53.869
Entities:,6496,P-value,0.0000
Avg Obs:,5.1536,Distribution:,"F(26,26956)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,3.7242,1.8428,2.0209,0.0433,0.1122,7.3363
Category Coherence,2.9413,1.7936,1.6399,0.1010,-0.5742,6.4568
Category Distinctiveness,-0.2682,0.2127,-1.2610,0.2073,-0.6850,0.1487
Exemplar Typicality,0.0991,0.1311,0.7564,0.4494,-0.1578,0.3561
Exemplar Similarity x Category Coherence,-4.5513,2.0853,-2.1825,0.0291,-8.6387,-0.4640
Total Sales,-5.763e-06,2.932e-06,-1.9657,0.0493,-1.151e-05,-1.652e-08
Firm Size,-0.0025,0.0012,-2.1478,0.0317,-0.0047,-0.0002
Market Share,-0.3062,0.1594,-1.9208,0.0548,-0.6187,0.0063
EPS,0.0143,0.0020,7.1927,0.0000,0.0104,0.0182


## Moderation Effect of Category Distinctiveness

In [13]:
# Analyst Coverage with the Exemplar Similarity x Category Distinctiveness interaction
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Distinctiveness'])
# The interaction column is placed right after the moderators, ahead of the controls
regressors = [*iv, *moderators, interaction_term, *controls]

model5 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity-clustered covariance
res5 = model5.fit(cov_type='clustered', cluster_entity=True)
res5


0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1235
Estimator:,PanelOLS,R-squared (Between):,0.2013
No. Observations:,50791,R-squared (Within):,0.1235
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1727
Time:,15:38:00,Log-likelihood,-1.151e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,230.14
Entities:,8293,P-value,0.0000
Avg Obs:,6.1246,Distribution:,"F(26,42472)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,19.107,5.0275,3.8005,0.0001,9.2530,28.961
Category Coherence,13.882,2.4680,5.6249,0.0000,9.0450,18.720
Category Distinctiveness,-13.169,5.1722,-2.5461,0.0109,-23.307,-3.0313
Exemplar Typicality,-1.2440,0.7105,-1.7508,0.0800,-2.6367,0.1487
Exemplar Similarity x Category Distinctiveness,18.717,5.7923,3.2313,0.0012,7.3637,30.070
Total Sales,0.0002,5.248e-05,3.1391,0.0017,6.187e-05,0.0003
Firm Size,0.0022,0.0092,0.2429,0.8081,-0.0157,0.0202
Market Share,-0.7251,1.8541,-0.3911,0.6957,-4.3592,2.9090
EPS,0.0664,0.0141,4.6927,0.0000,0.0387,0.0941


In [14]:
# Analyst Recoms with the Exemplar Similarity x Category Distinctiveness interaction
interaction_term = 'Exemplar Similarity x Category Distinctiveness'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Category Distinctiveness'])
# The interaction column is placed right after the moderators, ahead of the controls
regressors = [*iv, *moderators, interaction_term, *controls]

model6 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity-clustered covariance
res6 = model6.fit(cov_type='clustered', cluster_entity=True)
res6

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0491
Estimator:,PanelOLS,R-squared (Between):,0.0323
No. Observations:,33478,R-squared (Within):,0.0491
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0481
Time:,15:38:00,Log-likelihood,-1.4e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,53.578
Entities:,6496,P-value,0.0000
Avg Obs:,5.1536,Distribution:,"F(26,26956)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-1.1413,0.9408,-1.2131,0.2251,-2.9854,0.7028
Category Coherence,-0.9147,0.4295,-2.1297,0.0332,-1.7565,-0.0728
Category Distinctiveness,0.5774,0.9731,0.5934,0.5529,-1.3299,2.4847
Exemplar Typicality,0.0762,0.1324,0.5753,0.5651,-0.1833,0.3356
Exemplar Similarity x Category Distinctiveness,-0.9869,1.1028,-0.8949,0.3709,-3.1485,1.1747
Total Sales,-6e-06,2.939e-06,-2.0412,0.0412,-1.176e-05,-2.387e-07
Firm Size,-0.0025,0.0011,-2.1419,0.0322,-0.0047,-0.0002
Market Share,-0.3117,0.1596,-1.9527,0.0509,-0.6247,0.0012
EPS,0.0143,0.0020,7.2113,0.0000,0.0104,0.0182


## Moderation Effect of Exemplar Typicality

In [15]:
# Analyst Coverage with the Exemplar Similarity x Exemplar Typicality interaction
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Exemplar Typicality'])
# The interaction column is placed right after the moderators, ahead of the controls
regressors = [*iv, *moderators, interaction_term, *controls]

model7 = PanelOLS(
    dependent=reg_data['Analyst Coverage'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity-clustered covariance
res7 = model7.fit(cov_type='clustered', cluster_entity=True)
res7

0,1,2,3
Dep. Variable:,Analyst Coverage,R-squared:,0.1237
Estimator:,PanelOLS,R-squared (Between):,0.1988
No. Observations:,50791,R-squared (Within):,0.1237
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.1694
Time:,15:38:01,Log-likelihood,-1.151e+05
Cov. Estimator:,Clustered,,
,,F-statistic:,230.54
Entities:,8293,P-value,0.0000
Avg Obs:,6.1246,Distribution:,"F(26,42472)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,-15.471,5.1591,-2.9988,0.0027,-25.583,-5.3590
Category Coherence,14.219,2.4942,5.7008,0.0000,9.3305,19.108
Category Distinctiveness,2.5307,1.3660,1.8526,0.0639,-0.1467,5.2082
Exemplar Typicality,-18.405,4.6653,-3.9450,0.0001,-27.549,-9.2605
Exemplar Similarity x Exemplar Typicality,21.191,5.9573,3.5572,0.0004,9.5150,32.868
Total Sales,0.0002,5.227e-05,3.1150,0.0018,6.037e-05,0.0003
Firm Size,0.0023,0.0091,0.2570,0.7972,-0.0156,0.0203
Market Share,-0.7550,1.8517,-0.4077,0.6835,-4.3843,2.8743
EPS,0.0668,0.0141,4.7287,0.0000,0.0391,0.0944


In [16]:
# Analyst Recoms with the Exemplar Similarity x Exemplar Typicality interaction
interaction_term = 'Exemplar Similarity x Exemplar Typicality'
reg_data[interaction_term] = reg_data['Exemplar Similarity'].mul(reg_data['Exemplar Typicality'])
# The interaction column is placed right after the moderators, ahead of the controls
regressors = [*iv, *moderators, interaction_term, *controls]

model8 = PanelOLS(
    dependent=reg_data['Analyst Recoms'],
    exog=reg_data[regressors],
    entity_effects=True,
)
# Entity-clustered covariance
res8 = model8.fit(cov_type='clustered', cluster_entity=True)
res8

0,1,2,3
Dep. Variable:,Analyst Recoms,R-squared:,0.0496
Estimator:,PanelOLS,R-squared (Between):,0.0383
No. Observations:,33478,R-squared (Within):,0.0496
Date:,"Tue, Jun 20 2023",R-squared (Overall):,0.0532
Time:,15:38:01,Log-likelihood,-1.399e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,54.127
Entities:,6496,P-value,0.0000
Avg Obs:,5.1536,Distribution:,"F(26,26956)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Exemplar Similarity,2.2688,0.8213,2.7624,0.0057,0.6590,3.8787
Category Coherence,-0.9199,0.4294,-2.1422,0.0322,-1.7616,-0.0782
Category Distinctiveness,-0.2567,0.2130,-1.2053,0.2281,-0.6742,0.1608
Exemplar Typicality,2.4423,0.7755,3.1491,0.0016,0.9222,3.9624
Exemplar Similarity x Exemplar Typicality,-2.9252,0.9347,-3.1297,0.0018,-4.7572,-1.0932
Total Sales,-5.836e-06,2.923e-06,-1.9967,0.0459,-1.156e-05,-1.072e-07
Firm Size,-0.0025,0.0012,-2.1526,0.0314,-0.0047,-0.0002
Market Share,-0.3130,0.1603,-1.9521,0.0509,-0.6272,0.0013
EPS,0.0143,0.0020,7.1933,0.0000,0.0104,0.0181


# Output regression results

In [17]:
def create_table(*results):
    """Collect coefficients and fit statistics of fitted models into one table.

    Parameters
    ----------
    *results
        Fitted result objects, in the order they should be numbered. Each must
        expose ``params`` and ``pvalues`` (keyed by regressor name), ``nobs``,
        ``rsquared`` and ``entity_info`` (indexable by ``'total'``).

    Returns
    -------
    pd.DataFrame
        One row per result, indexed ``'Model1'``, ``'Model2'``, ... For every
        regressor there is a column with the coefficient formatted to three
        decimals and a ``'<name> p-val'`` column with the p-value formatted as
        ``'(0.000)``, followed by ``'Observations'``, ``'R-squared'`` and
        ``'Entities'``. Regressors that a model does not contain are NaN in
        that model's row. An empty DataFrame is returned when no results are
        passed.
    """
    rows = []
    for result in results:
        coefficients = result.params
        pvalues = result.pvalues

        row = {}
        for name in coefficients.keys():
            row[name] = "{:.3f}".format(coefficients[name])
            # The leading single quote is kept in front of the parenthesised p-value on purpose
            row[name + ' p-val'] = "'({:.3f})".format(pvalues[name])
        row['Observations'] = result.nobs
        row['R-squared'] = "{:.3f}".format(result.rsquared)
        # Public accessor instead of the private _entity_info attribute
        row['Entities'] = result.entity_info['total']
        rows.append(row)

    # Build the frame once from all rows rather than concatenating inside the loop
    model_names = ['Model' + str(i) for i in range(1, len(rows) + 1)]
    return pd.DataFrame(rows, index=model_names)

In [18]:

# Combine the eight fitted models into one table (rows 'Model1' to 'Model8', in this order)
all_results = (res1, res2, res3, res4, res5, res6, res7, res8)
table = create_table(*all_results)


In [19]:
# Write the transposed table (variables as rows, models as columns).
# The file name follows ind_col, so results for another industry classification
# are not saved under the 'sich4' name.
table.T.to_csv(f'../data/tables/main_regression_results_{ind_col}.csv')