In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
from linearmodels import PanelOLS, RandomEffects
from linearmodels.panel import compare
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv('stock_daily_sentiment.csv')
df['date'] = pd.to_datetime(df['date'])
df['stock_code'] = df['stock_code'].astype(str).str.zfill(6)

df = df.set_index(['stock_code', 'date'])

# 定义因变量和自变量
y = df['forward_ret_1d']  # 可以改为forward_ret_3d或forward_ret_5d
X = df[['avg_sentiment', 'sentiment_std', 
             'avg_intensity', 'comment_count',
             'close', 'volume', 'amount', 'amplitude',
             'pct_change', 'price_change', 'turnover_rate']]
X = sm.add_constant(X)

# 1. 固定效应模型
fixed_effects = PanelOLS(y, X, entity_effects=True, time_effects=True)
fe_results = fixed_effects.fit(cov_type='clustered', cluster_entity=True)
print("固定效应模型结果:")
print(fe_results)

# 2. 随机效应模型
random_effects = RandomEffects(y, X)
re_results = random_effects.fit(cov_type='clustered', cluster_entity=True)
print("\n随机效应模型结果:")
print(re_results)

comparison = compare({'FE': fe_results, 'RE': re_results}, stars = True)
print(comparison)

# 3. Hausman检验
def hausman_test(fe_model, re_model):
    # 提取系数和协方差矩阵
    b_fe = fe_model.params
    b_re = re_model.params
    
    # 去除常数项后的系数差
    b_diff = b_fe - b_re
    
    # 计算协方差矩阵差
    var_fe = fe_model.cov
    var_re = re_model.cov
    var_diff = var_fe - var_re
    
    # Hausman统计量
    H = np.dot(np.dot(b_diff.T, np.linalg.inv(var_diff)), b_diff)
    
    # 自由度(参数个数)
    df = len(b_fe)
    
    # p值
    p_value = 1 - stats.chi2.cdf(H, df)
    
    return H, p_value

H_stat, p_val = hausman_test(fe_results, re_results)
print("\nHausman检验结果:")
print(f"统计量: {H_stat:.4f}")
print(f"P值: {p_val:.4f}")
print("结论:", "拒绝随机效应，使用固定效应" if p_val < 0.05 else "接受随机效应")

# 输出回归结果到Excel（可选）
def export_results(results, filename):
    # 获取置信区间DataFrame
    conf_int = results.conf_int()
    
    # 创建摘要DataFrame
    summary_df = pd.DataFrame({
        'coefficient': results.params,
        'std_error': results.std_errors,
        't_stat': results.tstats,
        'p_value': results.pvalues,
        'conf_int_lower': conf_int.iloc[:, 0],  # 使用iloc正确获取第一列
        'conf_int_upper': conf_int.iloc[:, 1]   # 使用iloc正确获取第二列
    })
    summary_df.to_excel(filename)
# 可以添加行业固定效应
# 如果需要控制行业效应，在数据中加入行业虚拟变量
# 添加行业虚拟变量，但删除一个类别避免完全共线性
# 按市值大小分组回归
for group in df.groupby('industry'):
    industry_data = group[1]
    y = industry_data['forward_ret_1d']
    X = industry_data[['avg_sentiment', 'sentiment_std', 
             'avg_intensity', 'comment_count',
             'close', 'volume', 'amount', 'amplitude',
             'pct_change', 'price_change', 'turnover_rate']]
    X = sm.add_constant(X)
    
    fixed_effects = PanelOLS(y, X, entity_effects=True, time_effects=True)
    fe_results = fixed_effects.fit(cov_type='clustered', cluster_entity=True)
    print(f"\n{group[0]}行业的回归结果:")
    print(fe_results)
    export_results(fe_results, f'{group[0]}fixed_effects_results.xlsx')



export_results(re_results, 'random_effects_results.xlsx')


固定效应模型结果:
                          PanelOLS Estimation Summary                           
Dep. Variable:         forward_ret_1d   R-squared:                        0.0594
Estimator:                   PanelOLS   R-squared (Between):             -480.88
No. Observations:                1989   R-squared (Within):               0.0738
Date:                Sun, Mar 16 2025   R-squared (Overall):             -23.929
Time:                        00:47:47   Log-likelihood                    4821.6
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      10.627
Entities:                         111   P-value                           0.0000
Avg Obs:                       17.919   Distribution:                 F(11,1850)
Min Obs:                       12.000                                           
Max Obs:                       18.000   F-statistic (robust):             4.7320
                  


房地产行业的回归结果:
                          PanelOLS Estimation Summary                           
Dep. Variable:         forward_ret_1d   R-squared:                        0.1629
Estimator:                   PanelOLS   R-squared (Between):             -2113.4
No. Observations:                 198   R-squared (Within):               0.1641
Date:                Sun, Mar 16 2025   R-squared (Overall):             -34.724
Time:                        00:47:48   Log-likelihood                    546.26
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      2.8137
Entities:                          78   P-value                           0.0022
Avg Obs:                       2.5385   Distribution:                  F(11,159)
Min Obs:                       0.0000                                           
Max Obs:                       18.000   F-statistic (robust):          1.566e+14
               


计算机行业的回归结果:
                          PanelOLS Estimation Summary                           
Dep. Variable:         forward_ret_1d   R-squared:                        0.1548
Estimator:                   PanelOLS   R-squared (Between):             -125.08
No. Observations:                 246   R-squared (Within):               0.1428
Date:                Sun, Mar 16 2025   R-squared (Overall):             -7.6512
Time:                        00:47:48   Log-likelihood                    547.63
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      3.3955
Entities:                         111   P-value                           0.0002
Avg Obs:                       2.2162   Distribution:                  F(11,204)
Min Obs:                       0.0000                                           
Max Obs:                       18.000   F-statistic (robust):             79.066
               