In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from linearmodels.panel import PanelOLS
from linearmodels.panel import RandomEffects
from linearmodels.panel import compare
from scipy.stats import chi2
from scipy.stats import chisquare
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson

# Load the daily stock/sentiment panel and normalise the key columns.
df = pd.read_csv('stock_daily_sentiment.csv')
df['date'] = pd.to_datetime(df['date'])
# Zero-pad stock codes to six characters so codes parsed as integers
# keep their leading zeros.
df['stock_code'] = df['stock_code'].astype(str).str.zfill(6)

df = df.sort_values(['stock_code', 'date'])

# The panel estimators require an (entity, time) MultiIndex.
df = df.set_index(['stock_code', 'date'])

# Explanatory variables.
exog_vars = ['avg_sentiment', 'sentiment_std',
             'avg_intensity', 'comment_count',
             'close', 'volume', 'amount', 'amplitude',
             'pct_change', 'price_change', 'turnover_rate']

# Dependent variable (presumably the 3-day forward return -- confirm against
# the code that produced the CSV).
dep_var = 'forward_ret_3d'

# Drop rows with NaN in any model column. The dependent variable is included
# so that every estimator and diagnostic sees the same complete-case sample;
# sm.OLS does not drop missing observations by default.
df = df.dropna(subset=exog_vars + [dep_var])

exog = df[exog_vars]
endog = df[dep_var]

# Build and fit the random-effects model directly on the data.
# A constant is added explicitly because the design matrix has none.
# model_re still exposes .dependent and .exog for later refits.
model_re = RandomEffects(endog, sm.add_constant(exog))
re_res = model_re.fit()

# Print the estimation summary.
print(re_res.summary)


# Hausman specification test: fixed effects vs. random effects (linearmodels).
# Both models use the same dependent variable as the main model so the test
# speaks to the specification that is actually being estimated.
hausman_dep = endog.name

# Entity fixed effects absorb the intercept, so none is requested here.
formula = f'{hausman_dep} ~ ' + ' + '.join(exog_vars) + ' + EntityEffects'
model_fe_lm = PanelOLS.from_formula(formula, data=df)
fe_res_lm = model_fe_lm.fit()

# linearmodels formulas do not add an intercept implicitly; "1 +" requests it.
formula_re = f'{hausman_dep} ~ 1 + ' + ' + '.join(exog_vars)
model_re_lm = RandomEffects.from_formula(formula_re, data=df)
re_res_lm = model_re_lm.fit()

# Side-by-side coefficient table.
comparison = compare({'FE': fe_res_lm, 'RE': re_res_lm}, stars=True)
print(comparison)

# Hausman statistic H = d' [V_FE - V_RE]^(-1) d over the coefficients that
# both models estimate (the RE intercept is excluded). Both fits use the
# default (unadjusted) covariance, which is what the classical test assumes.
shared = [name for name in fe_res_lm.params.index
          if name in re_res_lm.params.index]
coef_gap = (fe_res_lm.params[shared] - re_res_lm.params[shared]).to_numpy()
cov_gap = (fe_res_lm.cov.loc[shared, shared]
           - re_res_lm.cov.loc[shared, shared]).to_numpy()
# The covariance difference is not guaranteed to be positive definite in
# finite samples, so a pseudo-inverse is used instead of a plain inverse.
hausman_stat = float(coef_gap @ np.linalg.pinv(cov_gap) @ coef_gap)
hausman_dof = len(shared)
hausman_pvalue = float(chi2.sf(hausman_stat, hausman_dof))
print({'Hausman statistic': hausman_stat,
       'Degrees of freedom': hausman_dof,
       'p-value': hausman_pvalue})

# Heteroskedasticity check: Breusch-Pagan test on pooled-OLS residuals.
pooled_design = sm.add_constant(exog)
pooled_fit = sm.OLS(endog, pooled_design).fit()
ols_resid = pooled_fit.resid

# het_breuschpagan returns four values, labelled here in order.
het_test = het_breuschpagan(ols_resid, pooled_design)
labels = ['LM Statistic', 'LM-Test p-value', 'F-Statistic', 'F-Test p-value']
print({label: value for label, value in zip(labels, het_test)})

# Serial-correlation check: panel Durbin-Watson statistic on the RE residuals.
# First differences are taken within each stock only, so the statistic is not
# contaminated by the jump from one stock's last residual to the next stock's
# first. Gaps in a stock's dates are not adjusted for.
resid_sorted = re_res.resids.sort_index()
within_diff = resid_sorted.groupby(level=0).diff().dropna()
dw = float((within_diff ** 2).sum() / (resid_sorted ** 2).sum())
print("\nDurbin-Watson statistic:", dw)


# Refit the random-effects model with standard errors clustered by entity.
clustered_model = RandomEffects(model_re.dependent, model_re.exog)
re_res_robust = clustered_model.fit(
    cov_type='clustered',
    cluster_entity=True,
)
print(re_res_robust.summary)


                        RandomEffects Estimation Summary                        
Dep. Variable:         forward_ret_3d   R-squared:                        0.0242
Estimator:              RandomEffects   R-squared (Between):              0.3194
No. Observations:                1989   R-squared (Within):              -0.0203
Date:                Sun, Mar 16 2025   R-squared (Overall):              0.0242
Time:                        14:31:44   Log-likelihood                    3479.0
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      4.4582
Entities:                         111   P-value                           0.0000
Avg Obs:                       17.919   Distribution:                 F(11,1977)
Min Obs:                       12.000                                           
Max Obs:                       18.000   F-statistic (robust):             4.4582
                            