In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
from linearmodels import PanelOLS, RandomEffects
from linearmodels.panel import compare
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

# Load the daily per-stock sentiment table, normalise the two key columns,
# and index the frame by (stock_code, date) so it can be used as a panel.
df = (
    pd.read_csv('stock_daily_sentiment.csv')
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        # Left-pad codes to six characters (leading zeros are presumably lost
        # when the CSV column is parsed as numbers -- TODO confirm).
        stock_code=lambda frame: frame['stock_code'].astype(str).str.zfill(6),
    )
    .set_index(['stock_code', 'date'])
)

In [3]:
# Column groups: the *_Y frames hold price / volume / return columns, the *_X
# frames hold sentiment columns. "All" is the full set, "Sel" a narrower subset.
all_y_cols = ['open', 'close', 'high', 'low', 'volume', 'amount', 'amplitude',
              'pct_change', 'price_change', 'turnover_rate',
              'forward_ret_1d', 'forward_ret_3d', 'forward_ret_5d']
all_x_cols = ['avg_sentiment', 'sentiment_std', 'positive_ratio', 'negative_ratio',
              'avg_intensity', 'comment_count', 'avg_positive_prob', 'avg_negative_prob',
              'sentiment_net', 'sentiment_consensus',
              'ma_3d', 'std_3d', 'sentiment_change_3d',
              'ma_5d', 'std_5d', 'sentiment_change_5d',
              'ma_10d', 'std_10d', 'sentiment_change_10d']
sel_y_cols = ['close', 'volume', 'amount', 'amplitude', 'pct_change',
              'price_change', 'turnover_rate',
              'forward_ret_1d', 'forward_ret_3d', 'forward_ret_5d']
# 'positive_ratio' is not part of this selection (it was commented out).
sel_x_cols = ['avg_sentiment', 'sentiment_std',
              'avg_intensity', 'comment_count',
              'sentiment_consensus', 'ma_3d', 'std_3d',
              'sentiment_change_3d']

All_Y = df[all_y_cols]
All_X = df[all_x_cols]
Sel_Y = df[sel_y_cols]
Sel_X = df[sel_x_cols]

# Pairwise correlation matrix of every sentiment feature.
corr_matrix = All_X.corr()

# Collect the strongly correlated pairs (|r| above the threshold). Keeping only
# positions with row < col skips the diagonal and reports each pair once.
HIGH_CORR_THRESHOLD = 0.8
row_pos, col_pos = np.where(corr_matrix.abs() > HIGH_CORR_THRESHOLD)
high_corr = []
for r, c in zip(row_pos, col_pos):
    if r < c:
        high_corr.append((corr_matrix.index[r], corr_matrix.columns[c], corr_matrix.iloc[r, c]))

# Show each strongly correlated pair with its coefficient.
for left_name, right_name, rho in high_corr:
    print(f"{left_name} - {right_name}: {rho:.3f}")

avg_sentiment - positive_ratio: 0.992
avg_sentiment - negative_ratio: -0.992
avg_sentiment - avg_positive_prob: 1.000
avg_sentiment - avg_negative_prob: -1.000
avg_sentiment - sentiment_net: 0.992
positive_ratio - negative_ratio: -1.000
positive_ratio - avg_positive_prob: 0.992
positive_ratio - avg_negative_prob: -0.992
positive_ratio - sentiment_net: 1.000
negative_ratio - avg_positive_prob: -0.992
negative_ratio - avg_negative_prob: 0.992
negative_ratio - sentiment_net: -1.000
avg_positive_prob - avg_negative_prob: -1.000
avg_positive_prob - sentiment_net: 0.992
avg_negative_prob - sentiment_net: -0.992
ma_3d - ma_5d: 0.929
ma_3d - ma_10d: 0.866
std_3d - std_5d: 0.816
ma_5d - ma_10d: 0.940
std_5d - std_10d: 0.884


In [5]:
# Regress the one-day forward return on a set of sentiment features, with both
# entity (stock) and time fixed effects.
ret1d_feature_cols = ['avg_sentiment', 'positive_ratio', 'sentiment_std', 'comment_count',
                      'sentiment_consensus', 'ma_3d', 'std_3d']
y1 = df['forward_ret_1d']
X1 = df[ret1d_feature_cols]

# NOTE(review): 'avg_sentiment' and 'positive_ratio' enter together although the
# correlation listing printed earlier shows r = 0.992 between them -- confirm
# that including both is intended.
two_way_fe_model = PanelOLS(
    y1,
    sm.add_constant(X1),
    entity_effects=True,
    time_effects=True,
)
res1 = two_way_fe_model.fit()
print(res1)

                          PanelOLS Estimation Summary                           
Dep. Variable:         forward_ret_1d   R-squared:                        0.0103
Estimator:                   PanelOLS   R-squared (Between):             -0.3414
No. Observations:                1879   R-squared (Within):               0.0086
Date:                Sun, Mar 16 2025   R-squared (Overall):             -0.0083
Time:                        15:23:29   Log-likelihood                    4499.4
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      2.5939
Entities:                         111   P-value                           0.0116
Avg Obs:                       16.928   Distribution:                  F(7,1745)
Min Obs:                       11.000                                           
Max Obs:                       17.000   F-statistic (robust):             2.5939
                            

In [11]:
# Batch two-way fixed-effects regressions: one PanelOLS model per dependent
# variable, all sharing the same sentiment regressors. Results are printed,
# collected into `summary_df`, and exported to an Excel workbook.

# Convert the date column to datetime before it becomes an index level.
# NOTE(review): `data` is not created by any cell visible here -- it must
# already exist in the kernel (presumably the same table as the CSV loaded
# at the top; confirm), otherwise this cell fails on a fresh run.
data['date'] = pd.to_datetime(data['date'])

# Index by (entity, time) for the panel estimator.
data1 = data.set_index(['stock_code','date'])

# Dependent variables: one regression is run for each.
y_vars = ['close','volume','amount','amplitude','pct_change',
          'price_change','turnover_rate','forward_ret_1d',
          'forward_ret_3d','forward_ret_5d']

# Independent variables shared by every regression.
X_vars = ['avg_sentiment','sentiment_std','comment_count',
            'sentiment_consensus','ma_3d','std_3d']

# The regressor matrix does not depend on the dependent variable, so it is
# built (and given its constant column) once instead of once per model.
X = data1[X_vars]
exog = sm.add_constant(X)

# Per-dependent-variable results, keyed by the dependent variable's name.
regression_results = {}

# Fit one model per dependent variable.
for y_var in y_vars:
    y = data1[y_var]
    model = PanelOLS(y, exog,
                     entity_effects=True,
                     time_effects=True)
    fit = model.fit()

    # Keep only the pieces that are reported below.
    regression_results[y_var] = {
        'R-squared': fit.rsquared,
        'R-squared Within': fit.rsquared_within,
        'R-squared Between': fit.rsquared_between,
        'R-squared Overall': fit.rsquared_overall,
        'N_obs': fit.nobs,
        'Parameters': fit.params,
        't-stats': fit.tstats,
        'P-values': fit.pvalues,
        'F-statistic': fit.f_statistic.stat
    }

# Print a text summary for every regression.
for y_var, entry in regression_results.items():
    print(f"\n=== 因变量: {y_var} ===")
    print(f"R-squared: {entry['R-squared']:.4f}")
    print(f"R-squared Within: {entry['R-squared Within']:.4f}")
    print(f"R-squared Between: {entry['R-squared Between']:.4f}")
    print(f"R-squared Overall: {entry['R-squared Overall']:.4f}")
    print(f"观测数: {entry['N_obs']}")
    print(f"F-statistic: {entry['F-statistic']:.4f}")
    print("\n系数估计:")
    for var, coef, tstat, pval in zip(
        entry['Parameters'].index,
        entry['Parameters'],
        entry['t-stats'],
        entry['P-values']
    ):
        print(f"{var:20} {coef:10.4f} (t={tstat:8.4f}) p={pval:.4f}")

# Build the summary table in a single constructor call. Growing a DataFrame
# with pd.concat inside a loop re-copies it on every iteration, and starting
# from an empty frame makes the resulting dtypes depend on the pandas version.
summary_metrics = ['R-squared', 'R-squared Within', 'R-squared Between',
                   'R-squared Overall', 'N_obs', 'F-statistic']
summary_records = [
    {'Y Variable': y_var, **{metric: entry[metric] for metric in summary_metrics}}
    for y_var, entry in regression_results.items()
]
# Passing `columns` fixes the column order (and keeps the columns even if
# there are no records).
summary_df = pd.DataFrame(summary_records, columns=['Y Variable'] + summary_metrics)

print("\n=== 回归结果汇总 ===")
print(summary_df)

# Export to Excel: one summary sheet plus one coefficient sheet per regression.
with pd.ExcelWriter('panel_regression_results.xlsx') as writer:
    summary_df.to_excel(writer, sheet_name='Summary', index=False)

    # Detailed coefficient table for each regression; the sheet name is the
    # dependent variable's name cut to at most 30 characters.
    for y_var, entry in regression_results.items():
        pd.DataFrame({
            'Coefficient': entry['Parameters'],
            't-statistic': entry['t-stats'],
            'P-value': entry['P-values']
        }).to_excel(writer, sheet_name=y_var[:30])



=== 因变量: close ===
R-squared: 0.0251
R-squared Within: 0.0091
R-squared Between: -0.0002
R-squared Overall: -0.0003
观测数: 1879
F-statistic: 7.4828

系数估计:
const                   60.9146 (t= 19.6998) p=0.0000
avg_sentiment           -0.6804 (t= -0.7462) p=0.4557
sentiment_std           -1.6225 (t= -0.3817) p=0.7027
comment_count           -0.0043 (t= -3.9101) p=0.0001
sentiment_consensus      1.8667 (t=  0.9193) p=0.3581
ma_3d                    6.0344 (t=  4.3634) p=0.0000
std_3d                   1.8771 (t=  1.2184) p=0.2232

=== 因变量: volume ===
R-squared: 0.2537
R-squared Within: 0.2486
R-squared Between: 0.1610
R-squared Overall: 0.1568
观测数: 1879
F-statistic: 98.9422

系数估计:
const                496921.0190 (t=  2.8377) p=0.0046
avg_sentiment        133915.5635 (t=  2.5933) p=0.0096
sentiment_std        -625448.5532 (t= -2.5982) p=0.0095
comment_count         1375.1300 (t= 22.2734) p=0.0000
sentiment_consensus  596138.7817 (t=  5.1841) p=0.0000
ma_3d                104134.1859 (t=  1