In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import statsmodels.stats.weightstats as smw
from statsmodels.stats.weightstats import ttest_ind
from linearmodels.panel import PanelOLS
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the two daily sentiment tables from CSV files in the working directory.
# The reads are independent of each other.
industry_data = pd.read_csv('industry_daily_sentiment.csv')
stock_data = pd.read_csv('stock_daily_sentiment.csv')

In [3]:
# Bare expression: the notebook renders the industry-level frame so its
# columns and a sample of rows can be inspected.
industry_data

Unnamed: 0,board_code,date,ind_avg_sentiment,ind_sentiment_std,ind_positive_ratio,ind_negative_ratio,ind_sentiment_net,ind_sentiment_consensus,ind_avg_turnover,ind_total_volume,...,idx_low,idx_volume,idx_amount,idx_pct_change,ind_ma_3d,ind_sentiment_change_3d,ind_ma_5d,ind_sentiment_change_5d,ind_ma_10d,ind_sentiment_change_10d
0,801050,2025-02-05,-0.110496,0.176629,0.441620,0.558380,-0.116759,0.874574,1.176667,9066180,...,4533.36,40.079357,442.127555,0.000000,-0.110496,,-0.110496,,-0.110496,
1,801050,2025-02-06,-0.116996,0.189669,0.439995,0.560005,-0.120010,0.860451,1.749333,11145020,...,4553.19,44.750293,509.973515,1.151063,-0.113746,,-0.113746,,-0.113746,
2,801050,2025-02-07,-0.140405,0.172505,0.428817,0.571183,-0.142367,0.857234,1.897333,13626652,...,4587.45,52.554790,562.110154,1.176106,-0.122632,,-0.122632,,-0.122632,
3,801050,2025-02-10,-0.116963,0.131466,0.443004,0.556996,-0.113992,0.899796,1.537333,12210716,...,4662.95,45.103304,494.171590,0.502501,-0.124788,0.058528,-0.121215,,-0.121215,
4,801050,2025-02-11,-0.153395,0.185951,0.420758,0.579242,-0.158483,0.853873,1.544667,11342894,...,4695.48,47.896653,536.184461,0.205451,-0.136921,0.311115,-0.127651,,-0.127651,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,801780,2025-02-24,-0.017175,0.181909,0.500913,0.499087,0.001825,0.909534,0.390667,20424908,...,3865.03,33.995221,246.117136,-0.743855,-0.017131,-0.380363,-0.017468,-1.546275,0.033731,-10.898299
140,801780,2025-02-25,-0.012892,0.140703,0.496941,0.503059,-0.006119,0.916287,0.377333,18709865,...,3831.72,31.602787,227.173238,-0.798272,-0.005096,-0.736883,-0.018401,0.567289,0.024418,-1.160686
141,801780,2025-02-26,-0.084285,0.130792,0.455872,0.544128,-0.088256,0.903550,0.355333,17974194,...,3842.63,30.344388,215.027565,0.517620,-0.038118,-6.703421,-0.029714,2.040763,0.008215,-2.084039
142,801780,2025-02-27,-0.079052,0.142055,0.457067,0.542933,-0.085867,0.896292,0.374000,19740508,...,3838.92,33.358219,245.529052,1.287517,-0.058743,3.602634,-0.035725,0.613405,-0.014671,-1.527696


In [4]:
# NOTE: `data` is a second name for `industry_data` (no copy is made), so the
# date conversion below is visible through both names.
data = industry_data
# Parse the `date` column into datetime values before it is used as an index level.
data['date'] = pd.to_datetime(data['date'])
# Index the rows by (board_code, date) for the regressions that follow.
data1 = data.set_index(keys=['board_code', 'date'])

In [5]:
# Column subsets of the indexed panel frame: the full and the selected
# candidate lists for the Y side and the X side.
All_Y = data1[[
    'idx_close', 'idx_open', 'idx_high', 'idx_low', 'idx_volume',
    'idx_amount', 'idx_pct_change',
    'ind_avg_turnover', 'ind_total_volume', 'ind_total_amount',
]]
All_X = data1[[
    'ind_avg_sentiment', 'ind_sentiment_std', 'ind_positive_ratio',
    'ind_negative_ratio', 'ind_sentiment_net',
    'ind_sentiment_consensus', 'stock_count', 'total_comments',
    'sentiment_dispersion', 'ind_ma_3d',
    'ind_sentiment_change_3d', 'ind_ma_5d', 'ind_sentiment_change_5d',
    'ind_ma_10d', 'ind_sentiment_change_10d',
]]
Sel_Y = data1[[
    'idx_low', 'idx_volume', 'idx_amount', 'idx_pct_change',
    'ind_avg_turnover', 'ind_total_volume', 'ind_total_amount',
]]
Sel_X = data1[[
    'ind_avg_sentiment', 'ind_sentiment_std',
    'ind_sentiment_consensus', 'stock_count', 'total_comments',
    'sentiment_dispersion', 'ind_ma_3d',
    'ind_sentiment_change_3d',
]]

# Pairwise correlation matrix of the selected Y columns.
corr_matrix = Sel_Y.corr()

# Flag pairs whose absolute correlation exceeds 0.8. Only the strict upper
# triangle (k=1) is kept, so the diagonal is skipped and every pair is
# reported once, in row-major order.
strong_mask = np.triu(np.abs(corr_matrix.to_numpy()) > 0.8, k=1)
high_corr = [
    (corr_matrix.index[r], corr_matrix.columns[c], corr_matrix.iloc[r, c])
    for r, c in zip(*np.nonzero(strong_mask))
]

# Print each highly correlated pair with its coefficient.
for var1, var2, corr in high_corr:
    print(f"{var1} - {var2}: {corr:.3f}")

idx_volume - idx_amount: 0.976
idx_volume - ind_avg_turnover: 0.887
idx_volume - ind_total_amount: 0.914
idx_amount - ind_avg_turnover: 0.848
idx_amount - ind_total_amount: 0.937
ind_avg_turnover - ind_total_amount: 0.883


In [7]:
# Rebuild the panel frame: alias the industry table (no copy), parse the date
# column to datetime, then index rows by (board_code, date).
data = industry_data
data['date'] = pd.to_datetime(data['date'])
data1 = data.set_index(keys=['board_code', 'date'])

# Dependent variable and the selected regressors.
y1 = data1['idx_low']
X1 = data1[[
    'ind_avg_sentiment', 'ind_sentiment_std',
    'ind_sentiment_consensus', 'stock_count', 'total_comments',
    'sentiment_dispersion', 'ind_ma_3d',
    'ind_sentiment_change_3d',
]]

# Panel OLS with an added constant column and both entity and time effects.
panel_model = PanelOLS(y1, sm.add_constant(X1), entity_effects=True, time_effects=True)
res1 = panel_model.fit()
print(res1)

                          PanelOLS Estimation Summary                           
Dep. Variable:                idx_low   R-squared:                        0.2443
Estimator:                   PanelOLS   R-squared (Between):              0.0187
No. Observations:                 120   R-squared (Within):               0.0482
Date:                Fri, Mar 14 2025   R-squared (Overall):              0.0187
Time:                        20:24:44   Log-likelihood                   -697.19
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      3.6372
Entities:                           8   P-value                           0.0010
Avg Obs:                       15.000   Distribution:                    F(8,90)
Min Obs:                       15.000                                           
Max Obs:                       15.000   F-statistic (robust):             3.6372
                            

In [11]:
# Batch panel regressions: fit one PanelOLS model (entity + time effects) per
# dependent variable on a shared set of regressors, print a per-model report,
# collect the headline statistics in `summary_df`, and export everything to
# 'panel_regression_results.xlsx'.
#
# NOTE: this cell relies on `data` having been created by an earlier cell.

# Make sure the date column is datetime before it becomes an index level.
data['date'] = pd.to_datetime(data['date'])

# Index rows by (board_code, date).
data1 = data.set_index(['board_code','date'])

# Dependent variables, one regression each.
y_vars = ['idx_close','idx_volume','idx_amount','idx_pct_change',
               'ind_avg_turnover','ind_total_volume','ind_total_amount']

# Regressors shared by every regression.
X_vars = ['ind_avg_sentiment','ind_sentiment_std',
               'ind_sentiment_consensus','stock_count','total_comments','sentiment_dispersion','ind_ma_3d',
               'ind_sentiment_change_3d']

# The regressor matrix (with the added constant column) is the same for every
# dependent variable, so it is built once instead of once per iteration.
X = data1[X_vars]
exog = sm.add_constant(X)

# Fitted statistics keyed by dependent-variable name.
regression_results = {}

for y_var in y_vars:
    y = data1[y_var]
    model = PanelOLS(y, exog,
                     entity_effects=True,
                     time_effects=True)
    results = model.fit()

    regression_results[y_var] = {
        'R-squared': results.rsquared,
        'R-squared Within': results.rsquared_within,
        'R-squared Between': results.rsquared_between,
        'R-squared Overall': results.rsquared_overall,
        'N_obs': results.nobs,
        'Parameters': results.params,
        't-stats': results.tstats,
        'P-values': results.pvalues,
        'F-statistic': results.f_statistic.stat
    }

# Per-model text report.
for y_var, entry in regression_results.items():
    print(f"\n=== 因变量: {y_var} ===")
    print(f"R-squared: {entry['R-squared']:.4f}")
    print(f"R-squared Within: {entry['R-squared Within']:.4f}")
    print(f"R-squared Between: {entry['R-squared Between']:.4f}")
    print(f"R-squared Overall: {entry['R-squared Overall']:.4f}")
    print(f"观测数: {entry['N_obs']}")
    print(f"F-statistic: {entry['F-statistic']:.4f}")
    print("\n系数估计:")
    # Several regressor names are longer than 20 characters; size the name
    # column to the longest one (minimum 20) so the numeric columns line up.
    name_width = max([20] + [len(str(name)) for name in entry['Parameters'].index])
    for var, coef, tstat, pval in zip(
        entry['Parameters'].index,
        entry['Parameters'],
        entry['t-stats'],
        entry['P-values']
    ):
        print(f"{var:{name_width}} {coef:10.4f} (t={tstat:8.4f}) p={pval:.4f}")

# Summary table, built in one shot from a list of records rather than by
# concatenating onto an empty frame row by row. Passing `columns=` keeps the
# column order fixed even if there are no results.
summary_fields = ['R-squared', 'R-squared Within', 'R-squared Between',
                  'R-squared Overall', 'N_obs', 'F-statistic']
summary_df = pd.DataFrame(
    [
        {'Y Variable': y_var, **{field: entry[field] for field in summary_fields}}
        for y_var, entry in regression_results.items()
    ],
    columns=['Y Variable'] + summary_fields,
)

print("\n=== 回归结果汇总 ===")
print(summary_df)

# Export to Excel: one 'Summary' sheet plus one sheet of coefficients,
# t-statistics and p-values per dependent variable.
with pd.ExcelWriter('panel_regression_results.xlsx') as writer:
    summary_df.to_excel(writer, sheet_name='Summary', index=False)

    for y_var, entry in regression_results.items():
        # Sheet names are truncated to 30 characters, as in the original code.
        pd.DataFrame({
            'Coefficient': entry['Parameters'],
            't-statistic': entry['t-stats'],
            'P-value': entry['P-values']
        }).to_excel(writer, sheet_name=f'{y_var[:30]}')



=== 因变量: idx_close ===
R-squared: 0.2998
R-squared Within: 0.1297
R-squared Between: 0.0191
R-squared Overall: 0.0192
观测数: 120
F-statistic: 4.8177

系数估计:
const                 3485.1593 (t=  2.7441) p=0.0073
ind_avg_sentiment     -156.8414 (t= -0.4362) p=0.6638
ind_sentiment_std      641.9775 (t=  1.4392) p=0.1536
ind_sentiment_consensus  1264.7155 (t=  1.5297) p=0.1296
stock_count            136.4880 (t=  1.9678) p=0.0522
total_comments           0.0167 (t=  0.9092) p=0.3657
sentiment_dispersion     0.6379 (t=  0.6091) p=0.5440
ind_ma_3d             1045.5893 (t=  2.8360) p=0.0056
ind_sentiment_change_3d     0.7626 (t=  0.5557) p=0.5798

=== 因变量: idx_volume ===
R-squared: 0.4411
R-squared Within: 0.3834
R-squared Between: 0.2611
R-squared Overall: 0.2668
观测数: 120
F-statistic: 8.8781

系数估计:
const                   81.6492 (t=  0.7865) p=0.4336
ind_avg_sentiment       28.3120 (t=  0.9633) p=0.3380
ind_sentiment_std      -38.8931 (t= -1.0668) p=0.2889
ind_sentiment_consensus     4.6393 