In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
from linearmodels import PanelOLS, RandomEffects
from linearmodels.panel import compare
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

# Load the daily per-stock sentiment table.
# 'stock_code' is read as text so that zero-padded codes are not parsed as
# numbers first (a single missing value would otherwise turn the whole column
# into floats and produce codes such as "0002.0" after padding).
df = pd.read_csv('stock_daily_sentiment.csv', dtype={'stock_code': str})
df['date'] = pd.to_datetime(df['date'])
# Left-pad every code with zeros to a width of 6 characters.
df['stock_code'] = df['stock_code'].astype(str).str.zfill(6)


def granger_analysis(data, stock_code, sentiment_vars, return_vars, maxlag=5):
    """
    Run pairwise Granger causality tests (sentiment -> return) for one stock.

    Parameters:
    data: DataFrame with 'stock_code' and 'date' columns plus the sentiment
        and return columns named in the two lists below
    stock_code: str, stock code to select (matched by equality on 'stock_code')
    sentiment_vars: list, names of the sentiment columns (candidate causes)
    return_vars: list, names of the return columns (candidate effects)
    maxlag: int, largest lag order to test

    Returns:
    dict keyed by "<sentiment_var>-><return_var>". Each value maps a lag
    (1..maxlag) to {'ssr_ftest_pvalue': ..., 'ssr_chi2test_pvalue': ...}.
    Pairs with no more than `maxlag` complete rows, or whose test raises,
    are left out of the result (a failed test prints a warning).
    """
    # Keep only this stock's rows, ordered in time.
    stock_data = data[data['stock_code'] == stock_code].sort_values('date')

    results = {}
    for sent_var in sentiment_vars:
        for ret_var in return_vars:
            # grangercausalitytests checks whether the SECOND column
            # Granger-causes the FIRST one, so the return (effect) must come
            # first and the sentiment (cause) second for "sentiment -> return".
            test_data = pd.DataFrame({
                'return': stock_data[ret_var],
                'sentiment': stock_data[sent_var]
            }).dropna()

            # Not enough complete observations for the requested lag order.
            if len(test_data) <= maxlag:
                continue

            relation = f"{sent_var}->{ret_var}"
            try:
                granger_test = grangercausalitytests(test_data, maxlag=maxlag, verbose=False)
            except Exception as exc:
                # Best effort: skip this pair, but say so instead of hiding it.
                print(f"警告: 股票 {stock_code} 的 {relation} Granger检验失败: {exc}")
                continue

            # Element [0] of each lag's result holds the test statistics;
            # index 1 of each statistic tuple is the p-value.
            results[relation] = {
                lag: {
                    'ssr_ftest_pvalue': granger_test[lag][0]['ssr_ftest'][1],
                    'ssr_chi2test_pvalue': granger_test[lag][0]['ssr_chi2test'][1]
                }
                for lag in range(1, maxlag + 1)
            }

    return results

# Columns to test: sentiment measures (candidate causes) and forward returns
sentiment_variables = [
    'avg_sentiment',
    'sentiment_std',
    'positive_ratio',
    'negative_ratio',
    'avg_intensity',
]
return_variables = [
    'forward_ret_1d',
    'forward_ret_3d',
    'forward_ret_5d',
]

# Run the Granger causality tests for every stock
def run_all_stocks_granger(df):
    """
    Apply granger_analysis to each distinct stock code found in `df`.

    Stocks are visited in sorted code order and a progress line is printed
    for each one. The module-level `sentiment_variables` and
    `return_variables` lists select the columns to test.

    Returns a dict mapping stock code to that stock's granger_analysis result.
    """
    collected = {}
    # Sorted so the processing order is the same on every run.
    for code in sorted(df['stock_code'].unique()):
        print(f"Processing stock: {code}")
        collected[code] = granger_analysis(df, code, sentiment_variables, return_variables)
    return collected

# Run the per-stock Granger tests over the whole dataset
# (prints one "Processing stock: ..." line per stock code).
granger_results = run_all_stocks_granger(df)

# Aggregate the per-stock test results into significance shares
def analyze_results(granger_results, significance_level=0.05):
    """
    Compute, for every tested relation, the share of stocks that show it.

    Parameters:
    granger_results: dict, stock -> {relation -> {lag -> {'ssr_ftest_pvalue': p, ...}}}
    significance_level: float, a p-value strictly below this counts as significant

    Returns:
    dict mapping relation -> (number of stocks significant at any lag) /
    (total number of stocks in `granger_results`). A stock contributes at
    most once per relation.
    """
    hits = {}
    for per_stock in granger_results.values():
        for pair_label, by_lag in per_stock.items():
            hits.setdefault(pair_label, 0)
            # Significant if at least one lag order passes the threshold.
            passes = any(
                entry['ssr_ftest_pvalue'] < significance_level
                for entry in by_lag.values()
            )
            if passes:
                hits[pair_label] += 1

    # Denominator is every stock processed, not only those with this relation.
    n_stocks = len(granger_results)
    return {pair_label: count / n_stocks for pair_label, count in hits.items()}

# Print the overall summary: for each relation, the share of stocks with a
# significant Granger result (formatted as a percentage with two decimals).
significance_ratios = analyze_results(granger_results)
print("\n显著性统计结果：")
for relation, ratio in significance_ratios.items():
    print(f"{relation}: {ratio:.2%} 的股票显示显著的Granger因果关系")

# Significance statistics broken down by industry
def analyze_results_by_industry(df, granger_results, significance_level=0.05):
    """
    Compute, per industry, the share of stocks showing each relation.

    Parameters:
    df: DataFrame with 'stock_code' and 'industry' columns
    granger_results: dict, stock -> {relation -> {lag -> {'ssr_ftest_pvalue': p, ...}}}
    significance_level: float, a p-value strictly below this counts as significant

    Returns:
    dict mapping industry -> {relation -> share}, where share is the number
    of that industry's stocks significant at any lag divided by the number of
    distinct stock codes listed under the industry in `df`. Every industry in
    `df` appears with every relation seen in `granger_results`, in
    first-seen order. Stocks whose industry cannot be resolved are reported
    with a printed warning and skipped.
    """
    # One industry per stock. If a stock is listed under several industries,
    # its first listing is used, so the lookup below always yields a scalar.
    stock_industry_map = (
        df[['stock_code', 'industry']]
        .drop_duplicates('stock_code')
        .set_index('stock_code')['industry']
    )

    # Every relation seen in any stock, in first-seen order. A dict is used
    # instead of a set so the output order is the same on every run.
    all_relations = list(dict.fromkeys(
        relation
        for results in granger_results.values()
        for relation in results
    ))

    # Start every (industry, relation) counter at zero.
    industry_stats = {
        industry: dict.fromkeys(all_relations, 0)
        for industry in df['industry'].unique()
    }

    # Count, per industry, the stocks that are significant at any lag.
    for stock, results in granger_results.items():
        counters = None
        if stock in stock_industry_map.index:
            counters = industry_stats.get(stock_industry_map.loc[stock])
        if counters is None:
            print(f"警告: 股票 {stock} 未找到对应的行业信息")
            continue

        for relation, lags in results.items():
            if any(values['ssr_ftest_pvalue'] < significance_level
                   for values in lags.values()):
                counters[relation] += 1

    # Turn the counts into shares of each industry's distinct stocks.
    normalized = {}
    for industry, counters in industry_stats.items():
        industry_stock_count = len(
            df.loc[df['industry'] == industry, 'stock_code'].unique()
        )
        if industry_stock_count > 0:  # guard against division by zero
            normalized[industry] = {
                relation: count / industry_stock_count
                for relation, count in counters.items()
            }
        else:
            normalized[industry] = counters

    return normalized
# Print the overall summary.
# NOTE(review): this recomputes and reprints the same overall statistics that
# were already printed earlier in this cell, only under a different heading.
significance_ratios = analyze_results(granger_results)
print("\n整体显著性统计结果：")
for relation, ratio in significance_ratios.items():
    print(f"{relation}: {ratio:.2%} 的股票显示显著的Granger因果关系")
# Print the per-industry summary: one section per industry, one line per relation.
industry_stats = analyze_results_by_industry(df, granger_results)
print("\n行业显著性统计结果：")
for industry, relations in industry_stats.items():
    print(f"\n行业: {industry}")
    for relation, ratio in relations.items():
        print(f"{relation}: {ratio:.2%} 的股票显示显著的Granger因果关系")


Processing stock: 000002
Processing stock: 000069
Processing stock: 000538
Processing stock: 000568
Processing stock: 000630
Processing stock: 000725
Processing stock: 000858
Processing stock: 000878
Processing stock: 000895
Processing stock: 000933
Processing stock: 000960
Processing stock: 000977
Processing stock: 001979
Processing stock: 002049
Processing stock: 002074
Processing stock: 002129
Processing stock: 002138
Processing stock: 002230
Processing stock: 002304
Processing stock: 002340
Processing stock: 002371
Processing stock: 002410
Processing stock: 002415
Processing stock: 002439
Processing stock: 002460
Processing stock: 002463
Processing stock: 002466
Processing stock: 002475
Processing stock: 002507
Processing stock: 002568
Processing stock: 002594
Processing stock: 300014
Processing stock: 300015
Processing stock: 300035
Processing stock: 300059
Processing stock: 300122
Processing stock: 300146
Processing stock: 300223
Processing stock: 300253
Processing stock: 300274
