In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import ttest_ind
import numpy as np

np.random.seed(42)
filtered_df = pd.read_csv("../processed_data/bond_data_normalized_w_senti.csv")

# 第51列是因变量，第1到50列和第56列是自变量
X_A = filtered_df.iloc[:, 1:50]  # 实验 A 的自变量（第1到第50列）
X_B = filtered_df.iloc[:, list(range(1,50)) + [56]]  # 实验 B 的自变量（第1到第50列加上第56列）
y = filtered_df.iloc[:, 50]  # 因变量（第51列）

# 拆分数据集为训练集和测试集
X_A_train, X_A_test, y_train, y_test = train_test_split(X_A, y, test_size=0.2, random_state=42)
X_B_train, X_B_test = train_test_split(X_B, test_size=0.2, random_state=42)[0:2]  # 同样的拆分比例

# 实验 A：构建随机森林模型
rf_A = RandomForestRegressor(random_state=42)
rf_A.fit(X_A_train, y_train)
y_pred_A = rf_A.predict(X_A_test)

# 实验 B：构建随机森林模型
rf_B = RandomForestRegressor(random_state=42)
rf_B.fit(X_B_train, y_train)
y_pred_B = rf_B.predict(X_B_test)

# 计算性能指标：R²、MSE、MAE、RMSE
r2_A = r2_score(y_test, y_pred_A)
mse_A = mean_squared_error(y_test, y_pred_A)
mae_A = mean_absolute_error(y_test, y_pred_A)
rmse_A = mean_squared_error(y_test, y_pred_A, squared=False)

r2_B = r2_score(y_test, y_pred_B)
mse_B = mean_squared_error(y_test, y_pred_B)
mae_B = mean_absolute_error(y_test, y_pred_B)
rmse_B = mean_squared_error(y_test, y_pred_B, squared=False)

# 比较性能差异
performance_difference = {
    "Metric": ["R²", "MSE", "MAE", "RMSE"],
    "Experiment A": [r2_A, mse_A, mae_A, rmse_A],
    "Experiment B": [r2_B, mse_B, mae_B, rmse_B],
    "Improvement": [r2_B - r2_A, mse_A - mse_B, mae_A - mae_B, rmse_A - rmse_B],
}

performance_df = pd.DataFrame(performance_difference)

# 使用统计学方法（t检验）评估预测效果的显著性
t_stat, p_value = ttest_ind(y_pred_A, y_pred_B)

t_test_results = {
    "T-statistic": [t_stat],
    "P-value": [p_value],
    "Significant": ["Yes" if p_value < 0.05 else "No"],
}
t_test_results_df = pd.DataFrame(t_test_results)

print(performance_df)
print(t_test_results_df)



  Metric  Experiment A  Experiment B  Improvement
0     R²      0.998295      0.998554     0.000259
1    MSE      0.000858      0.000728     0.000130
2    MAE      0.000238      0.000221     0.000017
3   RMSE      0.029296      0.026979     0.002317
   T-statistic   P-value Significant
0    -0.015888  0.987324          No




In [28]:
# 比较性能差异
performance_difference = {
    "Metric": ["R²", "MSE", "MAE", "RMSE"],
    "Experiment A": [r2_A, mse_A, mae_A, rmse_A],
    "Experiment B": [r2_B, mse_B, mae_B, rmse_B],
    "Improvement": [r2_B - r2_A, mse_A - mse_B, mae_A - mae_B, rmse_A - rmse_B],
}

performance_df = pd.DataFrame(performance_difference)

# 使用统计学方法（t检验）评估预测效果的显著性
t_stat, p_value = ttest_ind(y_pred_A, y_pred_B)

t_test_results = {
    "T-statistic": [t_stat],
    "P-value": [p_value],
    "Significant": ["Yes" if p_value < 0.05 else "No"],
}
t_test_results_df = pd.DataFrame(t_test_results)

print(performance_df)
print(t_test_results_df)

# 特征重要性
importance_A = rf_A.feature_importances_
importance_B = rf_B.feature_importances_

# 将特征重要性结果转为 DataFrame
importance_df_A = pd.DataFrame({'Feature': X_A.columns, 'Importance': importance_A}).sort_values(by='Importance', ascending=False)
importance_df_B = pd.DataFrame({'Feature': X_B.columns, 'Importance': importance_B}).sort_values(by='Importance', ascending=False)

# print("实验 A 特征重要性：")
# print(importance_df_A)
print("\n实验 B 特征重要性：")
print(importance_df_B)

  Metric  Experiment A  Experiment B  Improvement
0     R²      0.998295      0.998554     0.000259
1    MSE      0.000858      0.000728     0.000130
2    MAE      0.000238      0.000221     0.000017
3   RMSE      0.029296      0.026979     0.002317
   T-statistic   P-value Significant
0    -0.015888  0.987324          No

实验 B 特征重要性：
          Feature    Importance
48          到期收益率  9.810698e-01
47           剩余期限  6.166056e-03
49      sentiment  2.468057e-03
0      中间价:美元兑人民币  2.209399e-03
1       Shibor:3月  2.030867e-03
8      所属申万一级行业指数  1.250494e-03
40        股东权益周转率  7.946098e-04
38        应收账款周转率  2.952836e-04
5        CPI:当月同比  2.541196e-04
17           流动负债  2.320183e-04
21        经营活动现金流  2.311808e-04
18          非流动负债  2.043878e-04
45         同期国债利率  1.897009e-04
4        PPI:当月同比  1.834739e-04
13           利润总额  1.706855e-04
3   宏观经济景气指数:先行指数  1.698674e-04
42          授信剩余率  1.592414e-04
2          制造业PMI  1.563355e-04
20         股东权益合计  1.372782e-04
16           资产总计  1.33