In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score#导入交叉验证模块
from sklearn.ensemble import RandomForestRegressor#导入随机森林回归系
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, roc_curve, auc
from sklearn.preprocessing import MinMaxScaler
import shap

In [2]:
# Load the dataset. The path is a raw string so the Windows backslashes are taken
# literally instead of relying on Python leaving unrecognised escapes (e.g. "\p") untouched.
data = pd.read_csv(r"D:\pythonProject7\库存合并2.csv")

# Columns 2-24 (positions 1..23) are the features; column 1 (position 0) is the label.
X_raw = data.iloc[:, 1:24].values
y = data.iloc[:, 0].values

# Split BEFORE scaling: 80/20 into train+validation / test, then 80/20 again into
# train / validation. Sizes and random_state are unchanged from the original cell.
X_trainval_raw, X_test_raw, y_trainval, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_trainval_raw, y_trainval, test_size=0.2, random_state=42
)

# Min-max normalisation fitted on the training rows only, so that validation/test
# statistics do not leak into the preprocessing. The other partitions are transformed
# with the training min/max and may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)
X_trainval = scaler.transform(X_trainval_raw)
# Keep `X` available as the scaled full feature matrix, as the original cell did.
X = scaler.transform(X_raw)


In [3]:
# Build a 100-tree random forest regressor with a fixed seed and fit it on the
# training partition (fit() returns the estimator itself, so the call can be chained).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the training, validation and test partitions, in that order.
y_train_pred, y_val_pred, y_test_pred = (
    rf_model.predict(features) for features in (X_train, X_val, X_test)
)



In [4]:
# Coefficient of determination (R^2) for each partition: train, validation, test.
r2_train, r2_val, r2_test = (
    r2_score(actual, predicted)
    for actual, predicted in (
        (y_train, y_train_pred),
        (y_val, y_val_pred),
        (y_test, y_test_pred),
    )
)

# Print one labelled line per partition.
for _caption, _value in (
    ('训练集 R方:', r2_train),
    ('验证集 R方:', r2_val),
    ('测试集 R方:', r2_test),
):
    print(_caption, _value)

训练集 R方: 0.999814908984557
验证集 R方: 0.99885731966493
测试集 R方: 0.9985093820130309


In [None]:
# Display names for the 23 feature columns, listed in the order they are passed to
# the SHAP plot below (presumably the column order of the data file — verify against the CSV).
columns_name_list = [
    'WMHD/1000',
    'BETAP',
    'PLHI2/1000',
    'PNBI1LSOURCE/1000',
    'PNBI1RSOURCE/1000',
    'PNBI2LSOURCE/1000',
    'PNBI2RSOURCE/1000',
    'G109',
    'G107',
    'G106',
    'KAPPA',
    'AMINOR',
    'Q95',
    'LI',
    'TRITOP',
    'TRIBOT',
    'DRSEP',
    'PXUV33',
    'PXUV7',
    'WUta',
    'pcrl01/1000000',
    'pecrh',
    'n3/n6',
]

# Explain the fitted forest on the test partition and draw the SHAP summary plot.
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(
    shap_values,
    X_test,
    feature_names=columns_name_list,
)


In [None]:
# Impurity-based feature importances of the fitted random forest.
importances = rf_model.feature_importances_

# Feature names come from the same column slice that was used to build the features.
feature_names = data.columns[1:24]

# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]

# One bar per feature. The count is taken from `importances` itself; the original
# cell referenced `x_array`, which is never defined and raised a NameError.
bar_positions = np.arange(len(importances))

# Bar chart of the importances in descending order, labelled with the feature names.
fig, ax = plt.subplots()
ax.bar(bar_positions, importances[indices])
ax.set_xticks(bar_positions)
ax.set_xticklabels(feature_names[indices], rotation=90)  # vertical labels for readability
ax.set_title("Feature Importances")
ax.set_xlabel("Features")
ax.set_ylabel("Importance")
plt.show()