In [None]:
import numpy as np
import pandas
import shap
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sns
from spsspro.algorithm import supervised_learning
import alepython
from alepython import ale_plot
import PyALE
from PyALE import ale
import random

In [None]:
# Load the sample dataset; the first column is used as the target and every
# remaining column as a feature.
data = pandas.read_csv("0416数据 1%.CSV")
data_y, data_x = data.iloc[:, 0], data.iloc[:, 1:]
# Fit the SPSSPRO XGBoost *regression* wrapper, passing only the data so all
# other parameters keep their defaults (place the cursor inside the call's
# parentheses and press Shift+Tab to list them). For the layout of the returned
# result, see the SPSSPRO template analysis report; later cells read the fitted
# estimator from result['model'].
result = supervised_learning.xgboost_regression(data_y=data_y, data_x=data_x)

In [None]:
# SHAP analysis of the fitted model.
explainer = shap.TreeExplainer(result['model'])
# Two views of the same attributions: `shap_values` feeds the legacy plotting
# functions (summary_plot / dependence_plot), `shap_values2` is the object used
# by the shap.plots.* API further down.
shap_values = explainer.shap_values(data_x)
shap_values2 = explainer(data_x)
print(shap_values)
print(shap_values2)
# Global feature importance as a bar chart.
shap.summary_plot(shap_values, data_x, feature_names=data_x.columns, plot_type="bar")
plt.show()

# Numeric view of the same ranking: mean absolute SHAP value per feature.
feature_importance = pandas.DataFrame()
feature_importance['feature'] = data_x.columns
feature_importance['importance'] = np.abs(shap_values).mean(0)
# sort_values returns a new frame rather than sorting in place, so the result
# has to be kept; otherwise the printed table stays in column order.
feature_importance = feature_importance.sort_values('importance', ascending=False)
print(feature_importance)

# Visualisation: dot summary plot, kept open (show=False) so it can be saved.
shap.summary_plot(shap_values, data_x, feature_names=data_x.columns, plot_type="dot", show=False)
plt.savefig('shap_all1%.png',dpi=600)
# Display and release the saved figure before plotting again; it would
# otherwise remain the current figure and the bar plot below would be drawn
# on top of it.
plt.show()
# Local explanation for the row at position 1, then the global bar plot.
shap.plots.bar(shap_values2[1], show_data=True)
shap.plots.bar(shap_values2, show_data=True)

In [None]:
# Cohort bar charts: mean absolute SHAP value per feature, computed separately
# for each group of rows.
def _gender_cohort(gender_value):
    """Return "Women" when the raw "gender" value equals 0, otherwise "Men"."""
    if gender_value == 0:
        return "Women"
    return "Men"


def _age_cohort(age_value):
    """Return the cohort label for a raw "age" value.

    <= 32 -> "90s", >= 43 -> "70s", anything in between -> "80s"
    (presumably birth decades; confirm against the survey year).
    """
    if age_value <= 32:
        return "90s"
    if age_value >= 43:
        return "70s"
    return "80s"


row_count = shap_values2.shape[0]

sex = [_gender_cohort(shap_values2[row, "gender"].data) for row in range(row_count)]
shap.plots.bar(shap_values2.cohorts(sex).abs.mean(0))

# Assumes shap_values2 is an Explanation object and "age" is one of its feature columns.
year = [_age_cohort(shap_values2[row, "age"].data) for row in range(row_count)]
shap.plots.bar(shap_values2.cohorts(year).abs.mean(0))

In [None]:
# Dependence plots, one per feature, drawn in the order listed.
# Each entry is (feature, value passed as interaction_index); only work_hour
# is paired with a second feature (fam_income).
dependence_specs = [
    ('reg_mobility', None),
    ('reg_econ', None),
    ('work_hour', 'fam_income'),
    ('reg_eduratio', None),
    ('education', None),
    ('reg_medicare', None),
    ('reg_childcare', None),
    ('internet', None),
    ('insurance', None),
]
for dep_feature, dep_interaction in dependence_specs:
    shap.dependence_plot(dep_feature, shap_values, data_x, interaction_index=dep_interaction)

In [None]:
# Interaction plot: compute SHAP interaction values for every row of data_x
# with the tree explainer and pass them to the summary plot.
shap_interaction_values = explainer.shap_interaction_values(data_x)
shap.summary_plot(shap_interaction_values, data_x)
# Print the whole `result` object for inspection.
print(result)

In [None]:
# ALE plots (alepython) at a 9x6 default figure size.
plt.rcParams["figure.figsize"] = (9, 6)
# First-order ALE for two individual features; no `bins` is passed here, so
# the library default applies.
for single_feature in ('age', 'reg_mobility'):
    ale_plot(result['model'], data_x, single_feature, monte_carlo=True)
# ALE plot for the pair of the same two features, with 10 bins.
ale_plot(
    model=result['model'],
    train_set=data_x,
    features=["age", "reg_mobility"],
    bins=10,
    monte_carlo=True,
)

In [None]:
# One-dimensional ALE plots (alepython): one figure per feature, 10 bins each,
# with monte_carlo enabled. Features are plotted in the order listed.
ale_1d_features = [
    "fam_size",
    "reg_svc",
    "age",
    "reg_urgap",
    "reg_mobility",
    "reg_econ",
    "reg_eduratio",
    "fam_income",
    "education",
    "work_hour",
    "internet",
    "social_trust",
    "insurance",
]
for ale_1d_feature in ale_1d_features:
    ale_plot(
        model=result['model'],
        train_set=data_x,
        features=[ale_1d_feature],
        bins=10,
        monte_carlo=True,
    )

In [None]:
# Cross (two-feature) ALE plots (alepython) on a wider 16x9 default figure.
plt.rcParams["figure.figsize"] = (16, 9)
ale_feature_pairs = [
    ["age", "education"],
    ["fam_income_rel", "work_hour"],
    ["reg_mobility", "fam_income"],
]
for ale_feature_pair in ale_feature_pairs:
    ale_plot(
        model=result['model'],
        train_set=data_x,
        features=ale_feature_pair,
        bins=10,
        monte_carlo=True,
    )

In [None]:
# ALE plots (PyALE)
## 1D - continuous - no CI
# One ALE computation per feature on the full feature frame with a 50-point
# grid; each plot is saved as "<feature>.png" at 600 dpi. After the loop,
# ale_eff holds the result for the last feature (reg_econ).
for ale_feature in ("reg_mobility", "fam_size", "reg_svc", "age", "reg_urgap", "reg_econ"):
    ale_eff = ale(
        X=data_x, model=result['model'], feature=[ale_feature], grid_size=50, include_CI=False
    )
    # savefig writes the current figure, so this assumes ale() has just drawn
    # its plot there. The extension is always spelled out: the original
    # 'reg_svc' name had none and depended on matplotlib's default format.
    plt.savefig(ale_feature + '.png', dpi=600)

In [None]:
## 1D - continuous - with 95% CI
# Fixed seed so the same subsample is drawn on every run.
random.seed(123)
# Use at most 1000 rows: random.sample raises ValueError when asked for more
# items than the population holds, which would happen on a small extract.
sample_size = min(1000, len(data_x.index))
X_sample = data_x.loc[random.sample(data_x.index.to_list(), sample_size), :]
# One ALE computation per feature on the subsample, 50-point grid, with a
# confidence interval at level C=0.95. After the loop, ale_eff holds the
# result for the last feature (internet).
for ci_feature in ("reg_mobility", "reg_eduratio", "social_trust", "work_hour", "internet"):
    ale_eff = ale(
        X=X_sample, model=result['model'], feature=[ci_feature], grid_size=50, include_CI=True, C=0.95
    )

In [None]:
# ALE plots (PyALE)
## 1D - discrete
# Both features are passed without grid_size or CI options, so the library
# defaults apply (presumably it detects them as discrete on its own; verify).
# ale_eff ends up holding the result for the last feature (insurance).
for discrete_feature in ("education", "insurance"):
    ale_eff = ale(model=result['model'], X=data_x, feature=[discrete_feature])

In [None]:
# 2D ALE plots (PyALE): one call per feature pair with grid_size=100.
# ale_eff ends up holding the result for the last pair.
pyale_feature_pairs = (
    ["age", "education"],
    ["work_hour", "fam_income_rel"],
)
for pyale_feature_pair in pyale_feature_pairs:
    ale_eff = ale(
        X=data_x,
        model=result['model'],
        feature=pyale_feature_pair,
        grid_size=100,
    )