In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from causalml.inference.meta import XGBTRegressor
from causalml.inference.meta import BaseSLearner, BaseTLearner, BaseRLearner, BaseXLearner
from causalml.inference.tree import UpliftRandomForestClassifier
from causalml.dataset import *
from causalml.metrics import *
import sklearn
from sklearn.ensemble import StackingRegressor,VotingRegressor,VotingClassifier,RandomForestRegressor
from sklearn import svm
import xgboost as xgb
import random
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from causalml.propensity import GradientBoostedPropensityModel 
from causalml.feature_selection.filters import FilterSelect
np.random.seed(42)

In [None]:
# Load the dataset and separate features, outcome and treatment indicator.
data = pd.read_csv('./data/-999_train.csv')  # read the raw training table
# Drop the first column by position (presumably a saved index column -- TODO confirm).
data = data.iloc[:, 1:]

X = data.drop(['y', 'treatment'], axis=1)  # feature columns
y = data['y']  # outcome column
treatment = data['treatment']  # treatment-group indicator
# Plain list of column names. The previous `X.head(0)` produced an empty
# DataFrame, which only behaves like a list of names when it is iterated.
feature_names = X.columns.tolist()

print('特征、标签、干预的大小为：', X.shape, y.shape, treatment.shape)
# Hold out 10% of the rows as a test split; the three objects are split with the
# same shuffled indices so rows stay aligned.
X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(
    X, y, treatment, test_size=0.1, random_state=42
)

In [None]:
############ Feature selection based on random-forest feature importance
from sklearn.ensemble import RandomForestClassifier

# Fit the forest exactly once, through the selector. Previously the forest was
# fitted directly and then a clone was fitted again by SelectFromModel.fit, so
# the printed importances came from a different model than the one that
# actually performed the selection.
selector = SelectFromModel(
    RandomForestClassifier(random_state=42),  # explicit seed: cell is reproducible on its own
    threshold=1e-02,  # importance threshold used for the selection
)
selector.fit(X_train, y_train)

# The fitted forest that the selector uses for its decisions.
rf = selector.estimator_
importance = rf.feature_importances_
print(importance)

# Reduce the full feature table (train and test rows) to the selected columns.
X_selected = selector.transform(X)

# Names of the columns kept by the selector.
selected_features = X.columns[selector.get_support()]
print("选择后的特征矩阵:")
print(X_selected)
print("选择后的特征列名:")
print(selected_features)


In [None]:
########## Feature importance from the R-learner itself
## The base learners are ensembles of XGBoost regressors
def make_models(nums,max_depth_list,n_estimators_list,learning_rate_list,min_child_weight,spw ):
    """Build ``nums`` named XGBoost regressors to use as ensemble estimators.

    Despite the ``_list`` suffixes, every hyperparameter is forwarded to
    ``xgb.XGBRegressor`` as a single value, so all returned models share the
    same configuration and differ only by name.

    Args:
        nums: number of (name, model) pairs to create.
        max_depth_list: value forwarded as ``max_depth``.
        n_estimators_list: value forwarded as ``n_estimators``.
        learning_rate_list: value forwarded as ``learning_rate`` (previously
            ignored in favour of a hard-coded 0.06).
        min_child_weight: value forwarded as ``min_child_weight``.
        spw: value forwarded as ``scale_pos_weight``.

    Returns:
        A list of ``(name, estimator)`` tuples with names ``"0"`` .. ``str(nums - 1)``.
    """
    return [
        (
            str(i),
            xgb.XGBRegressor(
                max_depth=max_depth_list,
                learning_rate=learning_rate_list,
                gamma=0,
                min_child_weight=min_child_weight,
                reg_alpha=0,
                n_estimators=n_estimators_list,
                scale_pos_weight=spw,
            ),
        )
        for i in range(nums)
    ]

# Two hyperparameter configurations, each replicated three times.
# NOTE(review): the second/third positional arguments are max_depth and
# n_estimators, so these build trees with max_depth=200 and 260 -- confirm the
# two values were not meant to be swapped.
# NOTE(review): the three models inside each stack are configured identically;
# verify that stacking them adds anything over a single model.
model1 = make_models(3, 200, 200, 0.06, spw=0.8, min_child_weight=0.5)
model2 = make_models(3, 260, 125, 0.06, spw=0, min_child_weight=4)
stacking_model1 = StackingRegressor(estimators=model1, cv=5, n_jobs=5)
stacking_model2 = StackingRegressor(estimators=model2, cv=5, n_jobs=5)

# R-learner with separate outcome and effect models.
rlearner = BaseRLearner(
    learner=None,
    outcome_learner=stacking_model1,
    effect_learner=stacking_model2,
    # Significance level for ATE confidence intervals; it must lie in (0, 1).
    # The previous value of 3 is not a valid level.
    ate_alpha=0.05,
    n_fold=5,
    random_state=42,
)




In [None]:
# Train the R-learner on the training split (features, outcome, treatment indicator).
rlearner.fit(
    X=X_train,
    y=y_train,
    treatment=t_train,
)

In [None]:
# Treatment-effect predictions on the training split; later cells reuse them.
r_tau = rlearner.predict(X=X_train)
# Model handed to get_importance as `model_tau_feature`; seeded so that
# re-running this cell gives the same importances.
model_tau_feature = RandomForestRegressor(random_state=42)
importance = rlearner.get_importance(
    X=X_train,
    tau=r_tau,
    model_tau_feature=model_tau_feature,
    # Label the result with the real column names instead of generic placeholders.
    features=list(X_train.columns),
    normalize=True,
    method='auto',
    random_state=42,
)
# Allow long importance tables to be printed without truncation.
pd.set_option('display.max_rows', 200)
print(importance)

In [None]:
# Plot feature importances for the predicted effects. No `model_tau_feature` is
# passed here, so causalml falls back to its default model (LGBMRegressor
# according to the original note -- not the RandomForestRegressor used for the
# printed table), so the plot and the table may rank features differently.
rlearner.plot_importance(
    X=X_train,
    tau=r_tau,
    features=list(X_train.columns),  # show real column names on the axis
    normalize=True,
    method='auto',
)

In [None]:
########### Feature importance from SHAP values
shap_feature_names = list(X_train.columns)
shap_rlearner = rlearner.get_shap_values(
    X=X_train,
    tau=r_tau,
    features=shap_feature_names,
)

# Draw the SHAP summary plot with the column names, reusing the values computed
# above instead of letting plot_shap_values compute them a second time.
# NOTE(review): max_display=161 is presumably the number of feature columns -- confirm.
rlearner.plot_shap_values(
    X=X_train,
    tau=r_tau,
    features=shap_feature_names,
    shap_dict=shap_rlearner,
    max_display=161,
)



In [None]:
# SHAP dependence plot for the feature at position 1 in treatment group 1.
# interaction_idx set to 'auto' (searches for feature with greatest approximate interaction)
rlearner.plot_shap_dependence(
    treatment_group=1,
    feature_idx=1,
    X=X_train,
    tau=r_tau,
    features=list(X_train.columns),  # label axes with real column names
    # Reuse SHAP values from the earlier cell when they exist; otherwise (None)
    # they are computed here, exactly as before.
    shap_dict=globals().get('shap_rlearner'),
    interaction_idx='auto',
)

In [None]:
########### Feature importance from causalml's filter methods
from causalml.metrics import *
from causalml.feature_selection.filters import FilterSelect
filter_method = FilterSelect()
# Reload the raw table so this cell does not depend on earlier cells.
data = pd.read_csv('./data/-999_train.csv')  # read the raw training table
# Drop the first column by position (presumably a saved index column -- TODO confirm).
data = data.iloc[:, 1:]

X = data.drop(['y', 'treatment'], axis=1)  # feature columns
y = data['y']  # outcome column
treatment = data['treatment']  # treatment-group indicator
# Plain list of column names. The previous `X.head(0)` produced an empty
# DataFrame, which only behaves like a list of names when it is iterated.
feature_names = X.columns.tolist()
# 'ED' filter computed on 5 bins per feature.
# NOTE(review): an earlier comment said "F Filter with order 1", but the code
# runs method 'ED'; `order` is presumably only used by the F filter -- verify.
method = 'ED'
f_imp = filter_method.get_importance(
    data,
    feature_names,
    'y',
    method,
    experiment_group_column="treatment",
    control_group=0,
    treatment_group=1,
    n_bins=5,
    order=2,
)

# First ten rows of the column at position 1 of the result table
# (presumably the feature-name column -- verify against f_imp.columns).
print(f_imp.iloc[0:10, 1])

# NOTE(review): despite its name, this holds the same position-1 column as above.
linear_importance = f_imp.iloc[:, 1]