# SHAP Values for XGBoost: Analyzing the Treatment Effect Between Genders

In [9]:
%pwd

'/home/GPU/GallupWellBeingResearch'

In [2]:
%cd ..

/home/GPU/GallupWellBeingResearch


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Package

In [3]:
import matplotlib.pyplot as plt
from joblib import dump, load
import os 
import pandas as pd
#import shap
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [4]:
xgb.__version__

'2.0.3'

In [5]:
from cuml.explainer import KernelExplainer

In [6]:
import cudf

In [7]:
import warnings

# Silence all warnings of category UserWarning (and its subclasses) from here on in this kernel.
warnings.filterwarnings('ignore', category=UserWarning)

## SHAP 

### Load data and hyperparameters

In [8]:
# Fixed XGBoost settings; the training cell unpacks them with
# xgb.XGBRegressor(..., **hyper_dict).
# NOTE(review): the values look like the result of a tuning run — provenance is
# not recorded in this notebook, confirm before reusing.
hyper_dict = dict(
    gamma=0.01230893738822448,
    learning_rate=0.03929984746983189,
    max_delta_step=0.954107364480465,
    max_depth=29,
    min_child_weight=10.0,
    n_estimators=5000,
    reg_alpha=2.8833022521389937,
    reg_lambda=0.0016481010416658545,
    subsample=0.6431757974530777,
)

In [9]:
# Path of the input parquet file, relative to the current working directory.
Df_Filename = os.path.join(
    "Data",
    "GallupWB_Ml64var1911k14waveCounOneHot_v1.parquet",
)

In [10]:
# Read the whole parquet file at Df_Filename into a pandas DataFrame.
Df_countryOneHot = pd.read_parquet(Df_Filename)

In [11]:
# Target series: the 'TreatmentEffectFemMal' column.
target_column = 'TreatmentEffectFemMal'

# Columns removed from the feature matrix: the target itself plus the columns
# listed here. NOTE(review): these look like outcome / counterfactual columns and
# identifiers — confirm against the data dictionary.
excluded_columns = [
    'Cantril_ladder',
    'MaleReal_y',
    'TreatedAsMale',
    'FemaleReal_y',
    'TreatedAsFemale',
    target_column,
    'COUNTRY_ISO3',
    'Gender_female',
]

y = Df_countryOneHot[target_column]
X = Df_countryOneHot.drop(columns=excluded_columns)

### Shap Computation

In [16]:
# 10-fold splitter with shuffling; the fixed random_state keeps the fold assignment repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [20]:
Shap_df = []

i = 1
for train_index, test_index in kf.split(Xtot):
    print(f"This is {i} fold")
    X_train, X_test = Xtot.iloc[train_index, :], Xtot.iloc[test_index, :]
    y_train, y_test = ytot.iloc[train_index], ytot.iloc[test_index]
    print(X_test.iloc[:100,:].shape)

    cX_train = cudf.from_pandas(X_train)
    cX_test = cudf.from_pandas(X_test)
    cy_train = cudf.from_pandas(y_train)
    cy_test = cudf.from_pandas(y_test)
    
    # Train the model
    model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', random_state=42, device = 'cuda', **hyper_dict)
    model.fit(cX_train, cy_train)
    
    print('model done')

    colnames = X_test.columns + '_shap'

    background = cudf.from_pandas(X_train.iloc[:100,:])
    cu_explainer = KernelExplainer(model=model.predict,
                                   data=background,
                                   is_gpu_model=True, random_state=42)
    
    interval = 1000
    for row in list(range(0, X_test.shape[0], interval)):
        print(f'here{row}:{row + interval}')
        %time cu_shap_value_merge = cu_explainer.shap_values(X_test.iloc[row:row + interval,:])
        X_test_shap = pd.DataFrame(cu_shap_value_merge, columns=colnames, index=X_test.index[row:row + interval])
        X_test_merge = pd.concat([y_test.iloc[row:row + interval], X_test.iloc[row:row + interval,:], X_test_shap], axis=1)

        X_test_merge.to_parquet(f'Results/ShapAll/X_test_{row}_{row + interval}.parquet')
        Shap_df.append(X_test_merge)

    dump(Shap_df, "Results/Shap_GenderTreamentDifference.joblib")
    break

This is 1 fold
(100, 63)


  feature_names = data.columns.format()


model done
here0:1000
CPU times: user 23min 31s, sys: 13.3 s, total: 23min 44s
Wall time: 8min 28s
here1000:2000
CPU times: user 23min 31s, sys: 13.5 s, total: 23min 45s
Wall time: 8min 28s
here2000:3000
CPU times: user 23min 31s, sys: 13.2 s, total: 23min 44s
Wall time: 8min 28s
here3000:4000
CPU times: user 25min 45s, sys: 13.4 s, total: 25min 58s
Wall time: 9min 45s
here4000:5000
CPU times: user 26min 38s, sys: 13.5 s, total: 26min 51s
Wall time: 10min 16s
here5000:6000
CPU times: user 26min 40s, sys: 13.5 s, total: 26min 54s
Wall time: 10min 16s
here6000:7000
CPU times: user 26min 38s, sys: 13.7 s, total: 26min 51s
Wall time: 10min 15s
here7000:8000
CPU times: user 26min 40s, sys: 13.5 s, total: 26min 53s
Wall time: 10min 16s
here8000:9000
CPU times: user 26min 38s, sys: 13.6 s, total: 26min 52s
Wall time: 10min 15s
here9000:10000
CPU times: user 26min 35s, sys: 13.5 s, total: 26min 48s
Wall time: 10min 15s
here10000:11000
CPU times: user 26min 42s, sys: 13.4 s, total: 26min 55s
Wa

In [19]:
# Stack the per-chunk frames collected in Shap_df row-wise into one DataFrame.
Concat_Shap = pd.concat(Shap_df, axis=0)

In [20]:
Concat_Shap.shape

(1911212, 127)

In [22]:
Concat_Shap.columns

Index(['Cantril_ladder', 'wave', 'INCOME_2', 'Health_disable', 'Relative_have',
       'Living_standard_change', 'Enough_food', 'Enough_shelter',
       'Well_rested', 'Respected',
       ...
       'Corruption_government_shap', 'Performance_leadership_shap',
       'Gender_female_shap', 'Age_shap', 'Marital_status_shap',
       'Employment_shap', 'Children_under15_shap', 'Feeling_income_shap',
       'Income_level_shap', 'COUNTRY_ISO3_shap'],
      dtype='object', length=127)