# SHAP Values for XGBoost to Analyze the Treatment Effect Between Genders

In [1]:
# Show the kernel's current working directory before it is changed in the next cell.
%pwd

'/home/GPU/ML_GallupWellBeingResearch/Code'

In [2]:
# Move up one directory: the relative "Data/..." and "Results/..." paths used
# later in this notebook are resolved against the resulting working directory.
%cd ..

/home/GPU/ML_GallupWellBeingResearch


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Packages

In [3]:
import matplotlib.pyplot as plt
from joblib import dump, load
import os 
import pandas as pd
#import shap
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [4]:
# Display the installed XGBoost version.
xgb.__version__

'2.0.3'

In [5]:
from cuml.explainer import KernelExplainer

In [6]:
import cudf

In [7]:
import warnings

# From here on, suppress every warning whose category is UserWarning (or a
# subclass of it). Other categories such as FutureWarning are still shown.
warnings.filterwarnings('ignore', category=UserWarning)

## SHAP 

### Load data and hyperparameters

In [8]:
# Keyword arguments that the SHAP loop forwards to xgb.XGBRegressor via
# **hyper_dict (objective, tree_method, device and seed are set there).
hyper_dict = dict(
    gamma=0.01230893738822448,
    learning_rate=0.03929984746983189,
    max_delta_step=0.954107364480465,
    max_depth=29,
    min_child_weight=10.0,
    n_estimators=5000,
    reg_alpha=2.8833022521389937,
    reg_lambda=0.0016481010416658545,
    subsample=0.6431757974530777,
)

In [31]:
# Location of the input parquet file, relative to the current working directory.
data_dir = "Data"
parquet_name = "GallupWB_Ml64var1911k14waveCounOneHot_v1.parquet"
Df_Filename = os.path.join(data_dir, parquet_name)

In [32]:
# Read the parquet file at Df_Filename into a pandas DataFrame.
Df_countryOneHot = pd.read_parquet(Df_Filename)

In [33]:
# Show the first 69 column labels.
Df_countryOneHot.columns[:69]

Index(['MaleReal_y', 'TreatedAsMale', 'FemaleReal_y', 'TreatedAsFemale',
       'TreatmentEffectFemMal', 'wave', 'INCOME_2', 'Cantril_ladder',
       'Health_disable', 'Relative_have', 'Living_standard_change',
       'Enough_food', 'Enough_shelter', 'Well_rested', 'Respected', 'Smile',
       'Interesting_thing', 'Enjoyment', 'Physical_pain', 'Worry', 'Sadness',
       'Stress', 'Anger', 'City_satisficied', 'Economic_change',
       'Goodtime_job', 'Sat_pubtran', 'Sat_road', 'Sat_edu', 'Sat_qualityair',
       'Sat_qualitywater', 'Sat_healthcare', 'Sat_affhouse', 'Sat_oppofriend',
       'Good_minorities', 'Good_gayles', 'Good_immigrants', 'Donated',
       'Volunteer', 'Help_stranger', 'Voice_official', 'Local_police',
       'Safety_walk', 'Stolen', 'Assualted', 'Religion_importance',
       'Children_respected', 'Children_learn', 'Women_respected',
       'Sat_dealpoor', 'Sat_perserveenv', 'Freedom_chooselife',
       'Conf_military', 'Conf_judicial', 'Conf_government', 'Conf_finan

In [36]:
# Collect the labels of every bool-dtype column and print them. These are the
# columns converted to integers in the next cell.
boolean_columns = Df_countryOneHot.select_dtypes(include=['bool']).columns
print(boolean_columns)

Index(['Country_AFG', 'Country_AGO', 'Country_ALB', 'Country_ARE',
       'Country_ARG', 'Country_ARM', 'Country_AUS', 'Country_AUT',
       'Country_AZE', 'Country_BDI',
       ...
       'Country_VEN', 'Country_VNM', 'Country_XKX', 'Country_XNC',
       'Country_XNK', 'Country_XSR', 'Country_YEM', 'Country_ZAF',
       'Country_ZMB', 'Country_ZWE'],
      dtype='object', length=164)


In [37]:
# Cast the bool columns to integers (True -> 1, False -> 0). The result is
# written back into Df_countryOneHot, replacing the original bool columns.
Df_countryOneHot[boolean_columns] = Df_countryOneHot[boolean_columns].astype(int)

In [38]:
# Preview the first five rows after the bool -> int cast.
Df_countryOneHot.head()

Unnamed: 0,MaleReal_y,TreatedAsMale,FemaleReal_y,TreatedAsFemale,TreatmentEffectFemMal,wave,INCOME_2,Cantril_ladder,Health_disable,Relative_have,...,Country_VEN,Country_VNM,Country_XKX,Country_XNC,Country_XNK,Country_XSR,Country_YEM,Country_ZAF,Country_ZMB,Country_ZWE
1751065,7.0,4.768608,7.0,5.575863,0.807255,12,1520.266978,7.0,2.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1120113,4.0,3.683495,4.0,3.977802,0.294307,8,2617.098318,4.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1295575,4.0,5.647912,4.0,5.684599,0.036688,9,4074.798797,4.0,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2247597,5.0,2.145522,5.0,2.162362,0.01684,15,3090.23369,5.0,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2257066,7.0,7.555087,7.0,7.139609,-0.415478,15,26804.971377,7.0,2.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Regression target. In the rows previewed above it equals
# TreatedAsFemale - TreatedAsMale.
ytot = Df_countryOneHot['TreatmentEffectFemMal']

# Everything that is not dropped here becomes a model feature. The dropped
# columns are the target itself, the outcome/prediction columns displayed next
# to it in the preview, plus COUNTRY_ISO3 and Gender_female.
non_feature_cols = [
    'Cantril_ladder',
    'MaleReal_y',
    'TreatedAsMale',
    'FemaleReal_y',
    'TreatedAsFemale',
    'TreatmentEffectFemMal',
    'COUNTRY_ISO3',
    'Gender_female',
]
Xtot = Df_countryOneHot.drop(columns=non_feature_cols)

In [40]:
# Preview the first five rows of the feature matrix.
Xtot.head()

Unnamed: 0,wave,INCOME_2,Health_disable,Relative_have,Living_standard_change,Enough_food,Enough_shelter,Well_rested,Respected,Smile,...,Country_VEN,Country_VNM,Country_XKX,Country_XNC,Country_XNK,Country_XSR,Country_YEM,Country_ZAF,Country_ZMB,Country_ZWE
1751065,12,1520.266978,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1120113,8,2617.098318,1.0,1.0,-0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1295575,9,4074.798797,2.0,0.0,-1.0,0.0,0.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2247597,15,3090.23369,2.0,0.0,-1.0,1.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2257066,15,26804.971377,2.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### SHAP Computation

In [41]:
# Shuffled 10-fold splitter with a fixed seed. The SHAP loop below breaks after
# the first fold, so only the first train/test split is actually used.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [42]:
Shap_df = []

i = 1
for train_index, test_index in kf.split(Xtot):
    print(f"This is {i} fold")
    X_train, X_test = Xtot.iloc[train_index, :], Xtot.iloc[test_index, :]
    y_train, y_test = ytot.iloc[train_index], ytot.iloc[test_index]
    print(X_test.iloc[:100,:].shape)

    cX_train = cudf.from_pandas(X_train)
    cy_train = cudf.from_pandas(y_train)
    
    # Train the model
    model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', random_state=42, device = 'cuda', **hyper_dict)
    model.fit(cX_train, cy_train)
    
    print('model done')

    colnames = X_test.columns + '_shap'

    background = cudf.from_pandas(X_train.iloc[:100,:])
    cu_explainer = KernelExplainer(model=model.predict,
                                   data=background,
                                   is_gpu_model=True, random_state=42)
    
    interval = 1000
    for row in list(range(0, X_test.shape[0], interval)):
        print(f'here{row}:{row + interval}')
        %time cu_shap_value_merge = cu_explainer.shap_values(X_test.iloc[row:row + interval,:])
        X_test_shap = pd.DataFrame(cu_shap_value_merge, columns=colnames, index=X_test.index[row:row + interval])
        X_test_merge = pd.concat([y_test.iloc[row:row + interval], X_test.iloc[row:row + interval,:], X_test_shap], axis=1)

        X_test_merge.to_parquet(f'Results/ShapAll/X_test_{row}_{row + interval}.parquet')
        Shap_df.append(X_test_merge)

    dump(Shap_df, "Results/Shap_GenderTreamentDifference.joblib")
    break

This is 1 fold
(100, 225)


  feature_names = data.columns.format()


model done
here0:1000
CPU times: user 57min 18s, sys: 3min 53s, total: 1h 1min 12s
Wall time: 37min 46s
here1000:2000
CPU times: user 57min 13s, sys: 3min 52s, total: 1h 1min 6s
Wall time: 37min 46s
here2000:3000
CPU times: user 56min 59s, sys: 3min 53s, total: 1h 53s
Wall time: 37min 44s
here3000:4000
CPU times: user 56min 58s, sys: 3min 54s, total: 1h 52s
Wall time: 37min 41s
here4000:5000
CPU times: user 56min 31s, sys: 3min 54s, total: 1h 26s
Wall time: 37min 20s
here5000:6000
CPU times: user 56min 35s, sys: 3min 56s, total: 1h 31s
Wall time: 37min 20s
here6000:7000
CPU times: user 56min 27s, sys: 3min 55s, total: 1h 22s
Wall time: 37min 20s
here7000:8000
CPU times: user 56min 36s, sys: 3min 57s, total: 1h 33s
Wall time: 37min 20s
here8000:9000
CPU times: user 56min 35s, sys: 3min 56s, total: 1h 32s
Wall time: 37min 21s
here9000:10000
CPU times: user 57min 5s, sys: 3min 56s, total: 1h 1min 2s
Wall time: 37min 45s
here10000:11000
CPU times: user 57min 7s, sys: 3min 58s, total: 1h 1m

In [43]:
# Stack the per-chunk frames collected in Shap_df row-wise into one DataFrame.
Concat_Shap = pd.concat(Shap_df, axis=0)

In [44]:
# (rows, columns) of the combined frame.
Concat_Shap.shape

(191122, 451)

In [45]:
# Column layout: the target, then the input features, then one
# "<feature>_shap" column per feature.
Concat_Shap.columns

Index(['TreatmentEffectFemMal', 'wave', 'INCOME_2', 'Health_disable',
       'Relative_have', 'Living_standard_change', 'Enough_food',
       'Enough_shelter', 'Well_rested', 'Respected',
       ...
       'Country_VEN_shap', 'Country_VNM_shap', 'Country_XKX_shap',
       'Country_XNC_shap', 'Country_XNK_shap', 'Country_XSR_shap',
       'Country_YEM_shap', 'Country_ZAF_shap', 'Country_ZMB_shap',
       'Country_ZWE_shap'],
      dtype='object', length=451)

### Abs: Load data and hyperparameters

In [8]:
# Keyword arguments that the "Abs" SHAP loop forwards to xgb.XGBRegressor via
# **hyper_dict. Rebinding the name replaces any earlier hyper_dict.
hyper_dict = dict(
    gamma=0.001,
    learning_rate=0.05276773961416362,
    max_delta_step=0.0857797704073729,
    max_depth=31,
    min_child_weight=2.36778985445932,
    n_estimators=2824,
    reg_alpha=1.2645694578168394,
    reg_lambda=0.06727808895217463,
    subsample=0.9172598597711468,
)

In [31]:
# Location of the input parquet file, relative to the current working directory.
data_dir = "Data"
parquet_name = "GallupWB_Ml64var1911k14waveCounOneHot_v1.parquet"
Df_Filename = os.path.join(data_dir, parquet_name)

In [32]:
# Read the parquet file at Df_Filename into a pandas DataFrame.
Df_countryOneHot = pd.read_parquet(Df_Filename)

In [33]:
# Show the first 69 column labels.
Df_countryOneHot.columns[:69]

Index(['MaleReal_y', 'TreatedAsMale', 'FemaleReal_y', 'TreatedAsFemale',
       'TreatmentEffectFemMal', 'wave', 'INCOME_2', 'Cantril_ladder',
       'Health_disable', 'Relative_have', 'Living_standard_change',
       'Enough_food', 'Enough_shelter', 'Well_rested', 'Respected', 'Smile',
       'Interesting_thing', 'Enjoyment', 'Physical_pain', 'Worry', 'Sadness',
       'Stress', 'Anger', 'City_satisficied', 'Economic_change',
       'Goodtime_job', 'Sat_pubtran', 'Sat_road', 'Sat_edu', 'Sat_qualityair',
       'Sat_qualitywater', 'Sat_healthcare', 'Sat_affhouse', 'Sat_oppofriend',
       'Good_minorities', 'Good_gayles', 'Good_immigrants', 'Donated',
       'Volunteer', 'Help_stranger', 'Voice_official', 'Local_police',
       'Safety_walk', 'Stolen', 'Assualted', 'Religion_importance',
       'Children_respected', 'Children_learn', 'Women_respected',
       'Sat_dealpoor', 'Sat_perserveenv', 'Freedom_chooselife',
       'Conf_military', 'Conf_judicial', 'Conf_government', 'Conf_finan

In [36]:
# Collect the labels of every bool-dtype column and print them. These are the
# columns converted to integers in the next cell.
boolean_columns = Df_countryOneHot.select_dtypes(include=['bool']).columns
print(boolean_columns)

Index(['Country_AFG', 'Country_AGO', 'Country_ALB', 'Country_ARE',
       'Country_ARG', 'Country_ARM', 'Country_AUS', 'Country_AUT',
       'Country_AZE', 'Country_BDI',
       ...
       'Country_VEN', 'Country_VNM', 'Country_XKX', 'Country_XNC',
       'Country_XNK', 'Country_XSR', 'Country_YEM', 'Country_ZAF',
       'Country_ZMB', 'Country_ZWE'],
      dtype='object', length=164)


In [37]:
# Cast the bool columns to integers (True -> 1, False -> 0). The result is
# written back into Df_countryOneHot, replacing the original bool columns.
Df_countryOneHot[boolean_columns] = Df_countryOneHot[boolean_columns].astype(int)

In [38]:
# Preview the first five rows after the bool -> int cast.
Df_countryOneHot.head()

Unnamed: 0,MaleReal_y,TreatedAsMale,FemaleReal_y,TreatedAsFemale,TreatmentEffectFemMal,wave,INCOME_2,Cantril_ladder,Health_disable,Relative_have,...,Country_VEN,Country_VNM,Country_XKX,Country_XNC,Country_XNK,Country_XSR,Country_YEM,Country_ZAF,Country_ZMB,Country_ZWE
1751065,7.0,4.768608,7.0,5.575863,0.807255,12,1520.266978,7.0,2.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1120113,4.0,3.683495,4.0,3.977802,0.294307,8,2617.098318,4.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1295575,4.0,5.647912,4.0,5.684599,0.036688,9,4074.798797,4.0,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2247597,5.0,2.145522,5.0,2.162362,0.01684,15,3090.23369,5.0,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2257066,7.0,7.555087,7.0,7.139609,-0.415478,15,26804.971377,7.0,2.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# NOTE(review): this section is labelled "Abs" and its results are saved as
# Shap_GenderAbsTreamentDifference.joblib, yet the target selected here is the
# signed TreatmentEffectFemMal column, exactly as in the first section of this
# notebook. Confirm whether the absolute value (e.g. `.abs()`) was intended.
ytot = Df_countryOneHot['TreatmentEffectFemMal']
# Features: every column except the ones dropped here.
Xtot = Df_countryOneHot.drop(columns=['Cantril_ladder', 'MaleReal_y', 'TreatedAsMale', 'FemaleReal_y', 'TreatedAsFemale',
                                   'TreatmentEffectFemMal', 'COUNTRY_ISO3', 'Gender_female'])

In [40]:
# Preview the first five rows of the feature matrix.
Xtot.head()

Unnamed: 0,wave,INCOME_2,Health_disable,Relative_have,Living_standard_change,Enough_food,Enough_shelter,Well_rested,Respected,Smile,...,Country_VEN,Country_VNM,Country_XKX,Country_XNC,Country_XNK,Country_XSR,Country_YEM,Country_ZAF,Country_ZMB,Country_ZWE
1751065,12,1520.266978,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1120113,8,2617.098318,1.0,1.0,-0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1295575,9,4074.798797,2.0,0.0,-1.0,0.0,0.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2247597,15,3090.23369,2.0,0.0,-1.0,1.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2257066,15,26804.971377,2.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### Abs SHAP Computation

In [41]:
# Shuffled 10-fold splitter with a fixed seed. The SHAP loop below breaks after
# the first fold, so only the first train/test split is actually used.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [42]:
Shap_df = []

i = 1
for train_index, test_index in kf.split(Xtot):
    print(f"This is {i} fold")
    X_train, X_test = Xtot.iloc[train_index, :], Xtot.iloc[test_index, :]
    y_train, y_test = ytot.iloc[train_index], ytot.iloc[test_index]
    print(X_test.iloc[:100,:].shape)

    cX_train = cudf.from_pandas(X_train)
    cy_train = cudf.from_pandas(y_train)
    
    # Train the model
    model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', random_state=42, device = 'cuda', **hyper_dict)
    model.fit(cX_train, cy_train)
    
    print('model done')

    colnames = X_test.columns + '_shap'

    background = cudf.from_pandas(X_train.iloc[:100,:])
    cu_explainer = KernelExplainer(model=model.predict,
                                   data=background,
                                   is_gpu_model=True, random_state=42)
    
    interval = 1000
    for row in list(range(0, X_test.shape[0], interval)):
        print(f'here{row}:{row + interval}')
        %time cu_shap_value_merge = cu_explainer.shap_values(X_test.iloc[row:row + interval,:])
        X_test_shap = pd.DataFrame(cu_shap_value_merge, columns=colnames, index=X_test.index[row:row + interval])
        X_test_merge = pd.concat([y_test.iloc[row:row + interval], X_test.iloc[row:row + interval,:], X_test_shap], axis=1)

        X_test_merge.to_parquet(f'Results/ShapAll/X_test_{row}_{row + interval}.parquet')
        Shap_df.append(X_test_merge)

    dump(Shap_df, "Results/Shap_GenderAbsTreamentDifference.joblib")
    break

This is 1 fold
(100, 225)


  feature_names = data.columns.format()


model done
here0:1000
CPU times: user 57min 18s, sys: 3min 53s, total: 1h 1min 12s
Wall time: 37min 46s
here1000:2000
CPU times: user 57min 13s, sys: 3min 52s, total: 1h 1min 6s
Wall time: 37min 46s
here2000:3000
CPU times: user 56min 59s, sys: 3min 53s, total: 1h 53s
Wall time: 37min 44s
here3000:4000
CPU times: user 56min 58s, sys: 3min 54s, total: 1h 52s
Wall time: 37min 41s
here4000:5000
CPU times: user 56min 31s, sys: 3min 54s, total: 1h 26s
Wall time: 37min 20s
here5000:6000
CPU times: user 56min 35s, sys: 3min 56s, total: 1h 31s
Wall time: 37min 20s
here6000:7000
CPU times: user 56min 27s, sys: 3min 55s, total: 1h 22s
Wall time: 37min 20s
here7000:8000
CPU times: user 56min 36s, sys: 3min 57s, total: 1h 33s
Wall time: 37min 20s
here8000:9000
CPU times: user 56min 35s, sys: 3min 56s, total: 1h 32s
Wall time: 37min 21s
here9000:10000
CPU times: user 57min 5s, sys: 3min 56s, total: 1h 1min 2s
Wall time: 37min 45s
here10000:11000
CPU times: user 57min 7s, sys: 3min 58s, total: 1h 1m

In [43]:
# Stack the per-chunk frames collected in Shap_df row-wise into one DataFrame.
Concat_Shap = pd.concat(Shap_df, axis=0)

In [44]:
# (rows, columns) of the combined frame.
Concat_Shap.shape

(191122, 451)

In [45]:
# Column layout: the target, then the input features, then one
# "<feature>_shap" column per feature.
Concat_Shap.columns

Index(['TreatmentEffectFemMal', 'wave', 'INCOME_2', 'Health_disable',
       'Relative_have', 'Living_standard_change', 'Enough_food',
       'Enough_shelter', 'Well_rested', 'Respected',
       ...
       'Country_VEN_shap', 'Country_VNM_shap', 'Country_XKX_shap',
       'Country_XNC_shap', 'Country_XNK_shap', 'Country_XSR_shap',
       'Country_YEM_shap', 'Country_ZAF_shap', 'Country_ZMB_shap',
       'Country_ZWE_shap'],
      dtype='object', length=451)