# SHAP Values for XGB With All Age Data

In [1]:
%pwd

'/mnt/d/OneDrive - Kyushu University/ESG09_Article/Code'

In [2]:
%cd ..

/mnt/d/OneDrive - Kyushu University/ESG09_Article


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Package

In [3]:
from joblib import dump, load
import os 
import pandas as pd
import shap
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


## SHAP 

### load data and hyperparameter

In [4]:
class RandomRunNFoldsKFold(KFold):
    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        self.random_state = random_state
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        folds = list(super().split(X, y, groups))
        if self.random_state is not None:
            random.seed(self.random_state)
        selected_folds = random.sample(folds, self.actual_splits)
        for train_index, test_index in selected_folds:
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.actual_splits

In [5]:
bayes_search = load('Results/BayesSearchAgeTot20iter.joblib')

In [6]:
Df_Filename = os.path.join("Data", "GallupWB_Ml64var1911k14wave_v1.parquet")

In [7]:
Df = pd.read_parquet(Df_Filename)

In [8]:
#Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')
label_encoder = LabelEncoder()
Df['COUNTRY_ISO3'] = label_encoder.fit_transform(Df['COUNTRY_ISO3'])

In [9]:
Df['COUNTRY_ISO3'].describe()

count    1.911212e+06
mean     7.819511e+01
std      4.666500e+01
min      0.000000e+00
25%      3.800000e+01
50%      7.400000e+01
75%      1.180000e+02
max      1.630000e+02
Name: COUNTRY_ISO3, dtype: float64

In [10]:
Df = Df.sample(frac=1, random_state=42).reset_index(drop=True)

In [11]:
ytot = Df['Cantril_ladder']

In [12]:
Xtot = Df.drop(columns=['Cantril_ladder'])

### Shap Computation

In [13]:
reg_xgb = xgb.XGBRegressor(objective='reg:squarederror',  device = 'cuda', tree_method='hist', random_state=42, enable_categorical=True, **bayes_search.best_params_)

In [14]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [16]:
scores = []
for train_index, test_index in kf.split(Xtot):
    X_train, X_test = Xtot.iloc[train_index, :], Xtot.iloc[test_index, :]
    y_train, y_test = ytot.iloc[train_index], ytot.iloc[test_index]
    
    # Train the model
    model = xgb.XGBRegressor(objective='reg:squarederror',  device = 'cuda', tree_method='hist', random_state=42, enable_categorical=True, **bayes_search.best_params_)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(f"r2 is {r2}")
    scores.append(r2)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




r2 is 0.37790809379608614
r2 is 0.37698745006383627


KeyboardInterrupt: 

In [55]:
scores = cross_val_score(reg_xgb, Xtot, ytot, cv=kf, scoring='r2')

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




KeyboardInterrupt: 

In [None]:
scores

In [14]:
Shap_df = []

i = 1
for train_index, test_index in kf.split(Xtot):
    print(f"This is {i} fold")
    X_train, X_test = Xtot.iloc[train_index, :], Xtot.iloc[test_index, :]
    y_train, y_test = ytot.iloc[train_index], ytot.iloc[test_index]
    
    # Train the model
    model = xgb.XGBRegressor(objective='reg:squarederror',  device = 'cuda', tree_method='hist', random_state=42, enable_categorical=True, **bayes_search.best_params_)
    model.fit(X_train, y_train)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    colnames = X_test.columns + '_shap'
    X_test_shap= pd.DataFrame(shap_values, columns=colnames, index=X_test.index).head()
    X_test_merge = pd.concat([y_test, X_test, X_test_shap], axis=1)

    Shap_df.append(X_test_merge)

    i = i + 1

    dump(Shap_df, "Results/Shap_TotalDataset.joblib")



XGBoostError: [10:38:43] /workspace/src/tree/tree_model.cc:899: Check failed: !HasCategoricalSplit(): Please use JSON/UBJSON for saving models with categorical splits.
Stack trace:
  [bt] (0) /home/linux/anaconda3/envs/automl/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x64aa0a) [0x7fc8f8d60a0a]
  [bt] (1) /home/linux/anaconda3/envs/automl/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x64b70e) [0x7fc8f8d6170e]
  [bt] (2) /home/linux/anaconda3/envs/automl/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x48220e) [0x7fc8f8b9820e]
  [bt] (3) /home/linux/anaconda3/envs/automl/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x4bdaea) [0x7fc8f8bd3aea]
  [bt] (4) /home/linux/anaconda3/envs/automl/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(XGBoosterSaveModelToBuffer+0x490) [0x7fc8f887ae30]
  [bt] (5) /home/linux/anaconda3/envs/automl/lib/python3.9/lib-dynload/../../libffi.so.8(+0xa052) [0x7fca37aa5052]
  [bt] (6) /home/linux/anaconda3/envs/automl/lib/python3.9/lib-dynload/../../libffi.so.8(+0x8925) [0x7fca37aa3925]
  [bt] (7) /home/linux/anaconda3/envs/automl/lib/python3.9/lib-dynload/../../libffi.so.8(ffi_call+0xde) [0x7fca37aa406e]
  [bt] (8) /home/linux/anaconda3/envs/automl/lib/python3.9/lib-dynload/_ctypes.cpython-39-x86_64-linux-gnu.so(+0x91e0) [0x7fca37ab51e0]

