In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
from PIL import Image
from scipy.interpolate import BSpline, make_interp_spline, interp1d
#import rpy2.robjects as robjects
#from rpy2.robjects.packages import importr
import csv
from dfply import *
from xgboost import XGBClassifier
import itertools
import os
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import time
import pickle
from glob import glob

import plot_utils
import utils_function

In [None]:
def _collect_shap_raw_frames(configs_variables, kind, year):
    """Read the per-(model site, data site) ``shapdataraw<kind>`` parquet files.

    Parameters
    ----------
    configs_variables : list of dict
        One configuration per site. Each entry is read for the keys
        'datafolder', 'stg', 'fs', 'oversample', 'model_type' and
        'drop_correlation_catboost', and is passed to
        ``utils_function.get_commons`` to obtain the site identifier.
    kind : str
        File-name infix, e.g. '1d' or '2d'.
    year : int or str
        Year token embedded in the file names.

    Returns
    -------
    pandas.DataFrame
        Concatenation of every file, each tagged with 'site_m' and 'site_d'.

    Raises
    ------
    Whatever ``pd.read_parquet`` raises for a missing or unreadable file;
    nothing is swallowed here.
    """
    frames = []
    for configs_variable_m in configs_variables:
        # Everything that decides the file name comes from the *model* config;
        # only the data-site identifier comes from the inner loop.
        # get_commons is used the same way collect_collectSHAPraw_cross_sub_pre
        # uses it: its first returned element is taken as the site id.
        site_m = utils_function.get_commons(configs_variable_m)[0]
        datafolder = configs_variable_m['datafolder']
        stg = configs_variable_m['stg']
        fs = configs_variable_m['fs']
        oversample = configs_variable_m['oversample']
        model_type = configs_variable_m['model_type']
        suffix = 'nc' if configs_variable_m['drop_correlation_catboost'] else ''

        for configs_variable_d in configs_variables:
            site_d = utils_function.get_commons(configs_variable_d)[0]
            tmpdf = pd.read_parquet(
                datafolder+site_m+'/shapdataraw'+kind+'_'+model_type+'_'+site_m+'_'+site_d
                +'_'+str(year)+'_'+stg+'_'+fs+'_'+oversample+suffix+'.parquet'
            )
            # Tag rows with the actual site identifiers (not the literal
            # strings 'site_m' / 'site_d').
            tmpdf['site_m'] = site_m
            tmpdf['site_d'] = site_d
            frames.append(tmpdf)

    return pd.concat(frames)


def collect_collectSHAPraw_cross_sub(configs_variables, year=3000):
    """Merge cross-site 1d and 2d raw SHAP parquet files into two combined files.

    For every ordered pair (model site, data site) drawn from
    ``configs_variables`` the matching ``shapdataraw1d_*`` and
    ``shapdataraw2d_*`` parquet files are read, tagged with 'site_m' and
    'site_d', concatenated, and written to ``<datafolder>/shapdataraw1d.parquet``
    and ``<datafolder>/shapdataraw2d.parquet``.

    Parameters
    ----------
    configs_variables : list of dict
        One configuration per site (see ``_collect_shap_raw_frames`` for the
        keys that are read). 'rerun_flag' is read from the first entry.
    year : int or str, default 3000
        Year token embedded in the input file names. The default mirrors the
        value hard-coded in ``collect_collectSHAPraw_cross_sub_pre``.
        NOTE(review): the previous version referenced an undefined ``year``;
        confirm 3000 is the intended token for these files.

    Returns
    -------
    None
    """
    if not configs_variables:
        print('No configurations supplied; nothing to collect.')
        return

    # Output folder: the datafolder of the last configuration, which is where
    # the previous implementation ended up writing after its loops.
    datafolder = configs_variables[-1]['datafolder']

    # NOTE(review): the guard checks 'shapalltmp.parquet', which is not one of
    # the two files written below — kept as-is, confirm this is intended.
    if not configs_variables[0]['rerun_flag'] and os.path.exists(datafolder+'/shapalltmp.parquet'):
        print('Existed: shapalltmp.parquet')
        return

    for kind in ('1d', '2d'):
        shap_data_raws = _collect_shap_raw_frames(configs_variables, kind, year)
        shap_data_raws.to_parquet(datafolder+'/shapdataraw'+kind+'.parquet')

In [None]:
def collect_collectSHAPraw_cross_sub_pre(configs_variables, top0=30):
    """Collect SHAP values and feature values of the top features across site pairs.

    Steps:
      1. Take the ``top0`` highest-'rank' rows per site from the importance
         table returned by ``plot_utils.get_importances_features_stat`` and
         keep the ``top0`` 'Feature Id' values that appear for the most sites.
      2. For every (model site, data site) pair, read the ``shapdatarawX_*``
         (feature values) and ``shapdataraw_*`` (SHAP values) parquet files,
         restricted to those top features that are columns of the model
         site's ``X_train`` pickle.
      3. Suffix the columns with '_Names' / '_vals', join the two frames side
         by side, tag with 'site_m' / 'site_d', and write the concatenation of
         all pairs to ``<datafolder>/shapalltmp.parquet``.

    Parameters
    ----------
    configs_variables : list of dict
        One configuration per site. Keys read here: 'datafolder', 'stg', 'fs',
        'oversample', 'model_type', 'drop_correlation_catboost'; 'rerun_flag'
        is read from the first entry.
    top0 : int, default 30
        Number of features kept per site and overall.

    Returns
    -------
    None
        The result is written to disk. Nothing is written when the cached
        output is reused or when no pair could be loaded.
    """
    if not configs_variables:
        print('No configurations supplied; nothing to collect.')
        return

    # Output goes to the datafolder of the last configuration, matching where
    # the previous implementation wrote after its loops.
    out_path = configs_variables[-1]['datafolder']+'/shapalltmp.parquet'

    # Reuse the cached output unless a rerun was requested. Checked before any
    # expensive work so the guard actually skips it.
    if not configs_variables[0]['rerun_flag'] and os.path.exists(out_path):
        print('Existed: shapalltmp.parquet')
        return

    # get top features
    df_importances, _ = plot_utils.get_importances_features_stat(configs_variables)
    df = df_importances.sort_values('rank', ascending=False).reset_index().groupby('site').head(top0)
    top3030 = df[['site', 'Feature Id']].groupby('Feature Id').count().sort_values('site', ascending=False).head(top0)
    top_features = set(top3030.index)

    year = 3000  # year token embedded in every file name read below
    shap_finals = []

    for configs_variable_m in configs_variables:
        site_m = utils_function.get_commons(configs_variable_m)[0]
        datafolder = configs_variable_m['datafolder']
        stg = configs_variable_m['stg']
        fs = configs_variable_m['fs']
        oversample = configs_variable_m['oversample']
        model_type = configs_variable_m['model_type']
        suffix = 'nc' if configs_variable_m['drop_correlation_catboost'] else ''

        # The training columns depend only on the model site, so load them
        # once per model site rather than once per pair.
        # NOTE: read_pickle executes arbitrary code from the file; only use on
        # files produced by this project.
        try:
            columc_df = pd.read_pickle(datafolder+site_m+'/X_train_'+site_m+'_'+str(year)+'_'+stg+'_'+fs+'_'+oversample+suffix+'.pkl')
        except Exception as exc:
            print('Skipping model site '+site_m+': cannot read X_train ('+repr(exc)+')', flush = True)
            continue
        # Keep the training-column order so the output column order is stable
        # (a set intersection would give an arbitrary order).
        feature_exists = [col for col in columc_df.columns if col in top_features]

        for configs_variable_d in configs_variables:
            site_d = utils_function.get_commons(configs_variable_d)[0]
            print('Running collectSHAPraw_cross_sub '+model_type+' on site '+site_m+'/'+site_d+":"+str(year)+":"+stg+":"+fs+":"+oversample, flush = True)

            pair_tag = model_type+'_'+site_m+'_'+site_d+'_'+str(year)+'_'+stg+'_'+fs+'_'+oversample+suffix
            try:
                shapX = pd.read_parquet(datafolder+site_m+'/shapdatarawX_'+pair_tag+'.parquet', columns=feature_exists)
                shap = pd.read_parquet(datafolder+site_m+'/shapdataraw_'+pair_tag+'.parquet', columns=feature_exists)

                # Same column order in both frames, then disambiguate names.
                shapX = shapX[feature_exists].add_suffix('_Names')
                shap = shap[feature_exists].add_suffix('_vals')
                shap_final = pd.concat([shapX, shap], axis=1)

                shap_final['site_m'] = site_m
                shap_final['site_d'] = site_d
            except Exception as exc:
                # Best-effort collection: a pair that cannot be loaded is
                # skipped, but the reason is reported instead of hidden.
                print('Skipping '+site_m+'/'+site_d+': '+repr(exc), flush = True)
                continue

            shap_finals.append(shap_final)

    if not shap_finals:
        # pd.concat raises ValueError on an empty list.
        print('No SHAP data could be collected; shapalltmp.parquet not written.')
        return

    shap_finalX = pd.concat(shap_finals)
    shap_finalX.to_parquet(out_path)