In [1]:
import sklearn
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
import shap
import pandas as pd
import sys
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing, linear_model

sys.path.append('../LSP_Repo/')

import classifyFunctions


# Read the long-format lightsheet table from the pickle next to this notebook.
# (pd.read_pickle unpickles arbitrary objects, so only load files you trust.)
lightsheet_data = pd.read_pickle('lightsheet_data.pkl')

# Options handed to classifyFunctions.reformatData, gathered in one literal so
# the whole configuration can be read at a glance.
classifyDict = {
    'data': 'count',
    'label': 'drug',
    'model': 'LogRegL1',
    'model_featureSel': 'L1',
    'model_classify': 'L2',
    'shuffle': True,
    'includeSAL': True,
    'include6FDET': True,
    'remove_high_corr': True,
    'corrThreshold': 0.8,
    'gridCV': False,
}

# Only the third and fourth items of the returned 4-tuple are kept.
_, _, featureNames, numYDict = classifyFunctions.reformatData(lightsheet_data, classifyDict)

In [5]:
# Repeated K-fold cross-validated SHAP values for an L2 logistic-regression
# classifier fitted on the pivoted lightsheet counts.
#
# For every CV repeat and every fold the scaler and model are fitted on the
# training rows only, and SHAP values are computed for the held-out rows, so
# each sample ends up with one out-of-fold SHAP vector per repeat. Those
# vectors are then averaged per sample and shown in a summary plot.
import re

# Libraries for this section
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import shap

N_SPLITS = 5        # folds per CV repeat
CV_repeats = 10     # number of differently-seeded K-fold partitions
SHUFFLE_SEED = 1    # seed for the row shuffle below

# Load data: one row per dataset, one column per abbreviation, values = count.
lightsheet_data_classify = lightsheet_data.pivot(index='dataset', columns='abbreviation', values='count')
lightsheet_data_classify = np.round(lightsheet_data_classify)

# Shuffle the rows so that their order isn't informative. The shuffle is
# seeded; an unseeded one gives different folds (and results) on every run.
lightsheet_data_classify = lightsheet_data_classify.sample(frac=1, random_state=SHUFFLE_SEED)

# `df` is kept as an alias of the shuffled table for any code that still uses it.
df = lightsheet_data_classify
# Class label = dataset name with its trailing digits stripped.
y = pd.Series([re.sub(r'\d+$', '', string) for string in lightsheet_data_classify.index])
# Positional 0..n-1 index, so KFold's positional indices double as sample keys.
X = lightsheet_data_classify.reset_index(drop=True)

np.random.seed(1)  # Reproducibility
# One random integer in [0, 10000) per CV repeat, used as that repeat's KFold seed.
random_states = np.random.randint(10000, size=CV_repeats)

# shap_values_per_cv[sample][CV_repeat] -> 1-D array with one SHAP value per feature.
shap_values_per_cv = {sample: {} for sample in X.index}

for CV_repeat in range(CV_repeats):
    # Verbose
    print('\n------------ CV Repeat number:', CV_repeat)
    # A fresh partition for this repeat.
    CV = KFold(n_splits=N_SPLITS, shuffle=True, random_state=random_states[CV_repeat])

    ix_training, ix_test = [], []
    for train_ix, test_ix in CV.split(X):
        ix_training.append(train_ix)
        ix_test.append(test_ix)

    # Distinct loop-variable names at each level: the original reused `i` for
    # the repeat, the fold and the row, which made the indexing hard to follow.
    for fold_number, (train_outer_ix, test_outer_ix) in enumerate(zip(ix_training, ix_test)):
        # Verbose
        print('\n------ Fold Number:', fold_number)
        X_train, X_test = X.iloc[train_outer_ix, :], X.iloc[test_outer_ix, :]
        y_train, y_test = y.iloc[train_outer_ix], y.iloc[test_outer_ix]

        # Fit the scaler on the training rows only and apply it once to each
        # split. (Previously the pipeline was re-fitted on already-scaled data,
        # which scaled the features twice, and predictions used unscaled X_test.)
        scaler = preprocessing.RobustScaler().fit(X_train)
        X_train_trans = scaler.transform(X_train)
        X_test_trans = scaler.transform(X_test)

        # max_iter is a LogisticRegression solver option, so it is set here
        # rather than being passed to shap.Explainer.
        model = sklearn.linear_model.LogisticRegression(penalty="l2", C=0.1, solver="lbfgs", max_iter=1000)
        fit = model.fit(X_train_trans, y_train)
        yhat = fit.predict(X_test_trans)  # held-out predictions, kept for inspection

        # Background data = scaled training rows; explain the scaled TEST rows
        # so that row r of the result belongs to sample test_outer_ix[r].
        explainer = shap.Explainer(model, X_train_trans, feature_names=X.columns)
        raw_shap = explainer.shap_values(X_test_trans)

        # Multi-class models yield one SHAP matrix per class (a list, or a 3-D
        # array with the class axis last, depending on the SHAP version).
        # Collapse the class axis to mean |SHAP| so every sample has a single
        # per-feature vector for the aggregation below.
        # NOTE(review): this drops the sign; select one class instead if the
        # direction of the effect matters.
        if isinstance(raw_shap, list):
            raw_shap = np.stack(raw_shap, axis=-1)
        raw_shap = np.asarray(raw_shap)
        shap_values = np.abs(raw_shap).mean(axis=-1) if raw_shap.ndim == 3 else raw_shap

        # Store each held-out sample's SHAP vector under (sample, repeat).
        for row, test_index in enumerate(test_outer_ix):
            shap_values_per_cv[test_index][CV_repeat] = shap_values[row]

# Per-sample statistics across the CV repeats: mean, std and range (max - min).
average_shap_values, stds, ranges = [], [], []

for sample in X.index:
    # Rows = features, columns = CV repeats.
    df_per_obs = pd.DataFrame.from_dict(shap_values_per_cv[sample])
    average_shap_values.append(df_per_obs.mean(axis=1).values)
    stds.append(df_per_obs.std(axis=1).values)
    ranges.append(df_per_obs.max(axis=1).values - df_per_obs.min(axis=1).values)

# Test indices of the last repeat, flattened in fold order.
new_index = [ix for ix_test_fold in ix_test for ix in ix_test_fold]

shap.summary_plot(np.array(average_shap_values), X, show=False)
plt.title('Average SHAP values after 10x cross-validation')



------------ CV Repeat number: 0

------ Fold Number: 0


In [None]:
# Repeated K-fold cross-validated SHAP values for a random-forest classifier
# fitted on the pivoted lightsheet counts.
#
# The labels are strings (dataset names without their trailing digits), so the
# model is a classifier scored with accuracy; a regressor / RMSE cannot be
# fitted to string labels.
import re

# Libraries for this section
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import numpy as np
import shap

N_SPLITS = 5        # folds per CV repeat
CV_repeats = 10     # number of differently-seeded K-fold partitions

# Load data: one row per dataset, one column per abbreviation, values = count.
lightsheet_data_classify = reshaped_data = lightsheet_data.pivot(index='dataset', columns='abbreviation', values='count')
# Class label = dataset name with its trailing digits stripped. A Series (not
# a list) is needed because the folds below are selected with `.iloc`.
y = pd.Series([re.sub(r'\d+$', '', string) for string in lightsheet_data_classify.index])
# Positional 0..n-1 index, so KFold's positional indices double as sample keys.
X = lightsheet_data_classify.reset_index(drop=True)

# Show the derived labels (the original printed an undefined name here).
print(y.tolist())

np.random.seed(1)  # Reproducibility
# One random integer in [0, 10000) per CV repeat, used as that repeat's KFold seed.
random_states = np.random.randint(10000, size=CV_repeats)

# shap_values_per_cv[sample][CV_repeat] -> 1-D array with one SHAP value per feature.
shap_values_per_cv = {sample: {} for sample in X.index}

for CV_repeat in range(CV_repeats):
    # Verbose
    print('\n------------ CV Repeat number:', CV_repeat)
    # A fresh partition for this repeat. Splitting is done on X, which this
    # cell defines itself, rather than on a `df` left over from another cell.
    CV = KFold(n_splits=N_SPLITS, shuffle=True, random_state=random_states[CV_repeat])

    ix_training, ix_test = [], []
    for train_ix, test_ix in CV.split(X):
        ix_training.append(train_ix)
        ix_test.append(test_ix)

    for fold_number, (train_outer_ix, test_outer_ix) in enumerate(zip(ix_training, ix_test)):
        # Verbose
        print('\n------ Fold Number:', fold_number)
        X_train, X_test = X.iloc[train_outer_ix, :], X.iloc[test_outer_ix, :]
        y_train, y_test = y.iloc[train_outer_ix], y.iloc[test_outer_ix]

        model = RandomForestClassifier(random_state=10)  # Random state for reproducibility (same results every time)
        fit = model.fit(X_train, y_train)
        yhat = fit.predict(X_test)
        result = accuracy_score(y_test, yhat)
        print('Accuracy:', round(result, 4))

        # Use SHAP to explain the held-out predictions; row r of the result
        # belongs to sample test_outer_ix[r].
        explainer = shap.TreeExplainer(model)
        raw_shap = explainer.shap_values(X_test)

        # Classifiers yield one SHAP matrix per class (a list, or a 3-D array
        # with the class axis last, depending on the SHAP version). Collapse
        # the class axis to mean |SHAP| so every sample has a single
        # per-feature vector for the aggregation below.
        # NOTE(review): this drops the sign; select one class instead if the
        # direction of the effect matters.
        if isinstance(raw_shap, list):
            raw_shap = np.stack(raw_shap, axis=-1)
        raw_shap = np.asarray(raw_shap)
        shap_values = np.abs(raw_shap).mean(axis=-1) if raw_shap.ndim == 3 else raw_shap

        # Store each held-out sample's SHAP vector under (sample, repeat).
        for row, test_index in enumerate(test_outer_ix):
            shap_values_per_cv[test_index][CV_repeat] = shap_values[row]

# Per-sample statistics across the CV repeats: mean, std and range (max - min).
average_shap_values, stds, ranges = [], [], []

for sample in X.index:
    # Rows = features, columns = CV repeats.
    df_per_obs = pd.DataFrame.from_dict(shap_values_per_cv[sample])
    average_shap_values.append(df_per_obs.mean(axis=1).values)
    stds.append(df_per_obs.std(axis=1).values)
    ranges.append(df_per_obs.max(axis=1).values - df_per_obs.min(axis=1).values)

# Test indices of the last repeat, flattened in fold order.
new_index = [ix for ix_test_fold in ix_test for ix in ix_test_fold]

shap.summary_plot(np.array(average_shap_values), X, show=False)
plt.title('Average SHAP values after 10x cross-validation')


In [None]:
# Repeated K-fold cross-validated SHAP values for a random-forest classifier
# fitted on the pivoted lightsheet counts.
#
# NOTE(review): this cell has the same content as the random-forest cell
# before it; consider keeping only one of the two.
#
# The labels are strings (dataset names without their trailing digits), so the
# model is a classifier scored with accuracy; a regressor / RMSE cannot be
# fitted to string labels.
import re

# Libraries for this section
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import numpy as np
import shap

N_SPLITS = 5        # folds per CV repeat
CV_repeats = 10     # number of differently-seeded K-fold partitions

# Load data: one row per dataset, one column per abbreviation, values = count.
lightsheet_data_classify = reshaped_data = lightsheet_data.pivot(index='dataset', columns='abbreviation', values='count')
# Class label = dataset name with its trailing digits stripped. A Series (not
# a list) is needed because the folds below are selected with `.iloc`.
y = pd.Series([re.sub(r'\d+$', '', string) for string in lightsheet_data_classify.index])
# Positional 0..n-1 index, so KFold's positional indices double as sample keys.
X = lightsheet_data_classify.reset_index(drop=True)

# Show the derived labels (the original printed an undefined name here).
print(y.tolist())

np.random.seed(1)  # Reproducibility
# One random integer in [0, 10000) per CV repeat, used as that repeat's KFold seed.
random_states = np.random.randint(10000, size=CV_repeats)

# shap_values_per_cv[sample][CV_repeat] -> 1-D array with one SHAP value per feature.
shap_values_per_cv = {sample: {} for sample in X.index}

for CV_repeat in range(CV_repeats):
    # Verbose
    print('\n------------ CV Repeat number:', CV_repeat)
    # A fresh partition for this repeat. Splitting is done on X, which this
    # cell defines itself, rather than on a `df` left over from another cell.
    CV = KFold(n_splits=N_SPLITS, shuffle=True, random_state=random_states[CV_repeat])

    ix_training, ix_test = [], []
    for train_ix, test_ix in CV.split(X):
        ix_training.append(train_ix)
        ix_test.append(test_ix)

    for fold_number, (train_outer_ix, test_outer_ix) in enumerate(zip(ix_training, ix_test)):
        # Verbose
        print('\n------ Fold Number:', fold_number)
        X_train, X_test = X.iloc[train_outer_ix, :], X.iloc[test_outer_ix, :]
        y_train, y_test = y.iloc[train_outer_ix], y.iloc[test_outer_ix]

        model = RandomForestClassifier(random_state=10)  # Random state for reproducibility (same results every time)
        fit = model.fit(X_train, y_train)
        yhat = fit.predict(X_test)
        result = accuracy_score(y_test, yhat)
        print('Accuracy:', round(result, 4))

        # Use SHAP to explain the held-out predictions; row r of the result
        # belongs to sample test_outer_ix[r].
        explainer = shap.TreeExplainer(model)
        raw_shap = explainer.shap_values(X_test)

        # Classifiers yield one SHAP matrix per class (a list, or a 3-D array
        # with the class axis last, depending on the SHAP version). Collapse
        # the class axis to mean |SHAP| so every sample has a single
        # per-feature vector for the aggregation below.
        # NOTE(review): this drops the sign; select one class instead if the
        # direction of the effect matters.
        if isinstance(raw_shap, list):
            raw_shap = np.stack(raw_shap, axis=-1)
        raw_shap = np.asarray(raw_shap)
        shap_values = np.abs(raw_shap).mean(axis=-1) if raw_shap.ndim == 3 else raw_shap

        # Store each held-out sample's SHAP vector under (sample, repeat).
        for row, test_index in enumerate(test_outer_ix):
            shap_values_per_cv[test_index][CV_repeat] = shap_values[row]

# Per-sample statistics across the CV repeats: mean, std and range (max - min).
average_shap_values, stds, ranges = [], [], []

for sample in X.index:
    # Rows = features, columns = CV repeats.
    df_per_obs = pd.DataFrame.from_dict(shap_values_per_cv[sample])
    average_shap_values.append(df_per_obs.mean(axis=1).values)
    stds.append(df_per_obs.std(axis=1).values)
    ranges.append(df_per_obs.max(axis=1).values - df_per_obs.min(axis=1).values)

# Test indices of the last repeat, flattened in fold order.
new_index = [ix for ix_test_fold in ix_test for ix in ix_test_fold]

shap.summary_plot(np.array(average_shap_values), X, show=False)
plt.title('Average SHAP values after 10x cross-validation')
