In [None]:
# === Data handling ===
import pickle
import numpy as np
import pandas as pd

# === Impute and scale ===
from sklearn.impute import SimpleImputer
from sklearn import preprocessing as pp

# === Dimensionality reduction ===
from sklearn.decomposition import PCA
import seaborn as sns

# === Explainable AI ===
import shap
from xgboost import DMatrix, train
import matplotlib.pyplot as plt

In [None]:
# === Set project ===
# Project tag used as the prefix of every file read from / written to temp_out/.
# This notebook has been pointed at 'LSI' and 'RS'; change the value here to
# switch project (a single assignment avoids a dead, immediately-overwritten store).
proj = 'RS'

# === Read Xy ===
# Feature table: the first CSV column is taken as the row index.
x_csv = f'temp_out/{proj}_X.csv'
# Target values: parsed as a comma-delimited numeric array.
y_csv = f'temp_out/{proj}_y.csv'

X_pd = pd.read_csv(x_csv, index_col=0)
X = X_pd.to_numpy()  # plain ndarray view of the features for sklearn/NumPy steps
y = np.genfromtxt(y_csv, delimiter=',')

In [None]:
# === Impute missing values ===
# Replace every NaN with the median of its column.
imp_med = SimpleImputer(missing_values=np.nan, strategy='median')
Xi = imp_med.fit_transform(X)


def _informative_columns(arr):
    """Return a boolean column mask: True where a column has no NaN and non-zero variance.

    Parameters
    ----------
    arr : 2-D numeric ndarray (rows = samples, columns = features).

    Returns
    -------
    1-D boolean ndarray with one entry per column of ``arr``.
    """
    has_nan = np.isnan(arr).any(axis=0)
    # np.var yields NaN for NaN-bearing columns; those are excluded by ~has_nan anyway.
    varies = np.var(arr, axis=0) != 0
    return ~has_nan & varies


# === Filter data and scale ===
# One mask drives both the DataFrame and the ndarray so that Xs_pd.columns always
# lines up with the columns of Xs (two independent filters could disagree).
keep_cols = _informative_columns(X)
Xs_pd = X_pd.loc[:, keep_cols]  # filtered only; Xs_pd is NOT min-max scaled
Xs = pp.MinMaxScaler().fit_transform(X[:, keep_cols])
sh1 = np.shape(Xs)
print(f'Shape | filtered/scaled: {sh1}')

# === Scaling imputed features ===
# The NaN part of the mask is normally a no-op after imputation; the variance
# part still removes constant columns before scaling.
Xi = Xi[:, _informative_columns(Xi)]
Xis = pp.MinMaxScaler().fit_transform(Xi)
sh2 = np.shape(Xis)
# Distinct label so the two shape lines can be told apart in the output.
print(f'Shape | imputed/filtered/scaled: {sh2}')

In [None]:
# === Heatmap of feature correlation ===
# Pairwise column correlations of the filtered/scaled matrix, drawn as a heatmap
# and written to an LZW-compressed TIFF at 300 dpi.
Xs_pd_hm = pd.DataFrame(Xs)
corr_Xs = Xs_pd_hm.corr()
hm1 = sns.heatmap(corr_Xs)
hm1.figure.savefig(
    f'temp_out/{proj}_Xs.tiff',
    dpi=300,
    pil_kwargs={"compression": "tiff_lzw"},
)

In [None]:
# === Imputed features ===
# Same correlation heatmap as for Xs, but computed on the imputed/scaled matrix;
# saved as an LZW-compressed TIFF at 300 dpi.
Xis_pd = pd.DataFrame(Xis)
corr_Xis = Xis_pd.corr()
hm1 = sns.heatmap(corr_Xis)
hm1.figure.savefig(
    f'temp_out/{proj}_Xis.tiff',
    dpi=300,
    pil_kwargs={"compression": "tiff_lzw"},
)

In [None]:
# === Conduct PCA and display updated heatmap ===
# Seed NumPy's global RNG so that later code drawing from np.random is repeatable.
np.random.seed(0)

# PCA cannot extract more components than min(n_samples, n_features), so the
# target of 100 is capped by BOTH dimensions of the input, not just the row count.
n_comp = min(100, *np.shape(Xs_pd))

# An explicit integer seed makes every fit of this estimator reproducible;
# np.random.seed() returns None, so passing its result would mean random_state=None.
pca = PCA(n_components=n_comp, random_state=0)

# NOTE(review): the fit uses Xs_pd (filtered but not min-max scaled), not Xs —
# confirm that running PCA on the unscaled features is intended.
Xsr = pca.fit_transform(Xs_pd)
Xsr_pd = pd.DataFrame(Xsr)

# Correlation heatmap of the component scores, saved as LZW-compressed TIFF at 300 dpi.
hm2 = sns.heatmap(Xsr_pd.corr())
hm2.figure.savefig('temp_out/'+proj+'_Xsr.tiff',dpi=300,pil_kwargs={"compression": "tiff_lzw"})

In [None]:
# === PCA for imputed features ===
# Re-fits the existing `pca` estimator on Xis_pd, so after this cell `pca`
# holds the fit for the imputed data rather than the earlier one.
Xisr = pca.fit_transform(Xis_pd)
Xisr_pd = pd.DataFrame(Xisr)

# Correlation heatmap of the component scores, saved as LZW-compressed TIFF at 300 dpi.
corr_Xisr = Xisr_pd.corr()
hm2 = sns.heatmap(corr_Xisr)
hm2.figure.savefig(
    f'temp_out/{proj}_Xisr.tiff',
    dpi=300,
    pil_kwargs={"compression": "tiff_lzw"},
)

In [None]:
# === Dump variables into pickle file ===
# The objects are stored as one list; a reader must unpack them in this order:
# raw X, target y, filtered/scaled Xs, imputed/scaled Xis, the two PCA score
# matrices (Xsr, Xisr) and the filtered feature DataFrame Xs_pd.
vars_path = f'temp_out/{proj}_vars.pkl'
saved_vars = [X, y, Xs, Xis, Xsr, Xisr, Xs_pd]
with open(vars_path, 'wb') as fh:
    pickle.dump(saved_vars, fh)

In [None]:
# === Explainable AI via SHAP ===
shap.initjs()  # load the JavaScript needed to render the interactive force plot

# Fit a single boosting round (third argument to train) on the filtered features.
xgb_params = {'eta':1, 'max_depth':3, 'base_score':0, 'lambda':0}
Xd = DMatrix(Xs_pd, label=y)
shap_model = train(xgb_params, Xd, 1)

# Raw margin predictions of the fitted model; not used again in the cells shown here.
shap_pred = shap_model.predict(Xd, output_margin=True)

# Tree explainer evaluated on the training matrix; keep the raw SHAP array and
# the feature names taken from the DataFrame columns.
explainer = shap.TreeExplainer(shap_model)
explanation = explainer(Xd)
shap_values = explanation.values
fnames = Xs_pd.columns.tolist()

# Summary plot limited to 5 features: drawn without showing (show=False) so it
# can be saved as an LZW-compressed TIFF at 300 dpi, then the figure is closed.
summary_path = f'temp_out/{proj}_shap_summary.tiff'
plt.figure()
shap.summary_plot(
    shap_values,
    Xs_pd,
    feature_names=fnames,
    max_display=5,
    show=False,
)
plt.savefig(summary_path, dpi=300, pil_kwargs={"compression": "tiff_lzw"})
plt.close()

# Left as the final expression so the notebook displays the interactive force plot.
shap.force_plot(explainer.expected_value, shap_values, Xs_pd)