In [None]:
import pandas as pd
import numpy as np

# Load the per-patient demographic table and parse its `date` column into
# datetimes so that `.year` can be read from it later (see calc_age).
demography_df = pd.read_csv("patient_demographic.csv").assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

In [None]:
# Random seed: passed to split_dats as the split random_state and also
# formatted into the file name of the saved model that is loaded further down.
seed = 5571

In [None]:
from sklearn.model_selection import train_test_split

def split_dats(df, seed):
    """Split a labelled table into train, early-stopping and test partitions.

    Both splits are stratified on the ``Label`` column and use ``seed`` as the
    ``random_state``. 20% of ``df`` is held out first as the test partition;
    an early-stopping partition with exactly as many rows as the test
    partition is then taken from what remains.

    Args:
        df: DataFrame that contains a ``Label`` column.
        seed: Random state forwarded to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_df, early_df, test_df)``; the ``Label`` column is
        removed from each returned frame.
    """
    remaining_df, test_df = train_test_split(
        df, test_size=0.2, random_state=seed, stratify=df['Label']
    )
    # An integer test_size requests an absolute number of rows, so the
    # early-stopping partition ends up the same size as the test partition.
    train_df, early_df = train_test_split(
        remaining_df,
        test_size=len(test_df),
        random_state=seed,
        stratify=remaining_df['Label'],
    )

    train_df, early_df, test_df = (
        part.drop(columns='Label') for part in (train_df, early_df, test_df)
    )
    return train_df, early_df, test_df

In [None]:
# Partition the demographic table; the three names are re-bound in the
# following cells to the merged feature tables.
train_df, early_df, test_df = split_dats(demography_df, seed)

In [None]:
def calc_age(row):
    """Return ``row['age']`` shifted by the calendar-year gap between two dates.

    Args:
        row: Mapping/Series with ``timestamp`` and ``date`` entries exposing a
            ``.year`` attribute, and a numeric ``age`` entry.
            (Presumably ``date`` is when ``age`` was recorded; verify against
            the demographic file.)

    Returns:
        ``timestamp.year - date.year + age``. Only the year components are
        compared; month and day are ignored.
    """
    years_elapsed = row['timestamp'].year - row['date'].year
    return years_elapsed + row['age']
    
# Daily feature table; `timestamp` is parsed so calc_age can read its year.
df = pd.read_csv("feature_selected.csv")
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Attach the demographics of the training patients to their feature rows and
# replace the stored age with the age at each row's timestamp.
train_df = pd.merge(df, train_df, on='patient_id')
train_df['age'] = train_df.apply(calc_age, axis=1)

# Copy age/sex into column positions 4 and 5 while the originals are still
# present (so the positions are resolved exactly as before), then drop the
# originals together with `date` and give the copies their final names.
# NOTE(review): make_feature reads feature columns as [2:-1] and the plot label
# list starts with 'Age', 'Sex'; with the copies at positions 4 and 5 they would
# be the 3rd/4th feature columns - confirm the label order against the real table.
train_df.insert(4, 'age2', train_df['age'])
train_df.insert(5, 'sex2', train_df['sex'])
train_df = (
    train_df
    .drop(columns=['age', 'sex', 'date'])
    .rename(columns={'age2': 'age', 'sex2': 'sex'})
)

In [None]:
# Same preparation for the early-stopping patients: merge their demographics
# onto the feature rows and recompute age at each row's timestamp.
early_df = pd.merge(df, early_df, on='patient_id')
early_df['age'] = early_df.apply(calc_age, axis=1)

# Copy age/sex into column positions 4 and 5 while the originals are still
# present, then drop the originals and `date` and rename the copies.
early_df.insert(4, 'age2', early_df['age'])
early_df.insert(5, 'sex2', early_df['sex'])
early_df = (
    early_df
    .drop(columns=['age', 'sex', 'date'])
    .rename(columns={'age2': 'age', 'sex2': 'sex'})
)

# Same preparation for the test patients: merge their demographics onto the
# feature rows and recompute age at each row's timestamp.
test_df = pd.merge(df, test_df, on='patient_id')
test_df['age'] = test_df.apply(calc_age, axis=1)

# Copy age/sex into column positions 4 and 5 while the originals are still
# present, then drop the originals and `date` and rename the copies.
test_df.insert(4, 'age2', test_df['age'])
test_df.insert(5, 'sex2', test_df['sex'])
test_df = (
    test_df
    .drop(columns=['age', 'sex', 'date'])
    .rename(columns={'age2': 'age', 'sex2': 'sex'})
)

In [None]:
def make_feature(df):
    """Build 7-row feature windows and their look-ahead targets, per patient.

    For every patient, each index ``i`` in ``range(7, n_rows - 7)`` yields one
    candidate sample:

    * features: rows ``i-7 .. i-1`` restricted to columns ``2:-1``
    * target:   the last column of row ``i + 6``

    A candidate is discarded when its target, or any cell of its feature
    window, is missing.

    NOTE(review): rows are used in the order they appear in ``df``; nothing
    here sorts them, so chronological order is presumably guaranteed by the
    caller - confirm.
    NOTE(review): with this upper bound the last row of a patient is never
    used as a target - confirm that is intended.

    Args:
        df: DataFrame with a ``patient_id`` column; the first two columns are
            skipped and the last column is read as the target.

    Returns:
        Tuple ``(features, targets)``: the stacked windows cast to float, and
        the targets reshaped to a single column.
    """
    windows = []
    labels = []

    # groupby visits the patient ids in sorted order, as np.unique would.
    for _, patient_rows in df.groupby('patient_id'):
        values = patient_rows.values

        for i in range(7, len(values) - 7):
            window = values[i - 7:i, 2:-1]
            label = values[i + 6, -1]

            if pd.isna(label) or pd.isna(window).any():
                continue

            windows.append(window)
            labels.append(label)

    return np.array(windows).astype(float), np.array(labels).reshape(-1, 1)

# Build the windowed feature/target arrays for each of the three partitions.
train_feature, train_target = make_feature(train_df)
earlystop_feature, earlystop_target = make_feature(early_df)
test_feature, test_target = make_feature(test_df)

In [None]:
# Record the training array's shape before the scaling cell flattens it;
# shape[1] and shape[2] are used later to restore test_feature's layout.
shape = train_feature.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler
import copy

# Collapse every window into one flat row per sample so the scaler sees a
# 2-D matrix.
train_feature, earlystop_feature, test_feature = (
    split.reshape(len(split), -1)
    for split in (train_feature, earlystop_feature, test_feature)
)

# The min/max ranges are learned from the training partition only and then
# applied unchanged to the other two partitions.
scaler = MinMaxScaler().fit(train_feature)
train_feature = scaler.transform(train_feature)
earlystop_feature = scaler.transform(earlystop_feature)
test_feature = scaler.transform(test_feature)

In [None]:
# Restore the scaled test rows to the window layout recorded in `shape`
# (taken from the training array) before they are handed to the SHAP explainer.
test_feature = test_feature.reshape(len(test_feature), shape[1], shape[2]).astype(float)

In [None]:
def make_multi_label(arr_target):
    """Collapse raw target codes into a single binary label column.

    Code ``0`` maps to ``0``; codes ``1``, ``2`` and ``4`` all map to ``1``.

    Args:
        arr_target: 2-D array-like; only the first element of each row is read.

    Returns:
        Integer ndarray of shape ``(len(arr_target), 1)``. An empty input
        yields an empty ``(0, 1)`` array.

    Raises:
        ValueError: if a row holds any other code (NaN included). Such rows
            used to contribute an empty entry, which produced a ragged list
            and either an opaque NumPy error or a mis-shaped object array.
    """
    positive_codes = (1, 2, 4)
    binary = []

    for position, row in enumerate(arr_target):
        code = row[0]
        if code == 0:
            binary.append(0)
        elif code in positive_codes:
            binary.append(1)
        else:
            raise ValueError(
                "unexpected target code {!r} at row {}; expected one of 0, 1, 2, 4".format(code, position)
            )

    # dtype is given explicitly so an empty input still yields an integer array.
    return np.array(binary, dtype=int).reshape(-1, 1)


# Binarise the raw test targets.
# NOTE(review): test_target is not read again in the cells visible here.
test_target = make_multi_label(test_target)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Masking, LSTM, Dropout, MultiHeadAttention, Flatten, Dense, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import Precision, Recall, PrecisionAtRecall
from tensorflow.keras.losses import BinaryFocalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# Restore the trained network that was saved for this seed.
model = tf.keras.models.load_model('LSTM_Feature_DLMO_CR_{}.h5'.format(seed))

# Re-compile with an exponentially decaying learning rate, focal loss and the
# precision/recall metrics, then print the layer summary.
model.compile(
    optimizer=Adam(
        learning_rate=ExponentialDecay(0.000001, decay_steps=1000, decay_rate=0.99)
    ),
    loss=BinaryFocalCrossentropy(),
    metrics=[Precision(), Recall(), PrecisionAtRecall(0.7)],
)
model.summary()

In [None]:
import shap
# The scaled test windows serve both as the explainer's background data and
# as the samples being explained.
# NOTE(review): a background drawn from the training data is the more common
# choice - confirm that using the test set here is intended.
explainer = shap.GradientExplainer(model, test_feature)
shap_values = explainer.shap_values(test_feature)

In [None]:
# Display labels for the per-day feature columns, in the order in which the
# features are assumed to appear in each window (44 labels).
# Fixed the misspelled 'DLMO (Medthd 2)' labels, which ended up in the figures.
# NOTE(review): the merge cells insert age/sex at column positions 4 and 5 and
# make_feature reads columns [2:-1], while this list names 'Age' and 'Sex'
# first - confirm that the label order matches the real column order.
columns = ['Age',
    'Sex',
    'Daytime HR Mean',
    'Daytime HR Q1',
    'Daytime HR Median',
    'Daytime HR Q3',
    'Evening HR Mean',
    'Evening HR Q1',
    'Evening HR Median',
    'Evening HR Q3',
    'Bedtime HR Mean',
    'Bedtime HR Median',
    'Bedtime HR Q3',
    'HRV 20th Percentile',
    'HRV Minimum NN',
    'Steps < 7000 (Daily)',
    'Steps 7000–8000 (Daily)',
    'Steps > 8000 (Daily)',
    'Steps Nan (Daily)',
    'Below Personal Step Avg',
    'Above Personal Step Avg',
    'Nan Personal Step Avg',
    'Main Sleep 0h–6h',
    'Main Sleep 6h–12h',
    'Main Sleep 12h–18h',
    'Main Sleep 18h–24h',
    'Main Sleep Time Nan',
    'Minutes Awake During Sleep',
    'DLMO (Method 1)',
    'DLMO (Method 1) Wakeup',
    'DLMO (Method 2)',
    'DLMO (Method 2) Wakeup',
    'Acrophase (48h Cosine fit) ',
    'MESOR (48h Cosine fit)',
    'Acrophase Advanced(48h Cosine fit)',
    'Acrophase Normal(48h Cosine fit)',
    'Acrophase Delayed(48h Cosine fit)',
    'Acrophase Nan(48h Cosine fit)',
    'Acrophase (168h Cosine fit) ',
    'MESOR (168h Cosine fit)',
    'Acrophase Advanced(168h Cosine fit)',
    'Acrophase Normal(168h Cosine fit)',
    'Acrophase Delayed(168h Cosine fit)',
    'Acrophase Nan(168h Cosine fit)']

In [None]:
# One label per flattened (day, feature) position, oldest day first:
# "6days ago <feature>", ..., "1day ago <feature>", "today <feature>".
shap_columns = []
for t in range(6, -1, -1):
    if t == 0:
        day_prefix = "today"
    elif t > 1:
        day_prefix = "{}days ago".format(t)
    else:
        day_prefix = "{}day ago".format(t)
    shap_columns.extend("{} {}".format(day_prefix, c) for c in columns)

# Depending on the installed shap release, shap_values() returns either one
# array or a list holding one array per model output. Unwrap the list form so
# the reshape below works in both cases (it used to fail on a list).
if isinstance(shap_values, list):
    if len(shap_values) != 1:
        raise ValueError(
            "expected SHAP values for a single model output, got {}".format(len(shap_values))
        )
    shap_values = shap_values[0]

# Flatten to one row per sample so the columns line up with shap_columns.
shap_values = np.asarray(shap_values)
shap_values = shap_values.reshape(len(shap_values), -1)
tmp_x = pd.DataFrame(test_feature.reshape(len(test_feature), -1), columns=shap_columns)

In [None]:
# Positions, within the flattened 7-day x feature layout, of the columns that
# are kept for plotting. `idx` advances over every (day, feature) slot;
# `age`, `sex` and any column ending in 'average_step_over_no' are left out.
utilsize_index = []
idx = 0
for t in range(6, -1, -1):
    for c in list(train_df.columns)[2:-1]:
        if not (c in ('age', 'sex') or c.endswith('average_step_over_no')):
            utilsize_index.append(idx)
        idx += 1

In [None]:
# Labels of the kept positions, in tmp_x's column order.
utilsize_columns = [
    label
    for position, label in enumerate(tmp_x.columns)
    if position in utilsize_index
]

In [None]:
import matplotlib.pyplot as plt

# SHAP summary plot over every kept (day, feature) column.
# show=False leaves the figure open so it can be laid out and saved before
# it is displayed.
shap.summary_plot(shap_values[:,utilsize_index], tmp_x[utilsize_columns], show=False)
plt.tight_layout()
plt.savefig('shap_importance.png', dpi=300)
plt.show()

In [None]:
import numpy as np

# Importance of each kept column: mean absolute SHAP value over all samples.
mean_abs_shap = np.mean(np.abs(shap_values[:, utilsize_index]), axis=0)

# Column positions (within the kept subset) ordered by descending importance.
sorted_indices = np.flip(np.argsort(mean_abs_shap))

In [None]:
# sorted_indices refers to positions inside the kept subset, so it indexes
# utilsize_columns and the already-subset SHAP matrix, not the full layout.
top_10_indices = sorted_indices[:10]
top_10_columns = [utilsize_columns[i] for i in top_10_indices]

# SHAP summary plot (top 1-10 features)
shap.summary_plot(
    shap_values[:, utilsize_index][:, top_10_indices],
    tmp_x[top_10_columns],
    feature_names=top_10_columns,
    show=False
)
plt.tight_layout()
plt.savefig("shap_top_1_10.png", dpi=300)
plt.show()

In [None]:
# Features ranked 11-20; as above, the indices refer to positions inside the
# kept subset.
top_11_20_indices = sorted_indices[10:20]
top_11_20_columns = [utilsize_columns[i] for i in top_11_20_indices]

# SHAP summary plot (features ranked 11-20)
shap.summary_plot(
    shap_values[:, utilsize_index][:, top_11_20_indices],
    tmp_x[top_11_20_columns],
    feature_names=top_11_20_columns,
    show=False
)
plt.tight_layout()
plt.savefig("shap_top_11_20.png", dpi=300)
plt.show()

In [None]:
# Map the scaled test features back to their original scale with the scaler
# fitted earlier, keeping tmp_x's column labels.
tmp_x_inverse = pd.DataFrame(scaler.inverse_transform(tmp_x), columns=tmp_x.columns)
# Bare expression: shows the frame as the cell output.
tmp_x_inverse

In [None]:
# Display labels for the per-day feature columns, used as heatmap row labels
# below (44 labels). Fixed the misspelled 'DLMO (Medthd 2)' labels.
# The spellings 'Above PersonalStep Avg' / 'Nan PersonalStep Avg' are kept
# as they are on purpose: the heatmap cells drop 'Nan PersonalStep Avg' by
# exactly this name, so renaming it here alone would raise a KeyError there.
columns = ['Age',
    'Sex',
    'Daytime HR Mean',
    'Daytime HR Q1',
    'Daytime HR Median',
    'Daytime HR Q3',
    'Evening HR Mean',
    'Evening HR Q1',
    'Evening HR Median',
    'Evening HR Q3',
    'Bedtime HR Mean',
    'Bedtime HR Median',
    'Bedtime HR Q3',
    'HRV 20th Percentile',
    'HRV Minimum NN',
    'Steps < 7000 (Daily)',
    'Steps 7000–8000 (Daily)',
    'Steps > 8000 (Daily)',
    'Steps Nan (Daily)',
    'Below Personal Step Avg',
    'Above PersonalStep Avg',
    'Nan PersonalStep Avg',
    'Main Sleep 0h–6h',
    'Main Sleep 6h–12h',
    'Main Sleep 12h–18h',
    'Main Sleep 18h–24h',
    'Main Sleep Time Nan',
    'Minutes Awake During Sleep',
    'DLMO (Method 1)',
    'DLMO (Method 1) Wakeup',
    'DLMO (Method 2)',
    'DLMO (Method 2) Wakeup',
    'Acrophase (48h Cosine fit) ',
    'MESOR (48h Cosine fit)',
    'Acrophase Advanced(48h Cosine fit)',
    'Acrophase Normal(48h Cosine fit)',
    'Acrophase Delayed(48h Cosine fit)',
    'Acrophase Nan(48h Cosine fit)',
    'Acrophase (168h Cosine fit) ',
    'MESOR (168h Cosine fit)',
    'Acrophase Advanced(168h Cosine fit)',
    'Acrophase Normal(168h Cosine fit)',
    'Acrophase Delayed(168h Cosine fit)',
    'Acrophase Nan(168h Cosine fit)']

In [None]:
# One row per explained sample, one column per (day, feature) label.
# NOTE(review): this rebinds `df`, which earlier held the table read from
# feature_selected.csv; re-running the merge cells after this one would
# merge against the SHAP table instead.
df = pd.DataFrame(shap_values, columns=shap_columns)

In [None]:
import numpy as np

# Per-column root-sum-of-squares of the SHAP values, computed three ways:
# over all samples, over the non-negative values only, and over the negative
# values only (the latter stored with a minus sign).
arr = df.values

tmp_list = []
plus_list_squared_sum_sqrt = []
minus_list_squared_sum_sqrt = []

for i in range(len(arr[0])):
    column_values = arr[:, i]
    non_negative = column_values[column_values >= 0]
    negative = column_values[column_values < 0]

    tmp_list.append(np.sqrt(np.sum(np.power(column_values, 2))))
    plus_list_squared_sum_sqrt.append(np.sqrt(np.sum(np.power(non_negative, 2))))
    minus_list_squared_sum_sqrt.append(-np.sqrt(np.sum(np.power(negative, 2))))

In [None]:
# Regroup the flat per-column norms into 7 rows, one per day offset
# (same order as shap_columns: 6 days ago first, today last).
tmp_list = np.array(tmp_list).reshape(7,-1)

In [None]:
# Column labels of the grouped summary; 'Sum' is filled in by a later cell.
summary_columns = [
    'age',
    'sex',
    'Heart Rate Variables',
    'Physical Activity Variables',
    'Sleep Variables',
    'DLMO Variables',
    'Cosine Fitting Variables',
    'Sum',
]

# For each day: age and sex are carried over unchanged, then each variable
# group is reduced to the root-sum-of-squares of its members' norms.
summary_list = []
for day_norms in tmp_list:
    tmp = [day_norms[0], day_norms[1]]
    for group in (slice(2, 15), slice(15, 22), slice(22, 28), slice(28, 32), slice(32, None)):
        tmp.append(np.sqrt(np.sum(np.power(day_norms[group], 2))))
    summary_list.append(tmp)

In [None]:
# Convert the nested list to a 2-D array with one row per day offset.
summary_list = np.array(summary_list).reshape(7,-1)

In [None]:
# Pad the day x group table with one extra leading row and one extra trailing
# column to hold the totals.
n_days, n_groups = summary_list.shape
summary_list_tmp = np.zeros((n_days + 1, n_groups + 1))
summary_list_tmp[1:, :n_groups] = summary_list

# Row 0: per-group total across the days.
for j in range(n_groups):
    summary_list_tmp[0, j] = np.sum(summary_list[:, j])

# Last column: per-row total over the variable groups only (columns 0 and 1,
# i.e. age and sex, are excluded). Row 0 is already filled at this point, so
# its total is included as well.
for table_row in summary_list_tmp:
    table_row[-1] = np.sum(table_row[2:-1])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

# White-to-green colour ramp for the heatmap.
custom_cmap = LinearSegmentedColormap.from_list('custom_cmap', ['white', 'green'])

# Rows: the 'Sum' row followed by the seven day offsets; columns: summary_columns.
tmp_df = pd.DataFrame(summary_list_tmp, columns=summary_columns, index=['Sum','6days ago', '5days ago', '4days ago', '3days ago', '2days ago', '1day ago', 'today'])
# age and sex are left out of the figure.
tmp_df.drop(['age','sex'],axis=1, inplace=True)
# Transpose so the variable groups become rows and the days become columns.
tmp_df_figure = tmp_df.T
plt.figure(figsize=(10,4), dpi=300)
sns.heatmap(tmp_df_figure, annot=True, cmap=custom_cmap, linewidths=0.5, fmt='.2f')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('shap_summary.png', dpi=300)
plt.show()

In [None]:
# White-to-green colour ramp for the heatmap.
custom_cmap = LinearSegmentedColormap.from_list('custom_cmap', ['white', 'green'])

# Per-feature norms (all samples), one row per day offset.
tmp_df = pd.DataFrame(tmp_list, columns=columns, index=['6days ago', '5days ago', '4days ago', '3days ago', '2days ago', '1day ago', 'today'])
# Leave out age/sex and the "Nan" indicator columns.
# 'Nan PersonalStep Avg' must match the spelling used in the `columns` list
# defined a few cells above.
tmp_df.drop(['Age','Sex'],axis=1, inplace=True)
tmp_df.drop(['Steps Nan (Daily)','Nan PersonalStep Avg', 'Main Sleep Time Nan',     'Acrophase Nan(48h Cosine fit)',    'Acrophase Nan(168h Cosine fit)',],axis=1, inplace=True)
# Transpose so the features become rows and the days become columns.
tmp_df_figure = tmp_df.T
plt.figure(figsize=(10, 10))
sns.heatmap(tmp_df_figure, annot=True, cmap=custom_cmap, linewidths=0.5, fmt='.2f')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('shap_summary_detail.png', dpi=300)
plt.show()

In [None]:
# Regroup the positive-only and negative-only norms into 7 rows, one per day
# offset, in the same order as shap_columns.
plus_list_squared_sum_sqrt = np.array(plus_list_squared_sum_sqrt).reshape(7,-1)
minus_list_squared_sum_sqrt = np.array(minus_list_squared_sum_sqrt).reshape(7,-1)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# White-to-red colour ramp for the positive contributions.
custom_cmap = LinearSegmentedColormap.from_list('custom_cmap', ['white', 'red'])

# Positive-only norms per feature, one row per day offset.
tmp_df = pd.DataFrame(plus_list_squared_sum_sqrt, columns=columns, index=['6days ago', '5days ago', '4days ago', '3days ago', '2days ago', '1day ago', 'today'])
# Leave out the "Nan" indicator columns and age/sex.
# 'Nan PersonalStep Avg' must match the spelling used in the `columns` list.
tmp_df.drop(['Steps Nan (Daily)','Nan PersonalStep Avg', 'Main Sleep Time Nan',     'Acrophase Nan(48h Cosine fit)',    'Acrophase Nan(168h Cosine fit)',],axis=1, inplace=True)
tmp_df.drop(['Age','Sex'],axis=1, inplace=True)
# Transpose so the features become rows and the days become columns.
tmp_df_figure = tmp_df.T
plt.figure(figsize= (6.5,10))
# annot=False: no cell values are drawn, so fmt has no visible effect here.
sns.heatmap(tmp_df_figure, annot=False, cmap=custom_cmap, linewidths=0.5, fmt='.2f')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('shap_positive_none_annotation.png', dpi=300)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Blue-to-white colour ramp: the values are <= 0, so the most negative cells
# are drawn in blue.
custom_cmap = LinearSegmentedColormap.from_list('custom_cmap', ['blue', 'white'])


# Negative-only norms per feature (stored with a minus sign), one row per day offset.
tmp_df = pd.DataFrame(minus_list_squared_sum_sqrt, columns=columns, index=['6days ago', '5days ago', '4days ago', '3days ago', '2days ago', '1day ago', 'today'])
# Leave out age/sex and the "Nan" indicator columns.
# 'Nan PersonalStep Avg' must match the spelling used in the `columns` list.
tmp_df.drop(['Age','Sex'],axis=1, inplace=True)
tmp_df.drop(['Steps Nan (Daily)','Nan PersonalStep Avg', 'Main Sleep Time Nan',     'Acrophase Nan(48h Cosine fit)',    'Acrophase Nan(168h Cosine fit)',],axis=1, inplace=True)
# Transpose so the features become rows and the days become columns.
tmp_df_figure = tmp_df.T
plt.figure(figsize= (6.5,10))
# annot=False: no cell values are drawn, so fmt has no visible effect here.
sns.heatmap(tmp_df_figure, annot=False, cmap=custom_cmap, linewidths=0.5, fmt='.2f')
plt.xticks(rotation=45)
plt.tight_layout()
# NOTE(review): the file name lacks the underscore used by its positive
# counterpart ('shap_positive_none_annotation.png') - confirm before renaming,
# since other material may reference this path.
plt.savefig('shap_negativenone_annotation.png', dpi=300)
plt.show()