In [1]:
import pickle
import scipy.constants
import datetime
import tensorflow as tf
import numpy as np
import tqdm
import os
import pandas as pd
import random
import sklearn.model_selection
# Seed the random number generators so that stochastic steps are repeatable.
# random.seed only covers Python's `random` module; NumPy keeps its own global
# generator, so it is seeded separately with the same value.
random.seed(42)
np.random.seed(42)

In [2]:
import hchs_data_pre_processing
import hchs_transformations
import simclr_models
import simclr_utitlities
import simclr_predictions
import chapman_data_pre_processing
import chapman_transformations

In [25]:
# Folder that receives the artefacts of this run (the trained SimCLR model is saved here).
working_directory = 'test_run_chapman/'
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and a no-op when the folder already exists.
os.makedirs(working_directory, exist_ok=True)

# Pickled Chapman inputs, resolved relative to the current working directory.
dataset_save_path = os.path.join(os.getcwd(), "PickledData", "chapman")
user_datasets_path = os.path.join(dataset_save_path, "four_lead_user_datasets.pickle")
path_to_test_train_split_dict = os.path.join(dataset_save_path, "test_train_split_dict.pickle")
path_to_patient_to_rhythm_dict = os.path.join(dataset_save_path, "patient_to_rhythm_dict.pickle")
# for testing
path_to_test_train_split_reduced_dict = os.path.join(dataset_save_path, '100_users_reduced_test_train_split_dict.pickle')
path_to_reduced_four_lead_user_datasets = os.path.join(dataset_save_path, '100_users_datasets.pickle')

# Switch between the reduced files (testing) and the full dataset files.
testing = True
if testing:
    user_datasets_path = path_to_reduced_four_lead_user_datasets
    test_train_split_dict_path = path_to_test_train_split_reduced_dict
else:
    # user_datasets_path already points at the full dataset file defined above.
    test_train_split_dict_path = path_to_test_train_split_dict


# Load Chapman Data

In [4]:
# Layout of the pickled object: {patient_id: (data, leads)}
# The sample inspected in the next cell has data of shape (2500, 4).
# NOTE: pickle executes arbitrary code on load -- only open trusted files.
with open(user_datasets_path, 'rb') as dataset_file:
    user_datasets = pickle.load(dataset_file)

In [5]:
# Use the first patient id as a representative sample and display the shape of its data array.
sample_key = list(user_datasets)[0]
user_datasets[sample_key][0].shape

(2500, 4)

# Pre Processing

In [6]:
# Parameters

# CHECK
number_of_leads = 4
window_size = 200
# Shape of one model input: (samples per window, leads).
input_shape = (window_size, number_of_leads)

# Dataset Metadata
transformation_multiple = 1
dataset_name = 'chapman.pkl'
dataset_name_user_split = 'chapman_user_split.pkl'

# label_list = [0, 1, 2, 3]
# label_list_full_name = ['II', 'AVR', 'AVL', 'V2']
# has_null_class = False

# label_map = dict([(label, fullname) for label, fullname in zip(label_list, label_list_full_name)])
# since we have already applied the encoding
# label_map = dict([(label, label) for label in label_list])
# output_shape = len(label_list)

model_save_name = "chapman_acc"

# Standard gravity from scipy; appears unused in the visible cells of this notebook.
unit_conversion = scipy.constants.g

# a fixed user-split
with open(test_train_split_dict_path, 'rb') as split_file:
    test_train_user_dict = pickle.load(split_file)

train_users = test_train_user_dict['train']
test_users = test_train_user_dict['test']

print(f'Test Numbers: {len(test_users)}, Train Numbers: {len(train_users)}')

Test Numbers: 20, Train Numbers: 80


In [7]:
# Window every user's recording: windows of `window_size` samples, shifted by half a window.
user_datasets_windowed = chapman_data_pre_processing.get_windows_dataset_from_user_list_format(
    user_datasets,
    window_size=window_size,
    shift=window_size // 2,
)

# Display the windowed array shape for the sample patient.
user_datasets_windowed[sample_key].shape

(24, 200, 4)

In [8]:
# Print the windowed array shape of every user that belongs to the test split.
for user, windows in user_datasets_windowed.items():
    if user in test_users:
        print(windows.shape)

(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)
(24, 200, 4)


In [9]:
# Merge the per-user windows into one train array and one test array, using train_users to decide membership.
train_x, test_x = chapman_data_pre_processing.combine_windowed_dataset(
    user_datasets_windowed, train_users
)

for split in (train_x, test_x):
    print(split.shape)

(1920, 200, 4)
(480, 200, 4)


In [10]:
# The first 75% of the rows of train_x become the training part, the remainder the validation part.
split_index = int(0.75 * train_x.shape[0])
train_val_x_split = np.split(train_x, [split_index])
train_x_split, val_x_split = train_val_x_split

In [11]:
# Report the shapes of the train / validation / test arrays, in that order.
for split in (train_x_split, val_x_split, test_x):
    print(split.shape)

(1440, 200, 4)
(480, 200, 4)
(480, 200, 4)


In [10]:
# Window-to-user mapping for the splits; the next cell reads elements 0, 1 and 2 as train, val and test.
user_windowed_list = chapman_data_pre_processing.get_window_to_user_mapping(user_datasets, train_users, test_users, window_size)

In [11]:
# Name the first three entries of the mapping: train, validation and test.
train_window_list, val_window_list, test_window_list = user_windowed_list[:3]

In [12]:
# Arguments for the composite pre-processing call, collected in one place.
composite_kwargs = dict(
    user_datasets=user_datasets,
    train_users=train_users,
    test_users=test_users,
    window_size=window_size,
    shift=window_size // 2,
    normalise_dataset=True,
    verbose=1,
)
np_train, np_val, np_test = chapman_data_pre_processing.pre_process_dataset_composite(**composite_kwargs)

# Shapes of the resulting train / validation / test arrays.
for split in (np_train, np_val, np_test):
    print(split.shape)

step 1 done
step 2 done
step 3 done
step 4 done
step 5 done
(1440, 200, 4)
(480, 200, 4)
(480, 200, 4)
step 6 done
(1440, 200, 4)
(480, 200, 4)
(480, 200, 4)


In [13]:
# as suggested by Chen et al. in CLOCS
batch_size = 256
initial_learning_rate = 0.0001
decay_steps = 1000
# Only a few epochs while testing; the full schedule otherwise.
epochs = 3 if testing else 200

temperature = 0.1
# Positions in transform_funcs_vectorised below: index 1 is scaling, index 2 is negate.
trasnformation_indices = [1, 2]

# Candidate augmentations; trasnformation_indices selects which of them are combined.
transform_funcs_vectorised = [
    chapman_transformations.noise_transform_vectorized,
    chapman_transformations.scaling_transform_vectorized,
    chapman_transformations.negate_transform_vectorized,
    chapman_transformations.time_flip_transform_vectorized,
    chapman_transformations.time_segment_permutation_transform_improved,
    chapman_transformations.time_warp_transform_low_cost,
    chapman_transformations.channel_shuffle_transform_vectorized,
]
# transform_funcs_names = ['noised', 'scaled', 'rotated', 'negated', 'time_flipped', 'permuted', 'time_warped', 'channel_shuffled']
# Display names, listed in the same order as transform_funcs_vectorised.
transform_funcs_names = ['noised', 'scaled', 'negated', 'time_flipped', 'permuted', 'time_warped', 'channel_shuffled']



In [14]:
# Timestamp used to tag the artefacts written by this run.
start_time = datetime.datetime.now()
start_time_str = start_time.strftime("%Y%m%d-%H%M%S")
print(start_time_str)
tf.keras.backend.set_floatx('float32')

# Seed TensorFlow's global generator so that TF-side randomness is repeatable
# when this cell is re-run. Bitwise-identical results are still not guaranteed
# on every device.
tf.random.set_seed(42)

# Cosine-decayed learning rate driving plain SGD.
lr_decayed_fn = tf.keras.experimental.CosineDecay(initial_learning_rate=initial_learning_rate, decay_steps=decay_steps)
optimizer = tf.keras.optimizers.SGD(lr_decayed_fn)
# Single augmentation function built from the transforms picked by trasnformation_indices.
transformation_function = simclr_utitlities.generate_combined_transform_function(transform_funcs_vectorised, indices=trasnformation_indices)

base_model = simclr_models.create_base_model(input_shape, model_name="base_model")
simclr_model = simclr_models.attach_simclr_head(base_model)
simclr_model.summary()

trained_simclr_model, epoch_losses = simclr_utitlities.simclr_train_model(
    simclr_model,
    np_train,
    optimizer,
    batch_size,
    transformation_function,
    temperature=temperature,
    epochs=epochs,
    is_trasnform_function_vectorized=True,
    verbose=1,
)

# os.path.join instead of string concatenation: the path stays valid even if
# working_directory is ever configured without its trailing slash.
simclr_model_save_path = os.path.join(working_directory, f"{start_time_str}_simclr.hdf5")
trained_simclr_model.save(simclr_model_save_path)



20210227-163640
<function scaling_transform_vectorized at 0x000001FFD13E0280>
<function negate_transform_vectorized at 0x000001FFD13E0310>
Model: "base_model_simclr"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 200, 4)]          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 177, 32)           3104      
_________________________________________________________________
dropout (Dropout)            (None, 177, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 162, 64)           32832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 162, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 155, 9

In [12]:

# if testing:
#     total_epochs = 3
#     batch_size = 32
# else:
#     total_epochs = 50
#     batch_size = 256
# tag = "linear_eval"

# simclr_model = tf.keras.models.load_model(simclr_model_save_path)
# linear_evaluation_model = simclr_models.create_linear_model_from_base_model(simclr_model, output_shape, intermediate_layer=7)

# best_model_file_name = f"{working_directory}{start_time_str}_simclr_{tag}.hdf5"
# best_model_callback = tf.keras.callbacks.ModelCheckpoint(best_model_file_name,
#     monitor='val_loss', mode='min', save_best_only=True, save_weights_only=False, verbose=0
# )

# training_history = linear_evaluation_model.fit(
#     x = np_train[0],
#     y = np_train[1],
#     batch_size=batch_size,
#     shuffle=True,
#     epochs=total_epochs,
#     callbacks=[best_model_callback],
#     validation_data=np_val
# )

# best_model = tf.keras.models.load_model(best_model_file_name)

# print("Model with lowest validation Loss:")
# print(simclr_utitlities.evaluate_model_simple(best_model.predict(np_test[0]), np_test[1], return_dict=True))
# print("Model in last epoch")
# print(simclr_utitlities.evaluate_model_simple(linear_evaluation_model.predict(np_test[0]), np_test[1], return_dict=True))


In [None]:
# t-SNE projection of intermediate-layer embeddings of the test data, drawn as
# a scatter plot coloured by label and saved as a PNG.
#
# NOTE(review): this cell has no execution count and looks stale relative to
# the rest of the notebook -- confirm each point before running it:
#   * `plt`, `sns` and `sklearn.manifold` are not imported in the visible
#     import cells (only `sklearn.model_selection` is).
#   * `np_test` is printed earlier as a single array of shape (480, 200, 4),
#     so `np_test[0]` / `np_test[1]` look like leftovers from an
#     (inputs, labels) tuple format.
#   * `label_list_full_name` only appears in a commented-out line of the
#     parameters cell.
#   * the embeddings come from `simclr_model`, while the later embedding cell
#     uses `trained_simclr_model` -- TODO confirm which one is intended.
print("starting tsne")

target_model = simclr_model 
perplexity = 30.0
intermediate_model = simclr_models.extract_intermediate_model_from_base_model(target_model, intermediate_layer=7)
intermediate_model.summary()

# Embed the test inputs, then project the embeddings with t-SNE (fixed random_state).
embeddings = intermediate_model.predict(np_test[0], batch_size=batch_size)
tsne_model = sklearn.manifold.TSNE(perplexity=perplexity, verbose=1, random_state=42)
tsne_projections = tsne_model.fit_transform(embeddings)
print("done projections")

# Class index per sample via argmax over axis 1 -- assumes np_test[1] holds one-hot / score rows; TODO confirm.
labels_argmax = np.argmax(np_test[1], axis=1)
unique_labels = np.unique(labels_argmax)

plt.figure(figsize=(16,8))
graph = sns.scatterplot(
    x=tsne_projections[:,0], y=tsne_projections[:,1],
    hue=labels_argmax,
    palette=sns.color_palette("hsv", len(unique_labels)),
    s=50,
    alpha=1.0,
    rasterized=True
)
# Hide the axis ticks.
plt.xticks([], [])
plt.yticks([], [])


plt.legend(loc='lower left', bbox_to_anchor=(0.25, -0.3), ncol=2)
legend = graph.legend_
# Replace the legend entries with the full label names.
for j, label in enumerate(unique_labels):
    legend.get_texts()[j].set_text(label_list_full_name[label]) 

# Target file: <cwd>/plots/tsne/chapman/<timestamp>_tsne.png -- the folder is not created here; presumably it must already exist.
tsne_save_name = f"{start_time_str}_tsne.png"
tsne_plt_save_path = os.path.join(os.getcwd(), "plots", "tsne", "chapman", tsne_save_name)
plt.savefig(tsne_plt_save_path)
plt.show()

In [16]:
# get window to user mappings
train_val_test_user_window_list = chapman_data_pre_processing.get_window_to_user_mapping(
    user_datasets, train_users, test_users, window_size
)

# get window level embeddings
intermediate_model = simclr_models.extract_intermediate_model_from_base_model(
    trained_simclr_model, intermediate_layer=7
)
intermediate_model.summary()

np_train_val_test = [np_train, np_val, np_test]

# One DataFrame of predicted embeddings per split, in train / val / test order.
embeddings_train_val_test_dataframes = [
    pd.DataFrame(intermediate_model.predict(np_dataset, batch_size=batch_size))
    for np_dataset in np_train_val_test
]

Model: "base_model_simclr_layer_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 200, 4)]          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 177, 32)           3104      
_________________________________________________________________
dropout (Dropout)            (None, 177, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 162, 64)           32832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 162, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 155, 96)           49248     
_________________________________________________________________
dropout_2 (Dropout)          (None, 155, 

In [17]:
aggregate = 'mean'
# Reductions that are selected by name; any other value of `aggregate` means median.
named_aggregates = ('mean', 'std', 'min', 'max')

train_val_test_user_level_activations = []
for embedding_dataframe, user_window_list in zip(embeddings_train_val_test_dataframes, train_val_test_user_window_list):
    # Re-index the rows with the window-to-user mapping so they can be grouped per user.
    embedding_dataframe.index = user_window_list
    windows_by_user = embedding_dataframe.groupby(embedding_dataframe.index)

    reducer_name = aggregate if aggregate in named_aggregates else 'median'
    user_level_activation = getattr(windows_by_user, reducer_name)()

    train_val_test_user_level_activations.append(user_level_activation)

In [26]:
# Load the patient -> rhythm lookup used to label users below.
# NOTE: pickle executes arbitrary code on load -- only open trusted files.
with open(path_to_patient_to_rhythm_dict, 'rb') as rhythm_file:
    patient_to_rhythm_dict = pickle.load(rhythm_file)

In [29]:
# Element 0 is the train split: the list was built in train / val / test order.
train_user_level_activations = train_val_test_user_level_activations[0]

In [35]:
# The index of the aggregated frame holds one entry per train user; look up each user's rhythm.
train_user_window_list = list(train_user_level_activations.index.values)
label_list = [
    patient_to_rhythm_dict[patient_id]
    for patient_id in train_user_window_list
]
label_list

['GSVT',
 'SB',
 'SB',
 'GSVT',
 'AFIB',
 'AFIB',
 'SB',
 'SR',
 'SB',
 'GSVT',
 'AFIB',
 'AFIB',
 'GSVT',
 'AFIB',
 'SB',
 'SR',
 'SB',
 'SB',
 'SB',
 'SB',
 'AFIB',
 'SB',
 'GSVT',
 'SB',
 'SB',
 'SB',
 'GSVT',
 'SB',
 'SB',
 'SB',
 'AFIB',
 'GSVT',
 'AFIB',
 'SB',
 'SB',
 'AFIB',
 'GSVT',
 'SB',
 'AFIB',
 'SB',
 'SB',
 'SB',
 'SB',
 'SB',
 'SB',
 'GSVT',
 'SR',
 'SR',
 'SR',
 'SR',
 'SR',
 'SR',
 'SR',
 'SR',
 'SR',
 'SR',
 'GSVT',
 'GSVT',
 'GSVT',
 'AFIB']