In [1]:
import os

# Hide all CUDA devices so TensorFlow runs on the CPU only.
# The variable is set *before* TensorFlow is imported so that it is guaranteed
# to be in place before any CUDA initialisation can happen.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import tensorflow as tf

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Import layers through the public Keras API. The private
# `tensorflow.python.keras` path is not a supported entry point, and the rest
# of the notebook builds its model from `tf.keras` objects.
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.layers import LeakyReLU

# Report how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [None]:
# Enable memory growth on every GPU that TensorFlow reports.
# With no visible GPU the loop body never runs, so no explicit emptiness check
# is needed. A RuntimeError from TensorFlow is printed instead of aborting the
# notebook.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    print(err)

In [None]:
# Autoencoder based on https://dropsofai.com/autoencoders-in-keras-and-deep-learning/

In [3]:
def unison_shuffled_copies(a, b, rng=None):
    """Return shuffled copies of ``a`` and ``b`` using one shared permutation.

    Both inputs are reordered along their first axis with the same random
    permutation, so element ``i`` of the first result still corresponds to
    element ``i`` of the second result.
    Based on https://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison

    Parameters
    ----------
    a, b : array-like
        Sequences with the same length along the first axis. They are
        converted with ``np.asarray`` and are not modified.
    rng : None, int or numpy.random.Generator, optional
        ``None`` (default) draws the permutation from NumPy's global random
        state, exactly as before. Any other value is passed to
        ``np.random.default_rng`` and makes the shuffle reproducible.

    Returns
    -------
    tuple of numpy.ndarray
        ``(a_shuffled, b_shuffled)``.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` differ in length.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if len(a) != len(b):
        raise ValueError(
            "a and b must have the same length, got %d and %d" % (len(a), len(b))
        )
    if rng is None:
        p = np.random.permutation(len(a))
    else:
        p = np.random.default_rng(rng).permutation(len(a))
    # Fancy indexing returns new arrays; the inputs stay untouched.
    return a[p], b[p]

In [4]:
# Load the wildtype and the D132H mutant data sets, one sample per row, and
# create a label vector for each (0 = wildtype, 1 = mutant).
wildtype_data = np.loadtxt("thresh0.002_enriched_2-20aa_angle_result.txt")  # read in wildtype lcc data
#wildtype_data = wildtype_data [:,1:] # delete first column which is frame number
wildtype_label = np.zeros(len(wildtype_data))  # set wildtype labels to 0

mutant_1_data = np.loadtxt("thresh0.002_enriched_2-20aa_angle_result_D132H.txt")  # read in mutant lcc data
#mutant_1_data = mutant_1_data [:,1:] # delete first column which is frame number
mutant_1_label = np.ones(len(mutant_1_data))  # set mutant labels to 1

print('Wildtype Training Data Shape:', wildtype_data.shape)
print('Wildtype Label Data Shape:   ', wildtype_label.shape)
print('D132-H   Training Data Shape:', mutant_1_data.shape)
print('D132-H   Label Data Shape:   ', mutant_1_label.shape)

# Overlay randomly chosen rows of both data sets as nearly transparent lines
# so that dense regions show up as saturated colour.
n_preview = 1000  # number of random rows drawn from each data set
for j in range(n_preview):
    # Each data set gets an index drawn from its own length; reusing the
    # wildtype index for the mutant array would raise IndexError (or never
    # sample the tail) when the two files differ in row count.
    i_wildtype = np.random.randint(0, len(wildtype_data))
    i_mutant = np.random.randint(0, len(mutant_1_data))
    plt.plot(wildtype_data[i_wildtype], color="navy", alpha=0.002)
    plt.plot(mutant_1_data[i_mutant], color="red", alpha=0.002)
plt.savefig("input.png", dpi=300)

Wildtype Training Data Shape: (40000, 101)
Wildtype Label Data Shape:    (40000,)
D132-H   Training Data Shape: (40000, 101)
D132-H   Label Data Shape:    (40000,)


In [5]:
# Build one combined data set: wildtype rows first, mutant rows second.
lcc_data = np.vstack([wildtype_data, mutant_1_data])
print("Combined input_data.shape:", lcc_data.shape)

# Labels are concatenated in the same order as the rows above.
label_data = np.hstack([wildtype_label, mutant_1_label])
print("Combined label_data.shape:", label_data.shape, "\n")

# Shuffle rows and labels with one shared permutation so that every row keeps
# its own label.
shuffled_pair = unison_shuffled_copies(lcc_data, label_data)
lcc_data, label_data = shuffled_pair

Combined input_data.shape: (80000, 101)
Combined label_data.shape: (80000,) 



In [6]:
# Sanity check after shuffling: plot the first 1000 rows coloured by label.
# The picture should look very similar to the one produced when the data was
# loaded.
label_summary = []
test_1 = np.array(lcc_data)
test_2 = np.array(label_data)

# (label value, line colour) pairs; rows with any other label are skipped.
label_colours = ((0.0, "navy"), (1.0, "magenta"))

for row in range(1000):
    row_label = test_2[row]
    for label_value, colour in label_colours:
        if row_label == label_value:
            plt.plot(test_1[row], color=colour, alpha=0.01)
            label_summary.append(int(label_value))

plt.savefig("shuffled.png", dpi=300)
print(label_summary[0:20])

[1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0]


In [7]:
# Normalise the shuffled data and split it into a training and a test part.
NORMALIZATION_DIVISOR = 180  # presumably the inputs are angles in degrees - TODO confirm
TRAIN_FRACTION = 0.8         # 80% of data used for training

upper_limit          = len(lcc_data)  # get total length of concatenated data
upper_training_limit = int(len(lcc_data) * TRAIN_FRACTION)
print ("Total number of combined data points:\t\t\t",upper_limit, "\nTotal number of data points selected for training:\t", upper_training_limit, "\n")

# The normalised data gets its own name. Rebinding `lcc_data` itself made this
# cell non-idempotent: every re-run divided the data by 180 once more.
# NOTE: `lcc_data` therefore stays un-normalised after this cell.
lcc_data_normalized = lcc_data / NORMALIZATION_DIVISOR
train_data = lcc_data_normalized[:upper_training_limit, :]             # first 80%
test_data  = lcc_data_normalized[upper_training_limit:upper_limit, :]  # last 20%

train_label = label_data[:upper_training_limit]             # labels of the training rows
test_label  = label_data[upper_training_limit:upper_limit]  # labels of the test rows

# Every row must end up in exactly one of the two parts.
assert len(train_data) + len(test_data) == upper_limit
assert len(train_label) == len(train_data) and len(test_label) == len(test_data)

print (train_data.shape, test_data.shape)
print (train_label.shape, test_label.shape)

Total number of combined data points:			 80000 
Total number of data points selected for training:	 64000 

(64000, 101) (16000, 101)
(64000,) (16000,)


In [8]:
#Martina's autoencoder
# Dense autoencoder: input -> 256 -> 64 -> 2 (latent) -> 64 -> 256 -> output.
# The output layer has the same width as the input. The layer names
# ('e1', 'ae_latent', ...) are looked up again later via `get_layer`, so they
# must not change.

LeReLU_alpha = 0.2  # slope passed to every LeakyReLU activation


def _leaky_relu():
    """Return a new LeakyReLU activation so every layer owns its own instance."""
    return tf.keras.layers.LeakyReLU(alpha=LeReLU_alpha)


# Input width is taken from the data (instead of a hard-coded 101) and passed
# as a real one-element tuple; `(101)` is just the integer 101.
n_features = train_data.shape[1]
input_data = tf.keras.layers.Input(shape=(n_features,))

# All layers come from the public `tf.keras.layers` namespace so that the
# input and the dense layers belong to the same Keras implementation.
encoder = tf.keras.layers.Dense(256, activation=_leaky_relu(), name='e1')(input_data)
encoder = tf.keras.layers.Dense(64,  activation=_leaky_relu(), name='e2')(encoder)

encoded = tf.keras.layers.Dense(2,   activation=_leaky_relu(), name='ae_latent')(encoder)

decoder = tf.keras.layers.Dense(64,  activation=_leaky_relu(), name='d1')(encoded)
decoder = tf.keras.layers.Dense(256, activation=_leaky_relu(), name='d2')(decoder)

output_layer = tf.keras.layers.Dense(n_features, activation=_leaky_relu(), name='ae_output')(decoder)

autoencoder = tf.keras.models.Model(input_data, output_layer)

In [9]:
# Compile the autoencoder built in the previous cell with a mean-squared-error
# reconstruction loss and the Adam optimiser, then print the layer overview.
# The model is not rebuilt here: the previous cell already creates the same
# `Model(input_data, output_layer)`, so a second instantiation only produced a
# duplicate wrapper around the same layers.
autoencoder.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
)
autoencoder.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 101)]             0         
_________________________________________________________________
e1 (Dense)                   (None, 256)               26112     
_________________________________________________________________
e2 (Dense)                   (None, 64)                16448     
_________________________________________________________________
ae_latent (Dense)            (None, 2)                 130       
_________________________________________________________________
d1 (Dense)                   (None, 64)                192       
_________________________________________________________________
d2 (Dense)                   (None, 256)               16640     
_________________________________________________________________
ae_output (Dense)            (None, 101)               2595

In [None]:
%%capture
plt.clf()
for counts in range (0, 100): #this determines the number of epoch sets
    history = autoencoder.fit(train_data, train_data, batch_size = 256, 
                              epochs = 1000, validation_data = (test_data, test_data))
    # convert history object to dataframe and plot rates
    training_history = pd.DataFrame(history.history)
    plt.plot (training_history);
    file_name_0 = "history_" + str(counts)
    training_history.to_pickle(file_name_0)
    file_name_1 = str(counts) + ".png"
    plt.savefig(file_name_1, dpi=300)
    plt.clf()
    
    # read in latent layer
    dr_model = tf.keras.models.Model(inputs  = autoencoder.get_layer('e1').input, 
                                     outputs = autoencoder.get_layer('ae_latent').output)
    dr_model.summary()
    
    # put the validation data through current latent layer model
    x = []
    y = []
    z = []
    for i in range(8000):
        z.append(test_label[i])
        op = dr_model.predict(np.array([test_data[i]]))
        x.append(op[0][0])
        y.append(op[0][1])

    df = pd.DataFrame()
    df['x'] = x
    df['y'] = y
    df['z'] = ["trajectory-" + str(k) for k in z]
 
    plt.figure(figsize = (8, 6));
    fig = sns.scatterplot(x = 'x', y='y', hue='z', data=df, s=10)
    file_name_2 = str(counts) + "_latent.png"
    fig.figure.savefig(file_name_2, dpi = 300)
    plt.clf()
    
    #
    df.to_pickle(str(counts))
    file_name = 'models/saved_model_' + str(counts)
    autoencoder.save(file_name)