In [1]:
import numpy as np
import os
import subprocess
import theano

In [None]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [2]:
TRAIN_PATH = 'secondary_proteins_prediction/data/cullpdb+profile_6133_filtered.npy.gz'
TEST_PATH =  'secondary_proteins_prediction/data/cb513+profile_split1.npy.gz'

In [3]:
def load_gz(path):  # load a .npy.gz file
    if path.endswith(".gz"):
        f = open(path, 'rb')
        return np.load(f)
    else:
        return np.load(path)

In [4]:
def get_train(path=TRAIN_PATH):
    if not os.path.isfile(path):
        print("Train path is not downloaded ...")
        subprocess.call("./download_train.sh", shell=True)
    else:
        print("Train path is downloaded ...")
    print("Loading train data ...")
    X_in = load_gz(path)
    X = np.reshape(X_in, (5534, 700, 57))
    del X_in
    X = X[:, :, :].astype(theano.config.floatX)
  
    seq_names = np.arange(0, np.size(X, 0))

    #X_train = X[seq_names[0:5278]]
    X_train = X[seq_names[0:5534]]
    X_valid = X[seq_names[5278:5534]]
    
    return X_train, X_valid

In [5]:
X_train, X_valid= get_train(TRAIN_PATH)
print("Done loading train")
print(X_train.shape)

Train path is downloaded ...
Loading train data ...
Done loading train
(5534, 700, 57)


In [6]:
def get_test(path=TEST_PATH):
    if not os.path.isfile(path):
        subprocess.call("./download_test.sh", shell=True)
    print("Loading test data ...")
    X_test_in = load_gz(path)
    X_test = np.reshape(X_test_in, (514, 700, 57))
    del X_test_in
    X_test = X_test[:, :, :].astype(theano.config.floatX)

    return X_test

In [7]:
X_test = get_test(TEST_PATH)
print("Done loading test")
print(X_test.shape)

Loading test data ...
Done loading test
(514, 700, 57)


In [8]:
#To do: Make 1 hot encoded class - from Q8 to Q3
       #Reshape the dataset so it has 700*windowSize seq that map to a class
       #Look inside lasagne to see how we disregard the padding
##

def q8ClassToQ3(q8Labels):
    
    q3 = np.zeros(3)
    index = np.argmax(q8Labels)
    
    #Helix
    if index == 5 or index == 3 or index == 4 : # H ,G, I
        q3[0] = 1   
    #beta    
    if index == 1 or index == 2: # B, E
        q3[1] = 1    
    #coil    
    if index == 7 or index == 6 or index == 0 : # T, S, L
        q3[2] = 1
    return q3

def q8ClassToQ2(q8Labels):
    
    q2 = np.zeros(2)
    index = np.argmax(q8Labels)
    #Helix
    if index == 5: # or index == 3 or index == 4 : # H ,G, I
        q2[0] = 1
    else:
        q2[1] = 1  
    return q2

def q8toQ8(q8Labels):
    return q8Labels

def changeQ8Class(dataSet, reductionFunction, numberOfFeatures):

    num_seqs = np.size(dataSet, 0)
    seqlen = np.size(dataSet, 1)
    labels_new = np.zeros((num_seqs, seqlen, numberOfFeatures))

    for i in range(np.size(dataSet, axis=0)):
        for j in range(np.size(dataSet, axis=1)):
            oneHot = reductionFunction(dataSet[i, j, 22:30])
            features = np.concatenate((dataSet[i, j, 0:21], oneHot), axis=None)
            features = np.concatenate((features, dataSet[i, j, 35:56]), axis=None)
            labels_new[i][j] = features
    return labels_new


In [9]:
def swapClassLabel(features, classLabel, classSize):

    res = np.concatenate((features[0:21], classLabel), axis=None)
    res = np.concatenate((res, features[ (21+classSize) :]), axis=None)
  
    return res

In [10]:
import math

#num_classes should be q8, q3 or maybe q2?
def proteinSequenceToWindowSequence(windowSize, predictionIndex , dataSet, classSize):
   
    num_seqs = np.size(dataSet, 0)
    seqlen = np.size(dataSet, 1)
    features = np.size(dataSet, 2)
    dataSet_new = np.zeros((num_seqs, seqlen - windowSize + 1, windowSize, features))
    
    for i in range(np.size(dataSet, axis=0)):
        if i % 100 == 0:
            print(i)
        for j in range(np.size(dataSet, axis=1) - windowSize):
            classLabel = dataSet[i][j + predictionIndex][21 : (21+classSize) ]
            for k in range(windowSize):
                if predictionIndex != 0:
                    dataSet_new[i][j][k] = swapClassLabel(dataSet[i][j+k], classLabel, classSize)
                else:
                    dataSet_new[i][j][k] = swapClassLabel(dataSet[i][j+k+1], classLabel, classSize)
            
    return dataSet_new    

In [11]:
def removeWindowsWithPadding(dataSet, windowSize, numberOfFeatures):
    
    dataSet = np.reshape(dataSet, (dataSet.shape[0]*dataSet.shape[1], windowSize, numberOfFeatures))
    dataSet = dataSet[np.count_nonzero( dataSet, axis=(1,2))>(int(windowSize/2)*23), :, :] 
    
    return dataSet

In [12]:
def get_reshaped_dataset(X_train, X_valid, reductionFunction, numberOfFeatures, classSize, predictionIndex, windowSize):
    print(X_train.shape)
    X_train = changeQ8Class(X_train, reductionFunction, numberOfFeatures)
    print(X_train.shape, "changed train data to class of size ", classSize)
    X_train_window = proteinSequenceToWindowSequence(windowSize,predictionIndex, X_train, classSize)
    print(X_train_window.shape, "changed train data  to window sequence of size ", windowSize)
    X_train_window = removeWindowsWithPadding(X_train_window , windowSize, numberOfFeatures)
    print(X_train_window.shape, "filtered windows withouth padding of train data ")

    print(X_valid.shape)
    X_valid = changeQ8Class(X_valid, reductionFunction, numberOfFeatures)
    print(X_valid.shape, "changed validation data to class size ", classSize)
    X_valid_window = proteinSequenceToWindowSequence(windowSize,predictionIndex, X_valid, classSize)
    print(X_valid_window.shape, "changed validation data to window sequence of size ", windowSize)
    X_valid_window = removeWindowsWithPadding(X_valid_window , windowSize, numberOfFeatures)
    print(X_valid_window.shape, "filtered windows withouth padding of validation data")
    
    return X_train_window, X_valid_window

In [13]:
def one_hot_to_tapped(oneHot, index, windowSize):

    oneHot = oneHot.copy()
    newOne = 0.5/(windowSize - 1) * index + 0.5
    newZero = (1 - newOne) / (np.size(oneHot) - 1)
        
    oneHot = oneHot.astype(theano.config.floatX)
    oneHot[ oneHot == 1] = newOne
    oneHot[ oneHot == 0] = newZero

    return oneHot

In [14]:
def get_tapped_one_dataset(dataSet, windowSize, classSize):
    
    newDataSet = dataSet.copy()
    for i in range(np.size(newDataSet, axis=1)): 
        for j in range(np.size(newDataSet, axis=1)):

            oneHot = one_hot_to_tapped( newDataSet[i, j, 0:21], j, windowSize)
            features = np.concatenate((oneHot, newDataSet[i, j, 21:]), axis=None)
            newDataSet[i][j] = features
    
    return newDataSet


In [15]:
def get_split(X_train, X_valid, classSize, pssm = False):

    if not pssm:
        return (X_train[:,:,0:21], X_train[:,:,21 : (21+classSize)], 
                X_valid[:,:,0:21], X_valid[:,:,21 : (21+classSize)])
    else:
        return (X_train[:,:,21+classSize:], X_train[:,:,21 : (21+classSize)],
                X_valid[:,:,21+classSize:], X_valid[:,:,21 : (21+classSize)])

In [16]:
windowSize = 19
predictionIndex = 9
classSize = 8  # 2 or 3 
numberOfFeatures = 50 #44 

amino_acid_residues = 21
num_classes = 8

In [None]:
X_train_window, X_valid_window = get_reshaped_dataset(X_train, X_valid, q8toQ8, numberOfFeatures, classSize, predictionIndex, windowSize)

In [None]:
np.save('all_dataset_window19Middle.npy', X_train_window) # save
#np.save('X_valid_window19Middle.npy', X_valid_window)

In [17]:
X_train_window = np.load('X_train_window19Middle.npy') # load
X_valid_window = np.load('X_valid_window19Middle.npy') # load

In [18]:
#a = get_tapped_one_dataset(X_train_window, windowSize, classSize)
#b = get_tapped_one_dataset(X_valid_window, windowSize, classSize)

x_train_final, y_train_final, x_valid_final, y_valid_final = get_split(X_train_window, X_valid_window, classSize, pssm = False)
print(x_train_final.shape, "training data")
print(y_train_final.shape, "labels for training data")
print(x_valid_final.shape, "validation data")
print(y_valid_final.shape, "labels for training validation")

(1082350, 19, 21) training data
(1082350, 19, 8) labels for training data
(51152, 19, 21) validation data
(51152, 19, 8) labels for training validation


In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Conv1D, AveragePooling1D, MaxPooling1D, TimeDistributed, LeakyReLU, BatchNormalization, Flatten
from tensorflow.keras import optimizers, callbacks
from tensorflow.keras import regularizers


LR = 0.0005
drop_out = 0.3
batch_dim = 64
nn_epochs = 1
w_reg = regularizers.l2(0.0001)
number_filters = 16

loss = 'categorical_crossentropy'


m = Sequential()

#first convolutional neural netwok
m.add(Conv1D( 64 , 19,  strides=1, padding='same', activation='relu', use_bias=True, input_shape=(windowSize, 21), kernel_regularizer=w_reg))
m.add(BatchNormalization())

m.add(Conv1D( 64, 19,  strides=1, padding='same', activation='relu', use_bias=True, kernel_regularizer=w_reg))
m.add(BatchNormalization())
m.add(Conv1D( 64, 11,  strides=1, padding='same', activation='relu', use_bias=True, kernel_regularizer=w_reg))
m.add(BatchNormalization())
m.add(Conv1D( 64, 3,  strides=1, padding='same', activation='relu', use_bias=True, kernel_regularizer=w_reg))
m.add(BatchNormalization())


m.add(Conv1D( 64, 19,  strides=1, padding='same', activation='relu', use_bias=True, kernel_regularizer=w_reg))
m.add(BatchNormalization())
m.add(Conv1D( 64, 11,  strides=1, padding='same', activation='relu', use_bias=True, kernel_regularizer=w_reg))
m.add(BatchNormalization())
m.add(Conv1D( 64, 3,  strides=1, padding='same', activation='relu', use_bias=True, kernel_regularizer=w_reg))
m.add(BatchNormalization())

m.add(Conv1D( 64, 19,  strides=1, padding='same', activation='relu', use_bias=True, kernel_regularizer=w_reg))
m.add(BatchNormalization())
m.add(Conv1D( 64, 11,  strides=1, padding='same', activation='relu', use_bias=True, kernel_regularizer=w_reg))
m.add(BatchNormalization())
m.add(Conv1D( 64, 3,  strides=1, padding='same', activation='relu', use_bias=True, kernel_regularizer=w_reg))
m.add(BatchNormalization())



#4 dense layer
m.add(Dense(200, activation='relu', use_bias=True,  kernel_regularizer=w_reg))

#5 softmax output layer
m.add(Dense(num_classes, activation = 'softmax'))

opt = optimizers.Adam(lr=LR)
m.compile(optimizer=opt, loss=loss,metrics=['accuracy', 'mae'])

print("\nHyper Parameters\n")
print("Learning Rate: " + str(LR))
print("Drop out: " + str(drop_out))
print("Batch dim: " + str(batch_dim))
print("Number of epochs: " + str(nn_epochs))
print("Regularizers: " + str(w_reg.l2))
print("\nLoss: " + loss + "\n")
m.summary()


Hyper Parameters

Learning Rate: 0.0005
Drop out: 0.3
Batch dim: 64
Number of epochs: 1
Regularizers: 1e-04

Loss: categorical_crossentropy

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 19, 64)            25600     
_________________________________________________________________
batch_normalization (BatchNo (None, 19, 64)            256       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 19, 64)            77888     
_________________________________________________________________
batch_normalization_1 (Batch (None, 19, 64)            256       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 19, 64)            45120     
_________________________________________________________________
batch_normalization_2 (Batch (None, 19, 64)   

In [None]:
import keras
from keras.models import Model
from keras.layers import Concatenate, Dense, LSTM, Input, concatenate, Conv1D, BatchNormalization
from keras.optimizers import Adagrad, Adam
from keras.utils import plot_model


LR = 0.0005
drop_out = 0.3
batch_dim = 64
nn_epochs = 1
w_reg = regularizers.l2(0.0001)
number_filters = 16

loss = 'categorical_crossentropy'



input_shape = (windowSize, 21)

conv1_input = Input(shape=(windowSize, 21), name='InputWindow')

conv_1 = Conv1D( 64 , 19,  strides=1, padding='same', activation='relu', use_bias=True,kernel_regularizer=w_reg, name='Network1-filter1')(conv1_input)
conv_1 = BatchNormalization(name='BN1')(conv_1)
conv_2 = Conv1D( 64 , 11,  strides=1, padding='same', activation='relu', use_bias=True,kernel_regularizer=w_reg, name='Network1-filter2')(conv1_input)
conv_2 = BatchNormalization(name='BN2')(conv_2)
conv_3 = Conv1D( 64 , 3,  strides=1, padding='same', activation='relu', use_bias=True,kernel_regularizer=w_reg, name='Network1-filter3')(conv1_input)
conv_3 = BatchNormalization(name='BN3')(conv_3)

merge_1 = concatenate([conv_1, conv_2, conv_3], name='Network1')
input_for_second = concatenate([conv1_input, merge_1], name='Network1-and-input')



conv_4 = Conv1D( 64 , 19,  strides=1, padding='same', activation='relu', use_bias=True,kernel_regularizer=w_reg, name='Network2-filter1')(input_for_second)
conv_4 = BatchNormalization(name='BN4')(conv_4)
conv_5 = Conv1D( 64 , 11,  strides=1, padding='same', activation='relu', use_bias=True,kernel_regularizer=w_reg, name='Network2-filter2')(input_for_second)
conv_5 = BatchNormalization(name='BN5')(conv_5)
conv_6 = Conv1D( 64 , 3,  strides=1, padding='same', activation='relu', use_bias=True,kernel_regularizer=w_reg, name='Network2-filter3')(input_for_second)
conv_6 = BatchNormalization(name='BN6')(conv_6)

merge_2 = concatenate([conv_4, conv_5, conv_6], name='Network2')
input_for_third = concatenate([conv1_input, merge_1, merge_2],name='Network1-Network2-and-input')



conv_7 = Conv1D( 64 , 19,  strides=1, padding='same', activation='relu', use_bias=True,kernel_regularizer=w_reg, name='Network3-filter1')(input_for_third)
conv_7 = BatchNormalization(name='BN7')(conv_7)
conv_8 = Conv1D( 64 , 11,  strides=1, padding='same', activation='relu', use_bias=True,kernel_regularizer=w_reg, name='Network3-filter2')(input_for_third)
conv_8 = BatchNormalization(name='BN8')(conv_8)
conv_9 = Conv1D( 64 , 3,  strides=1, padding='same', activation='relu', use_bias=True,kernel_regularizer=w_reg, name='Network3-filter3')(input_for_third)
conv_9 = BatchNormalization(name='BN9')(conv_9)

merge_3 = concatenate([conv_7, conv_8, conv_9],name='Network3')

merge_final = concatenate([merge_1, merge_2, merge_3], name='Final')




first_dense = Dense(200, activation='relu', use_bias=True,  kernel_regularizer=w_reg, name='last')(merge_final)
first_dense = BatchNormalization(name='BN10')(first_dense)
final_model_output = Dense(num_classes, activation = 'softmax', name='softmax')(first_dense)

m = Model(inputs=conv1_input, outputs=final_model_output)

opt = Adam(lr=LR)
m.compile(optimizer=opt, loss=loss,metrics=['accuracy', 'mae'])

print("\nHyper Parameters\n")
print("Learning Rate: " + str(LR))
print("Drop out: " + str(drop_out))
print("Batch dim: " + str(batch_dim))
print("Number of epochs: " + str(nn_epochs))
print("Regularizers: " + str(w_reg.l2))
print("\nLoss: " + loss + "\n")
m.summary()
import os
os.environ["PATH"] += os.pathsep + 'C:/Users/Ieremie/Anaconda3/pkgs/graphviz-2.38-hfd603c8_2/Library/bin'
from keras.utils import plot_model
plot_model(m, to_file='model.png')

In [20]:
from time import time
from timeit import default_timer as timer

start_time = timer()
history = m.fit(x_train_final, y_train_final, epochs=nn_epochs, batch_size=batch_dim, validation_data=(x_valid_final, y_valid_final) ,shuffle=True)

end_time = timer()
print("\n\nTime elapsed: " + "{0:.2f}".format((end_time - start_time)) + " s")

Train on 1082350 samples, validate on 51152 samples
 156480/1082350 [===>..........................] - ETA: 14:10 - loss: 1.3974 - accuracy: 0.5129 - mae: 0.1544

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

accuracyName = 'accuracyMiddlewindowQ2W19Model2.png'
lossName = 'lossMiddlewindowQ2W19Model2.png'

# Plot training & validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
#plt.savefig(accuracyName)
plt.show()


# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
#plt.savefig(lossName)
plt.show()




In [None]:
import pickle

file_name = "19-11-3-doubleFull"
with open(file_name, 'wb') as file_pi:
        pickle.dump(history.history, file_pi)

In [None]:
from keras.models import load_model
m.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'

In [21]:
X_test = changeQ8Class(X_test, q8toQ8, numberOfFeatures)
X_test_window = proteinSequenceToWindowSequence(windowSize,predictionIndex, X_test, classSize)
X_test_window = removeWindowsWithPadding(X_test_window , windowSize, numberOfFeatures)
print(X_test_window.shape)


0
100
200
300
400
500
(80119, 19, 50)


In [None]:
c = get_tapped_one_dataset(X_test_window, windowSize, classSize)
print(c.shape)

In [22]:

x_test_final = X_test_window[:,:,(21+classSize):]
#x_test_final = X_test_window[:,:,0:21]
y_test_final = X_test_window[:,:,21: (21+classSize)]
print(x_test_final.shape)
print(y_test_final.shape)

(80119, 19, 21)
(80119, 19, 8)


In [None]:
print(x_train_final[1])


In [None]:
result = m.predict(x_train_final[0:1])
result.reshape(19,8)
print(result[0][5].sum())

In [23]:
scores = m.evaluate(x_test_final, y_test_final)
print("Loss: " + str(scores[0]) + ", Accuracy: " + str(scores[1]) + ", MAE: " + str(scores[2]))
print("yes boi")

Loss: 4.2068291160391045, Accuracy: 0.3433938, MAE: 0.1668269
yes boi
