In [2]:
import numpy as np
import os
import subprocess
import theano

In [1]:
#!pip install --upgrade utils

import sys
!conda install --yes --prefix {sys.prefix} theano

#import sys
#!conda install --yes --prefix {sys.prefix} keras

#import sys
#!{sys.executable} -m pip install --upgrade keras


#import sys
#!conda install --yes --prefix {sys.prefix} tensorflow-gpu

#import sys
#!conda install --yes --prefix {sys.prefix} scipy

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [4]:
TRAIN_PATH = 'secondary_proteins_prediction/data/cullpdb+profile_6133_filtered.npy.gz'
TEST_PATH = 'secondary_proteins_prediction/data/cb513+profile_split1.npy.gz'

In [5]:
def load_gz(path):  # load a .npy.gz file
    if path.endswith(".gz"):
        f = open(path, 'rb')
        return np.load(f)
    else:
        return np.load(path)

In [6]:
##### TRAIN DATA #####

def get_train(path=TRAIN_PATH):
    if not os.path.isfile(path):
        print("Train path is not downloaded ...")
        subprocess.call("./download_train.sh", shell=True)
    else:
        print("Train path is downloaded ...")
    print("Loading train data ...")
    X_in = load_gz(path)
    X = np.reshape(X_in, (5534, 700, 57))
    del X_in
    X = X[:, :, :]
    labels = X[:, :, 22:30]
    mask = X[:, :, 30] * -1 + 1

    a = np.arange(0, 21)
    b = np.arange(35, 56)
    c = np.hstack((a, b))
  #  X = X[:, :, c]
    
    # getting meta
    num_seqs = np.size(X, 0)
    seqlen = np.size(X, 1)
    d = np.size(X, 2)
    num_classes = 8

    #### REMAKING LABELS ####
    X = X.astype(theano.config.floatX)
    mask = mask.astype(theano.config.floatX)
    # Dummy -> concat
    vals = np.arange(0, 8)
    labels_new = np.zeros((num_seqs, seqlen))
    for i in range(np.size(labels, axis=0)):
        labels_new[i, :] = np.dot(labels[i, :, :], vals)
    labels_new = labels_new.astype('int32')
    labels = labels_new
    print(labels.shape)

    print("Loading splits ...")
    ##### SPLITS #####
    # getting splits (cannot run before splits are made)
    # split = np.load("data/split.pkl")

    seq_names = np.arange(0, num_seqs)
    # np.random.shuffle(seq_names)

    X_train = X[seq_names[0:5278]]
    X_valid = X[seq_names[5278:5534]]
    labels_train = labels[seq_names[0:5278]]
    labels_valid = labels[seq_names[5278:5534]]
    mask_train = mask[seq_names[0:5278]]
    mask_valid = mask[seq_names[5278:5534]]
    num_seq_train = np.size(X_train, 0)
    num_seq_valid = np.size(X_valid, 0)
    return X_train, X_valid, labels_train, labels_valid, mask_train, \
           mask_valid, num_seq_train

In [7]:
TRAIN_PATH = 'secondary_proteins_prediction/data/cullpdb+profile_6133_filtered.npy.gz'
X_train, X_valid, labels_train, labels_valid, mask_train, mask_valid, _ = get_train(TRAIN_PATH)
print("Done loading train")
print(X_train.shape)

Train path is downloaded ...
Loading train data ...
(5534, 700)
Loading splits ...
Done loading train
(5278, 700, 57)


In [8]:
def get_test(path=TEST_PATH):
    if not os.path.isfile(path):
        subprocess.call("./download_test.sh", shell=True)
    print("Loading test data ...")
    X_test_in = load_gz(path)
    X_test = np.reshape(X_test_in, (514, 700, 57))
    del X_test_in
    X_test = X_test[:, :, :].astype(theano.config.floatX)
    labels_test = X_test[:, :, 22:30].astype('int32')
    mask_test = X_test[:, :, 30].astype(theano.config.floatX) * -1 + 1

    a = np.arange(0, 21)
    b = np.arange(35, 56)
    c = np.hstack((a, b))
   #X_test = X_test[:, :, c]

    # getting meta
    seqlen = np.size(X_test, 1)
    d = np.size(X_test, 2)
    num_classes = 8
    num_seq_test = np.size(X_test, 0)
    del a, b, c

    ## DUMMY -> CONCAT ##
    vals = np.arange(0, 8)
    labels_new = np.zeros((num_seq_test, seqlen))
    for i in range(np.size(labels_test, axis=0)):
        labels_new[i, :] = np.dot(labels_test[i, :, :], vals)
    labels_new = labels_new.astype('int32')
    labels_test = labels_new

    ### ADDING BATCH PADDING ###

    X_add = np.zeros((126, seqlen, d))
    label_add = np.zeros((126, seqlen))
    mask_add = np.zeros((126, seqlen))

    X_test = np.concatenate((X_test, X_add), axis=0).astype(theano.config.floatX)
    labels_test = np.concatenate((labels_test, label_add), axis=0).astype('int32')
    mask_test = np.concatenate((mask_test, mask_add), axis=0).astype(theano.config.floatX)
    return X_test, mask_test, labels_test, num_seq_test

In [9]:
TEST_PATH = 'secondary_proteins_prediction/data/cb513+profile_split1.npy.gz'
X_test, mask_test, labels_test, num_seq_test = get_test(TEST_PATH)
print("Done loading test")
print(X_test.shape)

Loading test data ...
Done loading test
(640, 700, 57)


In [10]:
#To do: Make 1 hot encoded class - from Q8 to Q3
       #Reshape the dataset so it has 700*windowSize seq that map to a class
       #Look inside lasagne to see how we disregard the padding
##

def q8ClassToQ3(q8Labels):
    index = -1
    q3 = np.zeros(3)
    for i in range(np.size(q8Labels)):
        if q8Labels[i] == 1:
            index = i
    #Helix
    if index == 5 or index == 3 or index == 4 : # H ,G, I
        q3[0] = 1   
    #beta    
    if index == 1 or index == 2: # B, E
        q3[1] = 1    
    #coil    
    if index == 7 or index == 6 or index == 0 : # T, S, L
        q3[2] = 1
    return q3

def q8ClassToQ2(q8Labels):
    index = -1
    q2 = np.zeros(2)
    for i in range(np.size(q8Labels)):
        if q8Labels[i] == 1:
            index = i
    #Helix
    if index == 5 or index == 3 or index == 4 : # H ,G, I
        q2[0] = 1
    else:
        q2[1] = 1  
    return q2

def changeQ8Class(dataSet, reductionFunction, numberOfFeatures):

    num_seqs = np.size(dataSet, 0)
    seqlen = np.size(dataSet, 1)
    labels_new = np.zeros((num_seqs, seqlen, numberOfFeatures))

    for i in range(np.size(dataSet, axis=0)):
        for j in range(np.size(dataSet, axis=1)):
            q3OneHot = reductionFunction(dataSet[i, j, 22:30])
            features = np.concatenate((dataSet[i, j, 0:21], q3OneHot), axis=None)
            features = np.concatenate((features, dataSet[i, j, 35:56]), axis=None)
            labels_new[i][j] = features
    return labels_new


In [11]:
def swapClassLabel(features, classLabel, classSize):

    res = np.concatenate((features[0:21], classLabel), axis=None)
    res = np.concatenate((res, features[ (21+classSize) :]), axis=None)
  
    return res

In [12]:
import math

#num_classes should be q8, q3 or maybe q2?
def proteinSequenceToWindowSequence(windowSize, predictionIndex , dataSet, classSize):
   
    num_seqs = np.size(dataSet, 0)
    seqlen = np.size(dataSet, 1)
    features = np.size(dataSet, 2)
    dataSet_new = np.zeros((num_seqs, seqlen - windowSize + 1, windowSize, features))
    
    for i in range(np.size(dataSet, axis=0)):
        for j in range(np.size(dataSet, axis=1) - windowSize + 1):
            classLabel = dataSet[i][j + predictionIndex][21 : (21+classSize) ]
            for k in range(windowSize):
                dataSet_new[i][j][k] = swapClassLabel(dataSet[i][j+k], classLabel, classSize)
            
    return dataSet_new    

In [13]:
def removeWindowsWithPadding(dataSet, windowSize, numberOfFeatures):
    
    dataSet = np.reshape(dataSet, (dataSet.shape[0]*dataSet.shape[1], windowSize, numberOfFeatures))
    dataSet = dataSet[np.count_nonzero( dataSet, axis=(1,2))>(int(windowSize/2)*numberOfFeatures), :, :] 
    
    return dataSet

In [14]:
windowSize = 7
predictionIndex = 3
classSize = 2  # 2 or 3 
numberOfFeatures = 44

In [15]:
print(X_train.shape)
X_train = changeQ8Class(X_train, q8ClassToQ2, numberOfFeatures)
print(X_train.shape, "changed train data to class of size ", classSize)
X_train_window = proteinSequenceToWindowSequence(windowSize,predictionIndex, X_train, classSize)
print(X_train_window.shape, "changed train data  to window sequence of size ", windowSize)
X_train_window = removeWindowsWithPadding(X_train_window , windowSize, numberOfFeatures)
print(X_train_window.shape, "filtered windows withouth padding of train data ")

print(X_valid.shape)
X_valid = changeQ8Class(X_valid, q8ClassToQ2, numberOfFeatures)
print(X_valid.shape, "changed validation data to class size ", classSize)
X_valid_window = proteinSequenceToWindowSequence(windowSize,predictionIndex, X_valid, classSize)
print(X_valid_window.shape, "changed validation data to window sequence of size ", windowSize)
X_valid_window = removeWindowsWithPadding(X_valid_window , windowSize, numberOfFeatures)
print(X_valid_window.shape, "filtered windows withouth padding of validation data")





(5278, 700, 57)
(5278, 700, 44) changed train data to class of size  2
(5278, 694, 7, 44) changed train data  to window sequence of size  7
(1103472, 7, 44) filtered windows withouth padding of train data 
(256, 700, 57)
(256, 700, 44) changed validation data to class size  2
(256, 694, 7, 44) changed validation data to window sequence of size  7
(52176, 7, 44) filtered windows withouth padding of validation data


In [16]:
sequence_len = 700
amino_acid_residues = 21
num_classes = 2

In [17]:

x_train_final = X_train_window[:,:,0:21]
y_train_final = X_train_window[:,:,21 : (21+classSize)]
print(x_train_final.shape, "training data")
print(y_train_final.shape, "labels for training data")

x_valid_final = X_valid_window[:,:,0:21]
y_valid_final = X_valid_window[:,:,21 : (21+classSize)]
print(x_valid_final.shape, "validation data")
print(y_valid_final.shape, "labels for training validation")


(1103472, 7, 21) training data
(1103472, 7, 2) labels for training data
(52176, 7, 21) validation data
(52176, 7, 2) labels for training validation


In [18]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

Using TensorFlow backend.
W0128 19:31:42.219681 12732 deprecation_wrapper.py:119] From C:\Apps\Anaconda3\envs\gpu-cuda10\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0128 19:31:42.219681 12732 deprecation_wrapper.py:119] From C:\Apps\Anaconda3\envs\gpu-cuda10\lib\site-packages\keras\backend\tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W0128 19:31:42.220681 12732 deprecation_wrapper.py:119] From C:\Apps\Anaconda3\envs\gpu-cuda10\lib\site-packages\keras\backend\tensorflow_backend.py:186: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

W0128 19:31:44.429045 12732 deprecation_wrapper.py:119] From C:\Apps\Anaconda3\envs\gpu-cuda10\lib\site-packages\keras\backend\tensorflow_backend.py:190: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



['/job:localhost/replica:0/task:0/device:GPU:0']

In [19]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv1D, AveragePooling1D, MaxPooling1D, TimeDistributed, LeakyReLU, BatchNormalization, Flatten
from keras import optimizers, callbacks
from keras.regularizers import l2

LR = 0.0005
drop_out = 0.3
batch_dim = 64
nn_epochs = 20

loss = 'categorical_crossentropy'


m = Sequential()
m.add(Conv1D(128, 11, padding='same', activation='relu', input_shape=(windowSize, amino_acid_residues)))
m.add(Dropout(drop_out))
m.add(Conv1D(64, 11, padding='same', activation='relu'))
m.add(Dropout(drop_out))
m.add(Conv1D(num_classes, 11, padding='same', activation='softmax'))
opt = optimizers.Adam(lr=LR)
m.compile(optimizer=opt, loss=loss,metrics=['accuracy', 'mae'])

print("\nHyper Parameters\n")
print("Learning Rate: " + str(LR))
print("Drop out: " + str(drop_out))
print("Batch dim: " + str(batch_dim))
print("Number of epochs: " + str(nn_epochs))
print("\nLoss: " + loss + "\n")
m.summary()

W0128 19:31:49.362160 12732 deprecation_wrapper.py:119] From C:\Apps\Anaconda3\envs\gpu-cuda10\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0128 19:31:49.389091 12732 deprecation.py:506] From C:\Apps\Anaconda3\envs\gpu-cuda10\lib\site-packages\keras\backend\tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0128 19:31:49.460906 12732 deprecation_wrapper.py:119] From C:\Apps\Anaconda3\envs\gpu-cuda10\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.




Hyper Parameters

Learning Rate: 0.0005
Drop out: 0.3
Batch dim: 64
Number of epochs: 20

Loss: categorical_crossentropy

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 7, 128)            29696     
_________________________________________________________________
dropout_1 (Dropout)          (None, 7, 128)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 7, 64)             90176     
_________________________________________________________________
dropout_2 (Dropout)          (None, 7, 64)             0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 7, 2)              1410      
Total params: 121,282
Trainable params: 121,282
Non-trainable params: 0
_________________________________________________________________


In [None]:
from time import time
from timeit import default_timer as timer

start_time = timer()
history = m.fit(x_train_final, y_train_final, epochs=nn_epochs, batch_size=batch_dim, validation_data=(x_valid_final, y_valid_final) ,shuffle=True)

end_time = timer()
print("\n\nTime elapsed: " + "{0:.2f}".format((end_time - start_time)) + " s")

W0128 19:32:19.397916 12732 deprecation.py:323] From C:\Apps\Anaconda3\envs\gpu-cuda10\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 1103472 samples, validate on 52176 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20

In [17]:
import matplotlib.pyplot as plt

# Plot training & validation accuracy values
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.savefig('accuracyRight.png')
plt.show()


# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.savefig('lossRight.png')
plt.show()


ModuleNotFoundError: No module named 'matplotlib'

In [18]:
import pickle

pickle_out = open("windowSize7RightHistory.pickle","wb")
pickle.dump(history, pickle_out)
pickle_out.close()

pkl_filename = "windowSize7RightModel.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(m, file)

NameError: name 'history' is not defined

In [20]:
X_test = q8ClassLabelsToQ3(X_test)
X_test_window = proteinSequenceToWindowSequence(windowSize,predictionIndex, X_test)
X_test_window = removeDuplicatesWindowDataSet(X_test_window , windowSize)
print(X_train_window.shape)

x_test_final = X_test_window[:,:,0:21]
y_test_final = X_test_window[:,:,21:24]
print(x_test_final.shape)
print(y_test_final.shape)

NameError: name 'q8ClassLabelsToQ3' is not defined

In [None]:
scores = m.evaluate(x_test_final, y_test_final)
print("Loss: " + str(scores[0]) + ", Accuracy: " + str(scores[1]) + ", MAE: " + str(scores[2]))