Loading the various libraries and modules needed

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras import regularizers

import sklearn as sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

Defining a function for loading the data as numpy arrays with each position one-hot-encoded

In [2]:
def loadOneHotEncoded(filename, size):
    """Read sequences from a text file and one-hot encode 140 positions of each.

    Lines 1-4 of the file are skipped. Every line from line 5 up to, but not
    including, line ``size`` is treated as one sequence. For each of them the
    140 characters immediately preceding the last character of the line
    (normally the newline) are encoded, with columns ordered A, C, G, T.

    Parameters
    ----------
    filename : str
        Path of the sequence file.
    size : int
        Exclusive upper bound on the 1-based line number to read, so
        ``size - 5`` sequences are loaded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(size - 5, 140, 4)``, or an empty 1-D array when no
        sequence line falls inside the requested range.

    Raises
    ------
    ValueError
        If a sequence line has fewer than 141 characters (this includes
        reaching the end of the file early) or contains a character other
        than A, C, G or T inside the encoded window.
    """
    seqLength = 140
    alphabet = ['A', 'C', 'G', 'T']
    encodedRows = []
    # The context manager guarantees the file is closed, even on error.
    with open(filename, 'r') as inputFile:
        for i in range(1, size):
            content = inputFile.readline()
            if i < 5:
                # Header lines carry no sequence data.
                continue
            if len(content) < seqLength + 1:
                raise ValueError(
                    "line %d of %s has %d characters, expected at least %d"
                    % (i, filename, len(content), seqLength + 1))
            # Last 140 characters before the final one (the newline).
            window = list(content[-(seqLength + 1):-1])
            # Appending the full alphabet forces get_dummies to always emit
            # all four columns, also for the first sequence; the padding rows
            # are dropped again by the slice below.
            oneHot = pd.get_dummies(np.array(window + alphabet)).values
            if oneHot.shape[1] != len(alphabet):
                raise ValueError(
                    "line %d of %s contains characters other than A, C, G, T"
                    % (i, filename))
            encodedRows.append(oneHot[:seqLength])
    if not encodedRows:
        return np.array([])
    # Stack once at the end instead of re-allocating with np.append per row.
    return np.stack(encodedRows, axis=0)

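Defining a function for instantiating the four sequence arrays, encoded as one-hot sequences. The numbers are the sample counts; 5 is added because the sequences start on line 5 of each file (after 4 header lines) and the loader's line bound is exclusive.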

In [3]:
def loadSequenceArrays():
    """Load the four sequence files as one-hot encoded arrays.

    Returns
    -------
    tuple
        ``(ei_true, ie_true, ei_false, ie_false)``, each produced by
        ``loadOneHotEncoded`` with a line bound of sample count + 5.
    """
    lineOffset = 5
    eiCount = 2796
    ieCount = 2880
    # (file name, sample count) in the order the arrays are returned
    sources = (
        ('EI_true.seq', eiCount),
        ('IE_true.seq', ieCount),
        ('EI_false.seq', eiCount),
        ('IE_false.seq', ieCount),
    )
    ei_true, ie_true, ei_false, ie_false = [
        loadOneHotEncoded(seqFile, count + lineOffset) for seqFile, count in sources
    ]
    return (ei_true, ie_true, ei_false, ie_false)

Defining a function for creating target arrays, one for each category.

In [4]:
def createTargetArray(eiTrueArray, ieTrueArray, eiFalseArray, ieFalseArray):
    """Build one-hot target arrays matching the true/false predictor arrays.

    Each true sample is labelled ``[1, 0]`` and each false sample ``[0, 1]``,
    with all true rows placed before all false rows.

    Parameters
    ----------
    eiTrueArray, ieTrueArray, eiFalseArray, ieFalseArray : numpy.ndarray
        Predictor arrays; only their first dimension (the sample count) is used.

    Returns
    -------
    tuple of numpy.ndarray
        ``(eiTarget, ieTarget)`` with shapes ``(nTrue + nFalse, 2)`` for the
        respective category.
    """
    def _labels(trueArray, falseArray):
        # np.tile builds every row in one allocation and yields a (0, 2)
        # array for an empty input, so the target length always equals
        # the number of samples.
        positives = np.tile([1, 0], (trueArray.shape[0], 1))
        negatives = np.tile([0, 1], (falseArray.shape[0], 1))
        return np.concatenate([positives, negatives], axis=0)

    eiTarget = _labels(eiTrueArray, eiFalseArray)
    ieTarget = _labels(ieTrueArray, ieFalseArray)
    return (eiTarget, ieTarget)

Defining a function for creating a complete predictor array with all sequences, one for each category

In [5]:
def createPredictorArray(eiTrueArray, ieTrueArray, eiFalseArray, ieFalseArray):
    """Join the true and false sequence arrays of each category.

    Returns
    -------
    tuple of numpy.ndarray
        ``(eiPredictors, iePredictors)``; in each, the true samples come
        first and the false samples follow along axis 0.
    """
    eiPredictors = np.concatenate((eiTrueArray, eiFalseArray), axis=0)
    iePredictors = np.concatenate((ieTrueArray, ieFalseArray), axis=0)
    return (eiPredictors, iePredictors)

Defining a function for wrapping all the previous functions together to create the final predictors and target arrays

In [6]:
def createPredictorsAndTarget():
    """Load all sequence arrays and derive targets and predictors from them.

    Returns
    -------
    tuple
        ``(eiTarget, ieTarget, eiPredictors, iePredictors)``.
    """
    # Order: eiTrue, ieTrue, eiFalse, ieFalse, as expected by both helpers.
    sequenceArrays = loadSequenceArrays()
    eiTarget, ieTarget = createTargetArray(*sequenceArrays)
    eiPredictors, iePredictors = createPredictorArray(*sequenceArrays)
    return (eiTarget, ieTarget, eiPredictors, iePredictors)

Use the createPredictorsAndTarget and SKLearn's train_test_split functions to produce the TRAINING and TEST data partitions, whereafter they are saved to reloadable files (run once).

In [7]:
# One-time data preparation, intentionally left commented out.
# Uncomment and run once to build the arrays, split them 80/20 (stratified on
# the target) and write the .npy files that the next cell loads.
# NOTE(review): train_test_split is called without random_state, so re-running
# this cell produces a different partition each time.
#eiTarget, ieTarget, eiPredictors, iePredictors = createPredictorsAndTarget()
#eiPredictorsTrain, eiPredictorsTest, eiTargetTrain, eiTargetTest = train_test_split(eiPredictors,eiTarget, test_size = 0.2, stratify = eiTarget)
#iePredictorsTrain, iePredictorsTest, ieTargetTrain, ieTargetTest = train_test_split(iePredictors,ieTarget, test_size = 0.2, stratify = ieTarget)
#np.save("eiPredictorsTrain", eiPredictorsTrain)
#np.save("eiPredictorsTest", eiPredictorsTest)
#np.save("eiTargetTrain", eiTargetTrain)
#np.save("eiTargetTest", eiTargetTest)
#np.save("iePredictorsTrain", iePredictorsTrain)
#np.save("iePredictorsTest", iePredictorsTest)
#np.save("ieTargetTrain", ieTargetTrain)
#np.save("ieTargetTest", ieTargetTest)

Load all the previously produced data partitions

In [8]:
# EI partitions: predictors and targets, train and test
eiPredictorsTrain, eiPredictorsTest, eiTargetTrain, eiTargetTest = [
    np.load(partitionFile)
    for partitionFile in (
        "eiPredictorsTrain.npy",
        "eiPredictorsTest.npy",
        "eiTargetTrain.npy",
        "eiTargetTest.npy",
    )
]
# IE partitions: predictors and targets, train and test
iePredictorsTrain, iePredictorsTest, ieTargetTrain, ieTargetTest = [
    np.load(partitionFile)
    for partitionFile in (
        "iePredictorsTrain.npy",
        "iePredictorsTest.npy",
        "ieTargetTrain.npy",
        "ieTargetTest.npy",
    )
]

Defining a function for instantiation of the NN model and execution of the compilation step.

In [9]:
def buildModelAndCompile():
    """Create and compile the fully connected classifier.

    The network flattens a (140, 4) input, applies 20% dropout, then three
    ReLU layers of 150, 100 and 50 units (each followed by 50% dropout) and a
    2-unit softmax output. It is compiled with the Adam optimizer, binary
    cross-entropy loss and the accuracy metric, and its summary is printed.

    Returns
    -------
    The compiled Sequential model.
    """
    # Clear the Keras session before building a new model
    K.clear_session()
    network = Sequential()
    network.add(Flatten(input_shape=(140, 4)))
    network.add(Dropout(0.20))
    # Hidden stack: each dense layer is followed by 50% dropout
    for hiddenUnits in (150, 100, 50):
        network.add(Dense(hiddenUnits, activation = 'relu'))
        network.add(Dropout(0.50))
    network.add(Dense(2, activation = 'softmax'))
    network.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])
    network.summary()
    return network

Defining a function that uses scikit-learn's stratified K-fold indexer to do 10-fold cross-validation, outputting accuracy, MCC and a confusion matrix.

In [10]:
def runKFoldCV(predictors, target):
    """Run stratified 10-fold cross-validation, then fit a model on all data.

    For every fold a fresh model is trained for 40 epochs and evaluated on the
    held-out fold; accuracy, MCC and the confusion matrix are printed. After
    the folds, mean and standard deviation of both metrics are printed and a
    new model is fitted on the complete ``predictors``/``target`` arrays.

    Parameters
    ----------
    predictors : numpy.ndarray
        Input samples, indexed along axis 0.
    target : numpy.ndarray
        Two-column target array; column 0 is used for stratification and
        column 1 for MCC and the confusion matrix.

    Returns
    -------
    tuple
        ``(finalScore, model)`` where ``finalScore`` is a 2x2 array
        ``[[accuracy mean, accuracy std], [MCC mean, MCC std]]``.
    """
    epochNumber = 40
    foldResults = []
    splitter = StratifiedKFold(n_splits=10)
    for trainIdx, testIdx in splitter.split(predictors, target[:,0]):
        xTrain, yTrain = predictors[trainIdx], target[trainIdx]
        xTest, yTest = predictors[testIdx], target[testIdx]
        foldModel = buildModelAndCompile()
        foldModel.fit(xTrain, yTrain, validation_data = (xTest, yTest), epochs = epochNumber, verbose = 1)
        scores = foldModel.evaluate(xTest, yTest)
        # Round the softmax outputs to hard 0/1 class indicators
        predictions = np.round(foldModel.predict(xTest)).astype(int)
        observed = yTest[:,1]
        predicted = predictions[:,1]
        MCC = matthews_corrcoef(observed, predicted)
        print("%s: %.2f%%" % (foldModel.metrics_names[1], scores[1]*100))
        print("%s: %.2f" % ("MCC", MCC))
        print("Confusion Matrix")
        print(confusion_matrix(observed, predicted))
        foldResults.append([scores[1] * 100, MCC])
    foldResults = np.asarray(foldResults)
    accuracies = foldResults[:,0]
    mccValues = foldResults[:,1]
    finalScore = np.asarray([
        [np.mean(accuracies), np.std(accuracies)],
        [np.mean(mccValues), np.std(mccValues)],
    ])
    print("Accuracy: %.2f%% (+/- %.2f%%)" % (finalScore[0,0], finalScore[0,1]))
    print("MCC: %.2f (+/- %.2f)" % (finalScore[1,0], finalScore[1,1]))
    # Final model trained on every sample, silently
    model = buildModelAndCompile()
    model.fit(predictors, target, epochs = epochNumber, verbose = 0)
    return (finalScore, model)

Defining a function for building, compiling, and evaluating a single NN model either with 10-fold CV or using a simple validation split.

In [11]:
def buildCompileFitEvaluate(predictors, target, cv):
    """Build, fit and evaluate one model, with or without cross-validation.

    Parameters
    ----------
    predictors : numpy.ndarray
        Input samples.
    target : numpy.ndarray
        Two-column target array.
    cv : bool
        If true, delegate to ``runKFoldCV``; otherwise train a single model
        for 40 epochs with a 20% validation split.

    Returns
    -------
    tuple
        ``(model, scores)``. Without cross-validation ``scores`` is a (2, 1)
        array ``[[accuracy in %], [MCC]]``; with cross-validation it is the
        score array returned by ``runKFoldCV``.

    Notes
    -----
    In the non-CV branch the printed accuracy, MCC and confusion matrix are
    computed on the same ``predictors``/``target`` that were passed to
    ``fit``, so they are not held-out estimates.
    """
    # Honour the `cv` argument instead of reading the notebook-level
    # `crossValidation` global, so the function works with any caller.
    if not cv:
        model = buildModelAndCompile()
        model.fit(predictors, target, validation_split = 0.2, shuffle=True, epochs = 40)
        evaluation = model.evaluate(predictors, target)
        # Round the softmax outputs to hard 0/1 class indicators
        predictions = np.round(model.predict(predictors)).astype(int)
        MCC = matthews_corrcoef(target[:,1], predictions[:,1])
        scores = np.asarray([[evaluation[1] * 100], [MCC]])
        # Index down to scalars: formatting a size-1 array with %f relies on
        # implicit array-to-scalar conversion, which NumPy has deprecated.
        print("Accuracy: %.2f%%" % scores[0, 0])
        print("MCC: %.2f" % scores[1, 0])
        print("Confusion Matrix")
        print(confusion_matrix(target[:,1], predictions[:,1]))
        return model, scores
    else:
        cvScores, model = runKFoldCV(predictors, target)
        return model, cvScores

Set the global parameters and proceed according to them. The NN model is run on both the donor and acceptor category training data; if the finalRun flag is set to True, the test set is additionally used for test error evaluation and the model is saved.

In [12]:
# Run configuration
finalRun = False
crossValidation = False
score = []

# EI category: always fit on the training partition; on a final run also save
# the model and compute MCC and the confusion matrix on the test partition.
eiModel, eiScore = buildCompileFitEvaluate(eiPredictorsTrain, eiTargetTrain, crossValidation)
if finalRun:
    eiModel.save('Model-6ei-basicdeep.h5')
    eiTestPredictions = np.round(eiModel.predict(eiPredictorsTest)).astype(int)
    eiTestMCC = matthews_corrcoef(eiTargetTest[:,0], eiTestPredictions[:,0])
    eiMatrix = confusion_matrix(eiTargetTest[:,0], eiTestPredictions[:,0])

# IE category: same procedure
ieModel, ieScore = buildCompileFitEvaluate(iePredictorsTrain, ieTargetTrain, crossValidation)
if finalRun:
    ieModel.save('Model-6ie-basicdeep.h5')
    ieTestPredictions = np.round(ieModel.predict(iePredictorsTest)).astype(int)
    ieTestMCC = matthews_corrcoef(ieTargetTest[:,0], ieTestPredictions[:,0])
    ieMatrix = confusion_matrix(ieTargetTest[:,0], ieTestPredictions[:,0])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 560)               0         
_________________________________________________________________
dropout (Dropout)            (None, 560)               0         
_________________________________________________________________
dense (Dense)                (None, 150)               84150     
_________________________________________________________________
dropout_1 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               15100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
__________

Train on 3686 samples, validate on 922 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Accuracy: 97.79%
MCC: 0.96
Confusion Matrix
[[2274   30]
 [  72 2232]]
