In [1]:
import numpy as np
import pandas as pd
np.random.seed(0)
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.utils.vis_utils import model_to_dot
from keras.optimizers import RMSprop
from keras.callbacks import ReduceLROnPlateau
from IPython.display import SVG
random_seed = 0

Using TensorFlow backend.
  return f(*args, **kwds)


## Load the data

In [2]:
dataDir = './../data/'
fileName = 'ExampleRun'
fastaMatrix = np.load(dataDir+'np/'+fileName+'_fastaMatrices.npy')
topologyMatrix = np.load(dataDir+'np/'+fileName+'_topologyMatrices.npy')
distanceMatrix = np.load(dataDir+'np/'+fileName+'_distanceMatrices.npy')

## Basic test train splits

In [3]:
# Test train split Topology.
x_train, x_validation, y_train_topology, y_validation_topology = train_test_split(fastaMatrix, topologyMatrix, test_size = 0.01, random_state=random_seed)
# Hold back some data completely
x_train, x_test, y_train_topology, y_test_topology = train_test_split(x_train, y_train_topology, test_size = 0.1, random_state=random_seed)

# Test train split Distance.
x_train, x_validation, y_train_distance, y_validation_distance = train_test_split(fastaMatrix, distanceMatrix, test_size = 0.01, random_state=random_seed)
# Hold back some data completely
x_train, x_test, y_train_distance, y_test_distance = train_test_split(x_train, y_train_distance, test_size = 0.1, random_state=random_seed)

## Model Definition

In [4]:
x_train.shape[2]

20

In [5]:
# Go with simple model to start: Input --> Conv2D; MaxPool2D; Flatten; Dense --> Out
modelTopo = Sequential()

modelTopo.add(Conv2D(filters = 30, kernel_size = (5,5),padding = 'Same', activation ='relu', input_shape = (x_train.shape[1:4]), data_format="channels_last"))
modelTopo.add(MaxPool2D(pool_size=(2,2)))
modelTopo.add(Flatten())
modelTopo.add(Dense((x_train.shape[1]*x_train.shape[2]), activation = "relu"))
modelTopo.add(Dense(y_train_topology.shape[1], activation = "relu"))

modelDist = Sequential()

modelDist.add(Conv2D(filters = 30, kernel_size = (5,5),padding = 'Same', activation ='relu', input_shape = (x_train.shape[1:4]), data_format="channels_last"))
modelDist.add(MaxPool2D(pool_size=(2,2)))
modelDist.add(Flatten())
modelDist.add(Dense((x_train.shape[1]*x_train.shape[2]), activation = "relu"))
modelDist.add(Dense(y_train_topology.shape[1], activation = "relu"))

In [6]:
# Optimizer.
optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

# Compile.
modelTopo.compile(optimizer = optimizer , loss = "mean_squared_error", metrics=["accuracy"])
modelDist.compile(optimizer = optimizer , loss = "mean_squared_error", metrics=["accuracy"])

# Set a learning rate annealer.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', 
                                            patience=3, 
                                            verbose=1, 
                                            factor=0.5, 
                                            min_lr=0.00001)

batch_size = 86

In [7]:
print("fitting the model on topology")
history = modelTopo.fit(x_train, y_train_topology, batch_size=batch_size, epochs=50, 
          validation_data = (x_test, y_test_topology), verbose = 2)

fitting the model on topology
Train on 891 samples, validate on 99 samples
Epoch 1/50
 - 0s - loss: 4.8267 - acc: 0.0595 - val_loss: 2.9194 - val_acc: 0.1212
Epoch 2/50
 - 0s - loss: 2.9053 - acc: 0.0662 - val_loss: 2.8969 - val_acc: 0.2626
Epoch 3/50
 - 0s - loss: 2.9308 - acc: 0.0707 - val_loss: 2.9580 - val_acc: 0.0707
Epoch 4/50
 - 0s - loss: 2.8835 - acc: 0.0606 - val_loss: 2.8630 - val_acc: 0.0808
Epoch 5/50
 - 0s - loss: 2.9296 - acc: 0.0741 - val_loss: 2.9256 - val_acc: 0.0000e+00
Epoch 6/50
 - 0s - loss: 2.8761 - acc: 0.0359 - val_loss: 2.9177 - val_acc: 0.0808
Epoch 7/50
 - 0s - loss: 2.8903 - acc: 0.0730 - val_loss: 2.8873 - val_acc: 0.0202
Epoch 8/50
 - 0s - loss: 2.8070 - acc: 0.0707 - val_loss: 2.8473 - val_acc: 0.0606
Epoch 9/50
 - 0s - loss: 2.8770 - acc: 0.0786 - val_loss: 2.8946 - val_acc: 0.0202
Epoch 10/50
 - 0s - loss: 2.8356 - acc: 0.0673 - val_loss: 2.9933 - val_acc: 0.0000e+00
Epoch 11/50
 - 0s - loss: 2.8022 - acc: 0.1044 - val_loss: 2.9334 - val_acc: 0.0303
Ep

In [8]:
print("fitting the model on distances")
history = modelDist.fit(x_train, y_train_topology, batch_size=batch_size, epochs=50, 
          validation_data = (x_test, y_test_topology), verbose = 2)

fitting the model on distances
Train on 891 samples, validate on 99 samples
Epoch 1/50
 - 0s - loss: 4.2566 - acc: 0.0247 - val_loss: 2.8321 - val_acc: 0.0707
Epoch 2/50
 - 0s - loss: 2.9315 - acc: 0.0651 - val_loss: 2.7968 - val_acc: 0.0303
Epoch 3/50
 - 0s - loss: 2.9162 - acc: 0.0730 - val_loss: 2.8320 - val_acc: 0.1010
Epoch 4/50
 - 0s - loss: 2.9238 - acc: 0.0730 - val_loss: 2.7668 - val_acc: 0.0303
Epoch 5/50
 - 0s - loss: 2.8859 - acc: 0.0426 - val_loss: 2.7378 - val_acc: 0.0202
Epoch 6/50
 - 0s - loss: 2.9044 - acc: 0.0696 - val_loss: 2.7704 - val_acc: 0.0606
Epoch 7/50
 - 0s - loss: 2.8712 - acc: 0.0629 - val_loss: 2.8165 - val_acc: 0.0303
Epoch 8/50
 - 0s - loss: 2.8742 - acc: 0.0640 - val_loss: 2.7613 - val_acc: 0.0303
Epoch 9/50
 - 0s - loss: 2.8316 - acc: 0.0842 - val_loss: 2.8741 - val_acc: 0.1313
Epoch 10/50
 - 0s - loss: 2.8642 - acc: 0.0920 - val_loss: 2.8526 - val_acc: 0.0808
Epoch 11/50
 - 0s - loss: 2.8122 - acc: 0.0707 - val_loss: 2.7431 - val_acc: 0.0303
Epoch 12/

In [9]:
# Serialize models to JSON
modelJsonTopo = modelTopo.to_json()
with open(dataDir+"keras/modelTopo.json", "w") as json_file:
    json_file.write(modelJsonTopo)
    
modelJsonDist = modelDist.to_json()
with open("data/keras/modelDist.json", "w") as json_file:
    json_file.write(modelJsonDist)

FileNotFoundError: [Errno 2] No such file or directory: 'data/keras/modelTopo.json'

## Make Predictions on the Test Data

In [None]:
distPredictions = []
topoPredictions = []
for x in x_test:
    expandedX = np.expand_dims(x, axis=0)
    topoPrediction = modelTopo.predict(expandedX)
    topoPredictions.append(topoPrediction)
    distPrediction = modelDist.predict(expandedX)
    distPredictions.append(distPrediction)

topoPredictionsNP = np.concatenate(tuple(topoPredictions))
distPredictionsNP = np.concatenate(tuple(distPredictions))

In [None]:
topoPredictionsNP.shape

In [None]:
np.save('data/np/x_test0', x_test[0])