In [None]:
!pip install scikit-optimize
!pip install barplots
!pip install extra_keras_metrics
!pip install epigenomic_dataset
!pip install ucsc_genomes_downloader
!pip install keras_bed_sequence
!pip install seaborn

Collecting minepy
  Downloading minepy-1.2.5.tar.gz (495 kB)
[?25l[K     |▋                               | 10 kB 27.6 MB/s eta 0:00:01[K     |█▎                              | 20 kB 20.5 MB/s eta 0:00:01[K     |██                              | 30 kB 10.8 MB/s eta 0:00:01[K     |██▋                             | 40 kB 8.7 MB/s eta 0:00:01[K     |███▎                            | 51 kB 5.0 MB/s eta 0:00:01[K     |████                            | 61 kB 5.6 MB/s eta 0:00:01[K     |████▋                           | 71 kB 5.6 MB/s eta 0:00:01[K     |█████▎                          | 81 kB 6.3 MB/s eta 0:00:01[K     |██████                          | 92 kB 4.9 MB/s eta 0:00:01[K     |██████▋                         | 102 kB 5.3 MB/s eta 0:00:01[K     |███████▎                        | 112 kB 5.3 MB/s eta 0:00:01[K     |████████                        | 122 kB 5.3 MB/s eta 0:00:01[K     |████████▋                       | 133 kB 5.3 MB/s eta 0:00:01[K     |██████

In [None]:
##import silence_tensorflow.auto
from epigenomic_dataset import load_epigenomes
from epigenomic_dataset import active_promoters_vs_inactive_promoters
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from cache_decorator import Cache
from tqdm.keras import TqdmCallback
from barplots import barplots
from ucsc_genomes_downloader import Genome
from keras_bed_sequence import BedSequence
from keras_mixed_sequence import MixedSequence, VectorSequence
import matplotlib.pyplot as plt # A standard plotting library

## Data retrieval
First, we retrieve the data: the hg38 reference genome and the promoter regions of the chosen cell line together with their activity labels.

In [None]:
# Genome object for the hg38 assembly; the cell output shows its chromosomes
# being downloaded and loaded.
genome = Genome("hg38")

Downloading chromosomes for genome hg38:   0%|          | 0/25 [00:00<?, ?it/s]

Loading chromosomes for genome hg38:   0%|          | 0/25 [00:00<?, ?it/s]

In [None]:
# Cell line whose promoters are analysed throughout the notebook.
cell_line = "H1"

# Binarized task: active promoters versus inactive promoters.
X, y = active_promoters_vs_inactive_promoters(binarize=True, cell_line=cell_line)

# Move the index levels into regular columns and keep the first five columns
# as the BED-like table describing the regions.
# NOTE(review): presumably these are the genomic coordinates of each region;
# confirm that exactly five leading columns are wanted.
X = X.reset_index()
bed = X.iloc[:, :5]

Downloading to datasets/fantom/...omoters/H1.csv.xz:   0%|          | 0.00/11.7M [00:00<?, ?iB/s]

Downloading to datasets/fantom/.../promoters.bed.xz:   0%|          | 0.00/1.22M [00:00<?, ?iB/s]

In [None]:
def build_sequence(
    X: pd.DataFrame,
    y: np.ndarray,
    genome: Genome,
    batch_size: int
) -> MixedSequence:
    """Return a MixedSequence pairing genomic sequences with their labels.

    Parameters
    ----------------------
    X: pd.DataFrame,
        Table of regions handed to BedSequence.
    y: np.ndarray,
        Labels handed to VectorSequence.
    genome: Genome,
        The genome object handed to BedSequence.
    batch_size: int,
        Batch size shared by the input and the output sequence.

    Returns
    ----------------------
    MixedSequence whose x is the BedSequence and whose y is the VectorSequence.
    """
    region_sequence = BedSequence(genome, X, batch_size=batch_size)
    label_sequence = VectorSequence(y, batch_size=batch_size)
    return MixedSequence(x=region_sequence, y=label_sequence)

In [None]:
# Iterate over every batch of the sequence and split inputs from outputs.
mixed_sequence = build_sequence(bed, y[cell_line].values, genome, 1024)
batch_inputs, batch_outputs = zip(*mixed_sequence)

# Stack the batches and flatten each 256x4 input into 1024 features.
inputs = np.vstack(batch_inputs).reshape(-1, 256*4)
outputs = np.hstack(batch_outputs)



In [None]:
from sklearn.decomposition import FactorAnalysis

def mfa(x:np.ndarray, n_components:int=2)->np.ndarray:
    """Return the FactorAnalysis decomposition of x.

    Parameters
    ----------------------
    x: np.ndarray,
        Data to fit the factor analysis on and to transform.
    n_components: int = 2,
        Number of components of the decomposition.

    Returns
    ----------------------
    The transformed data, as returned by FactorAnalysis.fit_transform.
    """
    # Fixed random_state, as in every other seeded step of the notebook.
    factor_analysis = FactorAnalysis(n_components=n_components, random_state=42)
    return factor_analysis.fit_transform(x)


In [None]:
from sklearn.decomposition import FactorAnalysis
# One colour per label value: index 0 and index 1.
colors = np.array([
    "tab:blue",
    "tab:orange",
])

# Batch size equal to the number of regions, then flatten every 256x4
# rasterized input into 1024 integer features.
train_sequence = build_sequence(bed, y.values, genome, batch_size=len(bed))
X = train_sequence.rasterize()[0].reshape(-1, 256*4).astype(int)

# The two columns of the decomposition are the scatter coordinates.
decomposition = mfa(X)
xseries = decomposition[:, 0]
yseries = decomposition[:, 1]

# The labels are indexed as a 2-D array here, so the colour of each example is
# the first entry of its row.
tcolors = colors[y.values.astype(int)][:, 0].tolist()

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
axes.scatter(xseries, yseries, s=1, c=tcolors)
axes.set_title("MFA decomposition of DNA sequences")
plt.show()

## Bayesian Optimization for Model Selection

In [None]:
import skopt
from skopt import gp_minimize, forest_minimize
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_convergence
from skopt.plots import plot_objective, plot_evaluations
from skopt.plots import plot_objective
from skopt.utils import use_named_args

In [None]:
# Search space of the hyper-parameter optimization: every dimension is a
# categorical choice among a few candidate values.
dim_num_conv_units_2 = Categorical(name='num_conv_units_2', categories=[32, 64, 128])
dim_num_dense_nodes_1 = Categorical(name='num_dense_nodes_1', categories=[16, 32, 64])
dim_num_dense_nodes_2 = Categorical(name='num_dense_nodes_2', categories=[16, 32, 64])
dim_kernel_size_2 = Categorical(name='kernel_size_2', categories=[5, 10])
dim_dropout_prob = Categorical(name='dropout_prob', categories=[0.1, 0.2, 0.3, 0.4, 0.5])

# The order of this list is the order of the values in every evaluated point
# (note that num_dense_nodes_2 comes before num_dense_nodes_1).
dimensions = [
    dim_num_conv_units_2,
    dim_num_dense_nodes_2,
    dim_num_dense_nodes_1,
    dim_kernel_size_2,
    dim_dropout_prob,
]

In [None]:
from tensorflow.keras.layers import Dense, Input, Conv1D, Conv2D, Reshape, Flatten, MaxPool1D, MaxPool2D, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import AUC, Accuracy
from tensorflow.keras.optimizers import Nadam

def create_model(    
    num_conv_units_2,           
    num_dense_nodes_2,
    num_dense_nodes_1,
    kernel_size_2,
    dropout_prob):
    """Build and compile the CNN used by the hyper-parameter search.

    Parameters
    ----------------------
    num_conv_units_2,
        Number of filters of the last convolutional layer.
    num_dense_nodes_2,
        Number of units of the first dense layer.
    num_dense_nodes_1,
        Number of units of the second dense layer.
    kernel_size_2,
        Kernel size of the last convolutional layer (cast to int).
    dropout_prob,
        Dropout rate applied after each of the two dense layers.

    Returns
    ----------------------
    The compiled Sequential model.
    """
    # NOTE(review): here the three initial convolutions share a single pooling
    # step, while train_cnn pools after each of them; confirm this is intended.
    layers = [Input((256, 4))]
    for _ in range(3):
        layers.append(Conv1D(64, kernel_size=5, activation="relu", padding="same"))
        layers.append(BatchNormalization())
    layers.extend([
        MaxPool1D(),
        Conv1D(num_conv_units_2, kernel_size=int(kernel_size_2), activation="relu", padding="same"),
        BatchNormalization(),
        MaxPool1D(),
        Flatten(),
        Dense(num_dense_nodes_2, activation="relu"),
        Dropout(dropout_prob),
        Dense(num_dense_nodes_1, activation="relu"),
        Dropout(dropout_prob),
        Dense(1, activation="sigmoid"),
    ])

    model = Sequential(layers)

    tracked_metrics = [
        'accuracy',
        AUC(curve='ROC', name='AUROC'),
        AUC(curve='PR', name='AUPRC'),
    ]
    model.compile(
        loss="binary_crossentropy",
        optimizer=Nadam(0.0002),
        metrics=tracked_metrics
    )

    return model

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified split with 20% of the examples held out.
number_of_splits = 1
holdouts_generator = StratifiedShuffleSplit(
    n_splits=number_of_splits,
    test_size=0.2,
    random_state=42
)

# Only one split is generated, so it is taken directly instead of looping.
holdout_number, (train_indices, test_indices) = next(
    enumerate(holdouts_generator.split(bed, y))
)
X_train = bed.iloc[train_indices]
X_test = bed.iloc[test_indices]
y_train = y.iloc[train_indices]
y_test = y.iloc[test_indices]

# Sequences read as globals by the objective function of the search.
train_sequence = build_sequence(X_train, y_train.values, genome, batch_size=128)
test_sequence = build_sequence(X_test, y_test.values, genome, batch_size=128)

In [None]:
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping

@use_named_args(dimensions=dimensions)
def fitness(num_conv_units_2, num_dense_nodes_2, num_dense_nodes_1, kernel_size_2, dropout_prob):
    """Objective function minimized by the hyper-parameter search.

    A model is created with the given hyper-parameters, trained on the global
    train_sequence with early stopping on the validation loss computed on the
    global test_sequence, and scored with its best validation loss.

    Parameters
    ----------------------
    num_conv_units_2,
        Number of filters of the last convolutional layer.
    num_dense_nodes_2,
        Number of units of the first dense layer.
    num_dense_nodes_1,
        Number of units of the second dense layer.
    kernel_size_2,
        Kernel size of the last convolutional layer.
    dropout_prob,
        Dropout rate of the dense part of the model.

    Returns
    ----------------------
    The lowest validation loss reached during the training.
    """
    # NOTE(review): the validation data is the global test_sequence; if the same
    # split is also used to report the final performance, the hyper-parameters
    # are tuned on it. Consider a dedicated validation split.

    # Print the hyper-parameters.
    print('num_conv_units_2:', num_conv_units_2)
    print('num_dense_nodes_2:', num_dense_nodes_2)
    print('num_dense_nodes_1:', num_dense_nodes_1)
    print('kernel_size_2:', kernel_size_2)
    print('dropout_prob:', dropout_prob)
    
    # Create the neural network with these hyper-parameters.
    model = create_model(num_conv_units_2=num_conv_units_2,
                         num_dense_nodes_2=num_dense_nodes_2,
                         num_dense_nodes_1=num_dense_nodes_1,
                         kernel_size_2=kernel_size_2,
                         dropout_prob = dropout_prob)

    # Training phase
    history = model.fit(train_sequence,
                        epochs=100,
                        validation_data=test_sequence,
                        verbose=2,
                        callbacks = [EarlyStopping(monitor = "val_loss", patience = 3)])

    # Early stopping halts only after `patience` epochs without improvement, so
    # the last recorded validation loss is generally worse than the best one.
    # The configuration is scored with the best epoch instead of the last.
    val_loss = min(history.history['val_loss'])

    # Print the best validation loss (a loss value, not a percentage).
    print()
    print("val_loss: {0:.4f}".format(val_loss))
    print()

    # Release the model and the Keras session before the next evaluation.
    del model
    
    K.clear_session()
    
    # Scikit-optimize does minimization, so the loss is returned as it is:
    # the lower the validation loss, the better the configuration.
    return val_loss

In [None]:
%%time

# Starting point of the search, in the same order as `dimensions`:
# num_conv_units_2, num_dense_nodes_2, num_dense_nodes_1, kernel_size_2, dropout_prob.
default_parameters = [64, 64, 64, 10, 0.1]

# random_state makes the points sampled by the optimizer reproducible; the
# training inside the objective function is not seeded by it.
search_result = gp_minimize(func=fitness,
                            dimensions=dimensions,
                            n_calls=40,
                            x0=default_parameters,
                            random_state=42)

num_conv_units_2: 64
num_dense_nodes_2: 64
num_dense_nodes_1: 64
kernel_size_2: 10
dropout_prob: 0.1
Epoch 1/100
625/625 - 57s - loss: 0.5221 - accuracy: 0.7391 - AUROC: 0.7026 - AUPRC: 0.4058 - val_loss: 0.5387 - val_accuracy: 0.7297 - val_AUROC: 0.7327 - val_AUPRC: 0.4303
Epoch 2/100
625/625 - 23s - loss: 0.4744 - accuracy: 0.7614 - AUROC: 0.7765 - AUPRC: 0.5132 - val_loss: 0.4841 - val_accuracy: 0.7575 - val_AUROC: 0.7662 - val_AUPRC: 0.4968
Epoch 3/100
625/625 - 23s - loss: 0.4420 - accuracy: 0.7874 - AUROC: 0.8139 - AUPRC: 0.5969 - val_loss: 0.4885 - val_accuracy: 0.7605 - val_AUROC: 0.7638 - val_AUPRC: 0.4979
Epoch 4/100
625/625 - 23s - loss: 0.4015 - accuracy: 0.8151 - AUROC: 0.8524 - AUPRC: 0.6778 - val_loss: 0.5122 - val_accuracy: 0.7504 - val_AUROC: 0.7446 - val_AUPRC: 0.4836
Epoch 5/100
625/625 - 23s - loss: 0.3452 - accuracy: 0.8471 - AUROC: 0.8953 - AUPRC: 0.7684 - val_loss: 0.5496 - val_accuracy: 0.7557 - val_AUROC: 0.7479 - val_AUPRC: 0.4805

val_loss: 54.96%

num_conv_u



num_conv_units_2: 32
num_dense_nodes_2: 16
num_dense_nodes_1: 64
kernel_size_2: 10
dropout_prob: 0.5
Epoch 1/100
625/625 - 27s - loss: 0.5892 - accuracy: 0.7171 - AUROC: 0.6001 - AUPRC: 0.2994 - val_loss: 0.5173 - val_accuracy: 0.7441 - val_AUROC: 0.7157 - val_AUPRC: 0.3977
Epoch 2/100
625/625 - 22s - loss: 0.5288 - accuracy: 0.7434 - AUROC: 0.6834 - AUPRC: 0.3585 - val_loss: 0.5015 - val_accuracy: 0.7441 - val_AUROC: 0.7337 - val_AUPRC: 0.4222
Epoch 3/100
625/625 - 22s - loss: 0.5146 - accuracy: 0.7438 - AUROC: 0.7062 - AUPRC: 0.3843 - val_loss: 0.4993 - val_accuracy: 0.7441 - val_AUROC: 0.7360 - val_AUPRC: 0.4238
Epoch 4/100
625/625 - 23s - loss: 0.5053 - accuracy: 0.7441 - AUROC: 0.7191 - AUPRC: 0.3938 - val_loss: 0.5027 - val_accuracy: 0.7441 - val_AUROC: 0.7370 - val_AUPRC: 0.4194
Epoch 5/100
625/625 - 23s - loss: 0.4969 - accuracy: 0.7440 - AUROC: 0.7313 - AUPRC: 0.4078 - val_loss: 0.4958 - val_accuracy: 0.7441 - val_AUROC: 0.7437 - val_AUPRC: 0.4322
Epoch 6/100
625/625 - 23s - l



num_conv_units_2: 32
num_dense_nodes_2: 32
num_dense_nodes_1: 64
kernel_size_2: 5
dropout_prob: 0.5
Epoch 1/100
625/625 - 26s - loss: 0.5996 - accuracy: 0.7201 - AUROC: 0.5667 - AUPRC: 0.2854 - val_loss: 0.5281 - val_accuracy: 0.7441 - val_AUROC: 0.7033 - val_AUPRC: 0.3962
Epoch 2/100
625/625 - 21s - loss: 0.5229 - accuracy: 0.7435 - AUROC: 0.7046 - AUPRC: 0.3850 - val_loss: 0.5044 - val_accuracy: 0.7441 - val_AUROC: 0.7335 - val_AUPRC: 0.4213
Epoch 3/100
625/625 - 21s - loss: 0.5065 - accuracy: 0.7439 - AUROC: 0.7317 - AUPRC: 0.4171 - val_loss: 0.4981 - val_accuracy: 0.7441 - val_AUROC: 0.7429 - val_AUPRC: 0.4377
Epoch 4/100
625/625 - 21s - loss: 0.4956 - accuracy: 0.7440 - AUROC: 0.7462 - AUPRC: 0.4334 - val_loss: 0.4987 - val_accuracy: 0.7441 - val_AUROC: 0.7480 - val_AUPRC: 0.4432
Epoch 5/100
625/625 - 21s - loss: 0.4884 - accuracy: 0.7441 - AUROC: 0.7558 - AUPRC: 0.4462 - val_loss: 0.5037 - val_accuracy: 0.7441 - val_AUROC: 0.7488 - val_AUPRC: 0.4435
Epoch 6/100
625/625 - 21s - lo



num_conv_units_2: 32
num_dense_nodes_2: 16
num_dense_nodes_1: 32
kernel_size_2: 5
dropout_prob: 0.5
Epoch 1/100
625/625 - 25s - loss: 0.6137 - accuracy: 0.7086 - AUROC: 0.5623 - AUPRC: 0.2759 - val_loss: 0.5193 - val_accuracy: 0.7441 - val_AUROC: 0.7105 - val_AUPRC: 0.3928
Epoch 2/100
625/625 - 21s - loss: 0.5393 - accuracy: 0.7409 - AUROC: 0.6681 - AUPRC: 0.3416 - val_loss: 0.5087 - val_accuracy: 0.7441 - val_AUROC: 0.7216 - val_AUPRC: 0.3997
Epoch 3/100
625/625 - 21s - loss: 0.5226 - accuracy: 0.7432 - AUROC: 0.6952 - AUPRC: 0.3654 - val_loss: 0.5022 - val_accuracy: 0.7441 - val_AUROC: 0.7328 - val_AUPRC: 0.4153
Epoch 4/100
625/625 - 21s - loss: 0.5136 - accuracy: 0.7434 - AUROC: 0.7120 - AUPRC: 0.3845 - val_loss: 0.5053 - val_accuracy: 0.7441 - val_AUROC: 0.7328 - val_AUPRC: 0.4159
Epoch 5/100
625/625 - 22s - loss: 0.5045 - accuracy: 0.7438 - AUROC: 0.7271 - AUPRC: 0.4023 - val_loss: 0.5016 - val_accuracy: 0.7441 - val_AUROC: 0.7418 - val_AUPRC: 0.4320
Epoch 6/100
625/625 - 21s - lo

In [None]:
# Pair every objective value with its configuration, lowest value first, and
# store the ranking on disk.
ranked_evaluations = sorted(zip(search_result.func_vals, search_result.x_iters))
results = pd.DataFrame(ranked_evaluations)
results.to_csv("best_models_promoters_1.0_cnn.csv")

In [None]:
from ast import literal_eval
results = pd.read_csv("best_models_promoters_1.0_cnn.csv")
# Column "1" stores each configuration as the text of a Python list; the first
# row is the best one, assuming the file was saved sorted by objective value.
best_configuration_text = results.loc[0, "1"]
best_configuration = literal_eval(best_configuration_text)

In [None]:
# The values follow the order of the search dimensions:
# conv units, first dense layer, second dense layer, kernel size, dropout.
(
    op_num_conv_units_2,
    op_num_of_nodes_2,
    op_num_of_nodes_1,
    op_kernel_size_2,
    op_dropout_prob,
) = best_configuration[:5]

In [None]:
from tensorflow.keras.layers import Dense, Input, Conv1D, Conv2D, Reshape, Flatten, MaxPool1D, MaxPool2D, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Nadam
from extra_keras_metrics import get_standard_binary_metrics

def train_cnn(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: np.ndarray,
    y_test: np.ndarray,
    genome: Genome,
    batch_size: int,
    holdout_number: int
):
    """Train a CNN on one holdout and return its history and performance.

    The tunable parts of the architecture are read from the global variables
    op_num_conv_units_2, op_kernel_size_2, op_num_of_nodes_2,
    op_num_of_nodes_1 and op_dropout_prob.

    Parameters
    ----------------------
    X_train: pd.DataFrame,
        Data reserved for the input during training of the model.
    X_test: pd.DataFrame,
        Data reserved for the input during test of the model.
    y_train: np.ndarray,
        Data reserved for the output during training of the model.
    y_test: np.ndarray,
        Data reserved for the output during test of the model.
    genome: Genome,
        The genome object to use.
    batch_size: int,
        Batch size of the training and test sequences.
    holdout_number: int,
        Number of the holdout.

    Returns
    ----------------------
    Tuple with the training history as a DataFrame and a DataFrame with one
    row for the train evaluation and one row for the test evaluation.
    """
    train_sequence = build_sequence(X_train, y_train, genome, batch_size=batch_size)
    test_sequence = build_sequence(X_test, y_test, genome, batch_size=batch_size)

    # NOTE(review): create_model, used by the hyper-parameter search, pools only
    # once after its three initial convolutions, while this model pools after
    # each of them; confirm the tuned values are meant to transfer.
    layers = [Input((256, 4))]
    for _ in range(3):
        layers.append(Conv1D(64, kernel_size=5, activation="relu", padding="same"))
        layers.append(BatchNormalization())
        layers.append(MaxPool1D())
    layers.extend([
        Conv1D(op_num_conv_units_2, kernel_size=int(op_kernel_size_2), activation="relu", padding="same"),
        BatchNormalization(),
        MaxPool1D(),
        Flatten(),
        Dense(op_num_of_nodes_2, activation="relu"),
        Dropout(op_dropout_prob),
        Dense(op_num_of_nodes_1, activation="relu"),
        Dropout(op_dropout_prob),
        Dense(1, activation="sigmoid"),
    ])
    cnn = Sequential(layers)
    cnn.compile(
        loss="binary_crossentropy",
        optimizer=Nadam(0.0002),
        metrics=get_standard_binary_metrics()
    )

    cnn.summary()

    training_callbacks = [
        EarlyStopping(monitor = "val_loss", patience = 3),
        # Progress bar of the training: it is active, remove it to silence
        # the main experiment loop.
        TqdmCallback(verbose=1)
    ]
    fit_result = cnn.fit(
        train_sequence,
        validation_data=test_sequence,
        epochs=100,
        verbose=False,
        callbacks=training_callbacks
    )
    history = pd.DataFrame(fit_result.history)

    # Evaluate on the train data first and on the test data second, labelling
    # each row with its run type, the model name and the holdout number.
    evaluation_rows = []
    for run_type, sequence in (("train", train_sequence), ("test", test_sequence)):
        evaluation = dict(zip(cnn.metrics_names, cnn.evaluate(sequence, verbose=False)))
        evaluation["run_type"] = run_type
        evaluation["model_name"] = "CNN"
        evaluation["holdout_number"] = holdout_number
        evaluation_rows.append(evaluation)

    evaluations = pd.DataFrame(evaluation_rows)

    return history, evaluations

### Training of the model

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# The performance of every holdout is collected in this list
all_performance = []
number_of_splits = 10
holdouts_generator = StratifiedShuffleSplit(
    n_splits=number_of_splits,
    test_size=0.2,
    random_state=42
)
numbered_holdouts = enumerate(holdouts_generator.split(bed, y))

# Main loop: one model is trained and evaluated for every holdout
for holdout_number, (train_indices, test_indices) in tqdm(
    numbered_holdouts,
    total=number_of_splits,
    desc="Computing holdouts"
):
    X_train = bed.iloc[train_indices]
    X_test = bed.iloc[test_indices]
    y_train = y.iloc[train_indices]
    y_test = y.iloc[test_indices]
    # Train on this holdout; only the performance is kept
    history, performance = train_cnn(
        X_train,
        X_test,
        y_train.values,
        y_test.values,
        genome,
        batch_size=128,
        holdout_number=holdout_number
    )
    all_performance.append(performance)


# The list of per-holdout frames becomes a single DataFrame
all_performance = pd.concat(all_performance)

Computing holdouts:   0%|          | 0/10 [00:00<?, ?it/s]

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 256, 64)           1344      
_________________________________________________________________
batch_normalization (BatchNo (None, 256, 64)           256       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 128, 64)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 128, 64)           20544     
_________________________________________________________________
batch_normalization_1 (Batch (None, 128, 64)           256       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 64, 64)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 64, 64)            2

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_4 (Conv1D)            (None, 256, 64)           1344      
_________________________________________________________________
batch_normalization_4 (Batch (None, 256, 64)           256       
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 128, 64)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 128, 64)           20544     
_________________________________________________________________
batch_normalization_5 (Batch (None, 128, 64)           256       
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 64, 64)            0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 64, 64)           

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_8 (Conv1D)            (None, 256, 64)           1344      
_________________________________________________________________
batch_normalization_8 (Batch (None, 256, 64)           256       
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 128, 64)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 128, 64)           20544     
_________________________________________________________________
batch_normalization_9 (Batch (None, 128, 64)           256       
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 64, 64)            0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 64, 64)           

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_12 (Conv1D)           (None, 256, 64)           1344      
_________________________________________________________________
batch_normalization_12 (Batc (None, 256, 64)           256       
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 128, 64)           0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 128, 64)           20544     
_________________________________________________________________
batch_normalization_13 (Batc (None, 128, 64)           256       
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 64, 64)            0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 64, 64)           

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_16 (Conv1D)           (None, 256, 64)           1344      
_________________________________________________________________
batch_normalization_16 (Batc (None, 256, 64)           256       
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 128, 64)           0         
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 128, 64)           20544     
_________________________________________________________________
batch_normalization_17 (Batc (None, 128, 64)           256       
_________________________________________________________________
max_pooling1d_17 (MaxPooling (None, 64, 64)            0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 64, 64)           

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_20 (Conv1D)           (None, 256, 64)           1344      
_________________________________________________________________
batch_normalization_20 (Batc (None, 256, 64)           256       
_________________________________________________________________
max_pooling1d_20 (MaxPooling (None, 128, 64)           0         
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 128, 64)           20544     
_________________________________________________________________
batch_normalization_21 (Batc (None, 128, 64)           256       
_________________________________________________________________
max_pooling1d_21 (MaxPooling (None, 64, 64)            0         
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 64, 64)           

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_24 (Conv1D)           (None, 256, 64)           1344      
_________________________________________________________________
batch_normalization_24 (Batc (None, 256, 64)           256       
_________________________________________________________________
max_pooling1d_24 (MaxPooling (None, 128, 64)           0         
_________________________________________________________________
conv1d_25 (Conv1D)           (None, 128, 64)           20544     
_________________________________________________________________
batch_normalization_25 (Batc (None, 128, 64)           256       
_________________________________________________________________
max_pooling1d_25 (MaxPooling (None, 64, 64)            0         
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 64, 64)           

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_28 (Conv1D)           (None, 256, 64)           1344      
_________________________________________________________________
batch_normalization_28 (Batc (None, 256, 64)           256       
_________________________________________________________________
max_pooling1d_28 (MaxPooling (None, 128, 64)           0         
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 128, 64)           20544     
_________________________________________________________________
batch_normalization_29 (Batc (None, 128, 64)           256       
_________________________________________________________________
max_pooling1d_29 (MaxPooling (None, 64, 64)            0         
_________________________________________________________________
conv1d_30 (Conv1D)           (None, 64, 64)           

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_32 (Conv1D)           (None, 256, 64)           1344      
_________________________________________________________________
batch_normalization_32 (Batc (None, 256, 64)           256       
_________________________________________________________________
max_pooling1d_32 (MaxPooling (None, 128, 64)           0         
_________________________________________________________________
conv1d_33 (Conv1D)           (None, 128, 64)           20544     
_________________________________________________________________
batch_normalization_33 (Batc (None, 128, 64)           256       
_________________________________________________________________
max_pooling1d_33 (MaxPooling (None, 64, 64)            0         
_________________________________________________________________
conv1d_34 (Conv1D)           (None, 64, 64)           

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_36 (Conv1D)           (None, 256, 64)           1344      
_________________________________________________________________
batch_normalization_36 (Batc (None, 256, 64)           256       
_________________________________________________________________
max_pooling1d_36 (MaxPooling (None, 128, 64)           0         
_________________________________________________________________
conv1d_37 (Conv1D)           (None, 128, 64)           20544     
_________________________________________________________________
batch_normalization_37 (Batc (None, 128, 64)           256       
_________________________________________________________________
max_pooling1d_37 (MaxPooling (None, 64, 64)            0         
_________________________________________________________________
conv1d_38 (Conv1D)           (None, 64, 64)           

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

In [None]:
# Display the train and test evaluations collected for every holdout.
all_performance

Unnamed: 0,loss,accuracy,recall,precision,AUROC,AUPRC,f1_score,balanced_accuracy,specificity,miss_rate,fall_out,mcc,run_type,model_name,holdout_number
0,0.49996,0.744105,0.0,0.0,0.787385,0.469874,0.0,0.5,1.0,1.0,0.0,0.0,train,CNN,0
1,0.519216,0.744106,0.0,0.0,0.739555,0.422866,0.0,0.5,1.0,1.0,0.0,0.0,test,CNN,0
0,0.440245,0.744105,0.0,0.0,0.818044,0.51927,0.0,0.5,1.0,1.0,0.0,0.0,train,CNN,1
1,0.493121,0.744106,0.0,0.0,0.749405,0.45329,0.0,0.5,1.0,1.0,0.0,0.0,test,CNN,1
0,0.435124,0.744105,0.0,0.0,0.827181,0.532153,0.0,0.5,1.0,1.0,0.0,0.0,train,CNN,2
1,0.496205,0.744106,0.0,0.0,0.745166,0.448567,0.0,0.5,1.0,1.0,0.0,0.0,test,CNN,2
0,0.450484,0.744105,0.0,0.0,0.820548,0.521083,0.0,0.5,1.0,1.0,0.0,0.0,train,CNN,3
1,0.488964,0.744106,0.0,0.0,0.75652,0.451381,0.0,0.5,1.0,1.0,0.0,0.0,test,CNN,3
0,0.384097,0.800411,0.302538,0.785723,0.882995,0.694167,0.436864,0.637082,0.971627,0.697462,0.028373,0.401419,train,CNN,4
1,0.497716,0.759173,0.213615,0.579926,0.756849,0.49285,0.312223,0.580201,0.946788,0.786385,0.053212,0.239551,test,CNN,4


In [None]:
# Persist the evaluations so they can be reloaded without re-training.
all_performance.to_csv("all_performances_promoters_1.0_cnn.csv")

In [None]:
# Reload the stored evaluations from disk.
# NOTE(review): the file is written with its index, so an extra unnamed column
# is presumably read back here; the filtering done afterwards drops it.
all_performance = pd.read_csv("all_performances_promoters_1.0_cnn.csv")

In [None]:
# Keep only the labels and the three metrics that are reported.
all_performance = all_performance.filter(items=['model_name', 'run_type', 'accuracy', 'AUROC', 'AUPRC'])
# Average over the holdouts. numeric_only=True leaves the string column
# model_name out of the mean, which recent pandas versions no longer do
# implicitly (they raise instead).
all_performance.groupby(['run_type']).mean(numeric_only=True)

Unnamed: 0_level_0,accuracy,AUROC,AUPRC
run_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,0.746594,0.750162,0.456269
train,0.753285,0.830442,0.551335


In [None]:
# The figures are rendered in the main process: with the multiprocessing pool
# the workers fail to hand the matplotlib figures back under the inline
# notebook backend (see the traceback previously produced by this cell).
barplots(
    all_performance,
    groupby=["model_name", "run_type"],
    orientation="horizontal",
    height=8,
    use_multiprocessing=False
)

Rendering barplots:   0%|          | 0/3 [00:00<?, ?it/s]

  figure.tight_layout()
  figure.tight_layout()
Exception in thread Thread-18:
Traceback (most recent call last):
  File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 470, in _handle_results
    task = get()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 251, in recv
    return _ForkingPickler.loads(buf.getbuffer())
  File "/usr/local/lib/python3.7/dist-packages/matplotlib/figure.py", line 2038, in __setstate__
    mgr = plt._backend_mod.new_figure_manager_given_figure(num, self)
AttributeError: module 'ipykernel.pylab.backend_inline' has no attribute 'new_figure_manager_given_figure'

Process ForkPoolWorker-4:
Process ForkPoolWorker-3:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/us

AssertionError: ignored