In [None]:
!pip install minepy
!pip install barplots
!pip install extra_keras_metrics
!pip install epigenomic_dataset
!pip install ucsc_genomes_downloader
!pip install keras_bed_sequence

Collecting minepy
  Downloading minepy-1.2.5.tar.gz (495 kB)
[?25l[K     |▋                               | 10 kB 25.4 MB/s eta 0:00:01[K     |█▎                              | 20 kB 8.7 MB/s eta 0:00:01[K     |██                              | 30 kB 7.4 MB/s eta 0:00:01[K     |██▋                             | 40 kB 6.9 MB/s eta 0:00:01[K     |███▎                            | 51 kB 5.1 MB/s eta 0:00:01[K     |████                            | 61 kB 5.2 MB/s eta 0:00:01[K     |████▋                           | 71 kB 5.3 MB/s eta 0:00:01[K     |█████▎                          | 81 kB 5.9 MB/s eta 0:00:01[K     |██████                          | 92 kB 4.7 MB/s eta 0:00:01[K     |██████▋                         | 102 kB 5.0 MB/s eta 0:00:01[K     |███████▎                        | 112 kB 5.0 MB/s eta 0:00:01[K     |████████                        | 122 kB 5.0 MB/s eta 0:00:01[K     |████████▋                       | 133 kB 5.0 MB/s eta 0:00:01[K     |████████

In [None]:
from epigenomic_dataset import active_promoters_vs_inactive_promoters
from epigenomic_dataset.utils import normalize_epigenomic_data
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from cache_decorator import Cache
from tqdm.keras import TqdmCallback
from barplots import barplots
from keras_mixed_sequence import MixedSequence, VectorSequence
from keras_bed_sequence import BedSequence
from ucsc_genomes_downloader import Genome

## Data retrieval
First, we retrieve the data and impute and scale them.

In [None]:
cell_line = "H1"
window_size = 256
task = "active_promoters_vs_inactive_promoters"

X, y = active_promoters_vs_inactive_promoters(
    cell_line=cell_line,
    window_size=window_size,
    binarize = True,
    min_active_tpm_value = 0.5,
    max_inactive_tpm_value = 0.5
)

genome = Genome("hg38")

bed_X = X.reset_index()
bed = bed_X[bed_X.columns[:5]]

Downloading to datasets/fantom/...omoters/H1.csv.xz:   0%|          | 0.00/11.7M [00:00<?, ?iB/s]

Downloading to datasets/fantom/.../promoters.bed.xz:   0%|          | 0.00/1.22M [00:00<?, ?iB/s]

Downloading chromosomes for genome hg38:   0%|          | 0/25 [00:00<?, ?it/s]

Loading chromosomes for genome hg38:   0%|          | 0/25 [00:00<?, ?it/s]

In [1]:
#@title Build of the MMNN model by the best configuration of the FFNN and CNN models


In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, ReLU, Concatenate, Layer
from tensorflow.keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D, GlobalMaxPool1D, Flatten
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from extra_keras_metrics import get_complete_binary_metrics

In [None]:
from ast import literal_eval
best_configuration = pd.read_csv("best_models_promoters_0.5_ffnn.csv")
best_configuration = literal_eval(best_configuration["1"][0])

In [None]:
op_learning_rate = best_configuration[0]
op_num_dense_layers = best_configuration[1]
op_num_of_nodes_3 = best_configuration[2]
op_num_of_nodes_2 = best_configuration[3]
op_num_of_nodes_1 = best_configuration[4]
op_l2_regularization = best_configuration[5]

In [None]:
from tensorflow.keras.layers import Dense, InputLayer, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import AUC, Accuracy

def build_ffnn(X:pd.DataFrame):
    input_epigenomic_data = Input(shape=(X.shape[1]), name="epigenomic_data")
    hidden = input_epigenomic_data
    #Composition of the 3th layer
    if op_num_dense_layers == 3:
        name = 'layer_dense_3'
        hidden = Dense(op_num_of_nodes_2,
                        activation='relu',
                        kernel_regularizer=l2(op_l2_regularization), #I put the same regularization_factor 
                                                                         #to all the dense layers;
                        name=name)(hidden)
        
    #Composition of the 2th layer
    if op_num_dense_layers == 2:
        name = 'layer_dense_2'
        hidden = Dense(op_num_of_nodes_2,
                        activation='relu',
                        kernel_regularizer=l2(op_l2_regularization),
                        name=name)(hidden)
        
    #Composition of the 1th layer
    name = 'layer_dense_1'
    hidden = Dense(op_num_of_nodes_1,
                activation='relu',
                kernel_regularizer=l2(op_l2_regularization),
                name=name)(hidden)
    
    last_hidden_ffnn = hidden

    # use softmax-activation for classification.
    output_ffnn = Dense(1, activation='sigmoid')(hidden)
    
    ffnn = Model(
        inputs=input_epigenomic_data,
        outputs=output_ffnn,
        name="FFNN"
    )
    
    
    ffnn.compile(
        loss="binary_crossentropy",
        optimizer=Adam(learning_rate=op_learning_rate),
        metrics=get_standard_binary_metrics()
    )

    return ffnn, input_epigenomic_data, last_hidden_ffnn

In [None]:
from ast import literal_eval
best_configuration = pd.read_csv("best_models_promoters_0.5_cnn.csv")
best_configuration = literal_eval(best_configuration["1"][0])

In [None]:
op_learning_rate = best_configuration[0]
op_num_conv_units_2 = best_configuration[1]
op_num_of_nodes_2 = best_configuration[2]
op_num_of_nodes_1 = best_configuration[3]
op_kernel_size_2 = best_configuration[4]

In [None]:
best_configuration

[0.01, 32, 16, 64, 5]

In [None]:
from tensorflow.keras.layers import Dense, Input, Conv1D, Conv2D, Reshape, Flatten, MaxPool1D, MaxPool2D, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from extra_keras_metrics import get_standard_binary_metrics

def build_cnn(window_size: int):
    input_sequence_data = Input(shape=(window_size, 4), name="sequence_data")
    hidden = Conv1D(64, kernel_size=5, activation="relu", padding="same")(input_sequence_data)
    hidden = BatchNormalization()(hidden)
    hidden = MaxPool1D()(hidden)
    hidden = Conv1D(op_num_conv_units_2, kernel_size=int(op_kernel_size_2), activation="relu", padding="same")(hidden)
    hidden = BatchNormalization()(hidden)
    hidden = MaxPool1D()(hidden)
    hidden = Flatten()(hidden)
    hidden = Dense(op_num_of_nodes_2, activation="relu")(hidden)
    hidden = Dropout(0.1)(hidden)
    hidden = Dense(op_num_of_nodes_1, activation="relu")(hidden)
    last_hidden_cnn = Dropout(0.1)(hidden)
    
    output_cnn = Dense(1, activation="sigmoid")(last_hidden_cnn)
    
    cnn = Model(
        inputs=input_sequence_data,
        outputs=output_cnn,
        name="CNN"
    )
    
    cnn.compile(
        loss="binary_crossentropy",
        optimizer=Adam(op_learning_rate),
        metrics=get_standard_binary_metrics()
    )
    
    return cnn, input_sequence_data, last_hidden_cnn

In [None]:
def build_mmnn(
    X,
    window_size,
    input_sequence_data,
    input_epigenomic_data,
    last_hidden_cnn,
    last_hidden_ffnn
):
    
    concatenation_layer = Concatenate()([
        last_hidden_ffnn,
        last_hidden_cnn
    ])

    last_hidden_mmnn = concatenation_layer
    output_mmnn = Dense(1, activation="sigmoid")(last_hidden_mmnn)

    mmnn = Model(
        inputs=[input_epigenomic_data, input_sequence_data],
        outputs=output_mmnn,
        name="MMNN"
    )

    mmnn.compile(
        optimizer="nadam",
        loss="binary_crossentropy",
        metrics=get_complete_binary_metrics()
    )

    return mmnn

In [None]:
def get_cnn_sequence(
    genome:Genome,
    bed:pd.DataFrame,
    y:np.ndarray,
    batch_size=128
) -> MixedSequence:
    return MixedSequence(
        x={
            "sequence_data": BedSequence(
                genome,
                bed,
                batch_size=batch_size,
            )
        },
        y=VectorSequence(
            y,
            batch_size=batch_size
        )
    )

def get_ffnn_sequence(
    X:np.ndarray,
    y:np.ndarray,
    batch_size=128
) -> MixedSequence:
    return MixedSequence(
        x={
            "epigenomic_data": VectorSequence(
                X,
                batch_size
            )
        },
        y=VectorSequence(
            y,
            batch_size=batch_size
        )
    )

def get_mmnn_sequence(
    genome:Genome,
    bed:pd.DataFrame,
    X:np.ndarray,
    y:np.ndarray,
    batch_size=128
) -> MixedSequence:
    return MixedSequence(
        x={
            "sequence_data": BedSequence(
                genome,
                bed,
                batch_size=batch_size,
            ),
            "epigenomic_data": VectorSequence(
                X,
                batch_size
            )
        },
        y=VectorSequence(
            y,
            batch_size=batch_size
        )
    )

### Evaluation of the models predictions
In order to evaluate the models predictions, we consider the Accuracy, AUPRC and AUROC metrics:

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

def train_model(
    model:Model,
    model_name: str,
    task: str,
    training_sequence:MixedSequence,
    test_sequence:MixedSequence,
    holdout_number: int
):
    history = pd.DataFrame(model.fit(
        train_sequence,
        validation_data=test_sequence,
        epochs=100,
        verbose=False,
        callbacks=[
            EarlyStopping("val_loss", patience = 2),
            # I have commented this because we do not need this loading bar
            # when running the main experiment loop. When you experiment with
            # the model structure you may want to enable this to get a feel
            # of how the model is performing during the training.
            TqdmCallback(verbose=1)
        ]
    ).history)
    
    train_evaluation = dict(zip(model.metrics_names, model.evaluate(train_sequence, verbose=False)))
    test_evaluation = dict(zip(model.metrics_names, model.evaluate(test_sequence, verbose=False)))
    train_evaluation["run_type"] = "train"
    test_evaluation["run_type"] = "test"
    for evaluation in (train_evaluation, test_evaluation):
        evaluation["model_name"] = model_name
        evaluation["holdout_number"] = holdout_number
    
    evaluations = pd.DataFrame([
        train_evaluation,
        test_evaluation
    ])
    
    return history, evaluations

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

number_of_splits = 10

holdouts_generator = StratifiedShuffleSplit(
    n_splits=number_of_splits,
    test_size=0.2
)

In [None]:
# Create a list to store all the computed performance
all_performance = []

# Start the main loop, iterating through the holdouts
for holdout_number, (train_indices, test_indices) in tqdm(
    enumerate(holdouts_generator.split(X, y)),
    total=number_of_splits,
    desc="Computing holdouts"
):
    train_bed, test_bed = bed.iloc[train_indices], bed.iloc[test_indices]
    train_X, test_X = X.iloc[train_indices], X.iloc[test_indices]
    train_X, test_X = normalize_epigenomic_data(train_x=train_X,test_x=test_X)
    train_y, test_y = y.iloc[train_indices], y.iloc[test_indices]
    
    train_y = train_y.values.flatten()
    test_y = test_y.values.flatten()
    
    ffnn, input_epigenomic_data, last_hidden_ffnn = build_ffnn(train_X)
    cnn, input_sequence_data, last_hidden_cnn = build_cnn(window_size)
    model = build_mmnn(
        train_X, 
        window_size,
        input_sequence_data=input_sequence_data,
        input_epigenomic_data=input_epigenomic_data,
        last_hidden_ffnn=last_hidden_ffnn,
        last_hidden_cnn=last_hidden_cnn
    )
    
    train_sequence = get_mmnn_sequence(genome, train_bed, train_X, train_y)
    test_sequence = get_mmnn_sequence(genome, test_bed, test_X, test_y)

    # We compute the model performance
    history, performance = train_model(
        model,
        model.name,
        task,
        train_sequence,
        test_sequence,
        holdout_number
    )
    
    # We chain the computed performance to the performance list
    all_performance.append(performance)
        
# We convert the computed performance list into a DataFrame
all_performance = pd.concat(all_performance)

Computing holdouts:   0%|          | 0/10 [00:00<?, ?it/s]



0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))
  'consistency.' % (self.__class__.__name__,))


In [None]:
all_performance.to_csv("all_performances_promoters_0.5_mmnn.csv")

In [None]:
all_performance = pd.read_csv("all_performances_promoters_0.5_mmnn.csv")

In [None]:
all_performance = all_performance.filter(items=['model_name', 'run_type', 'accuracy', 'AUROC', 'AUPRC'])
all_performance.groupby(['run_type', 'model_name']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,AUROC,AUPRC
run_type,model_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
test,MMNN,0.786279,0.848382,0.738556
train,MMNN,0.828594,0.896959,0.828535


### Results visualization
Now that we have run our experiment we can visualize its performance:

In [None]:
barplots(
    all_performance,
    groupby=["model_name", "run_type"],
    orientation="horizontal",
    height=8
)

In [None]:
from scipy.stats import wilcoxon

In [None]:
for outer_model in all_performance.model_name.unique():
    outer_model_performance = all_performance[
        (all_performance.model_name == outer_model) &
        (all_performance.run_type == "test")]
    for model in all_performance.model_name.unique():
        if outer_model >= model:
            continue
        model_performance = all_performance[
            (all_performance.model_name == model) &
            (all_performance.run_type == "test") 
        ]

        for metric in ("AUPRC", "AUROC", "accuracy"):
            outer, inner = outer_model_performance[metric], model_performance[metric]
            _, p_value = wilcoxon(outer, inner)
            if p_value < 0.01:
                if outer.mean() > inner.mean():
                    best_model, worse_model = outer_model, model
                else:
                    best_model, worse_model = model, outer_model
                print("The model {} outperforms the model {} with p-value {} on metric {}.".format(
                    best_model,
                    worse_model,
                    p_value,
                    metric
                ))
            else:
                print("The model {} is statistially indistinguishiable {} with p-value {} on metric {}.".format(
                    outer_model,
                    model,
                    p_value,
                    metric
                ))