# Prediction of Active Enhancers with CNN and Baseline Models (Perceptron, MLP, FFNN, LSTM)

In [1]:
import pandas as pd
import numpy as np
from typing import Tuple
import os
import compress_json
from tqdm.auto import tqdm
from plot_keras_history import plot_history
from barplots import barplots

In [2]:
# Registry of zero-argument model builder functions. Later cells append their
# builder here, and the training loop calls every entry to obtain a freshly
# compiled model for each holdout.
models = []

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC

def build_perceptron():
    """Build and compile a single-layer perceptron.

    The (200, 4) input is flattened and fed to one sigmoid output unit.

    Returns:
        A compiled Keras ``Sequential`` model named "Perceptron" that uses
        the Nadam optimizer, binary cross-entropy loss, and tracks accuracy,
        AUROC and AUPRC.
    """
    layers = [
        Input(shape=(200, 4)),
        Flatten(),
        Dense(1, activation="sigmoid"),
    ]
    network = Sequential(layers, name="Perceptron")

    # Metric names become the keys of the training history / results records.
    tracked_metrics = [
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ]
    network.compile(
        optimizer="nadam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    return network


# Register the builder (not an instance) so a fresh model can be created on demand.
models.append(build_perceptron)

In [4]:
def build_mlp():
    """Build and compile a small multi-layer perceptron.

    The (200, 4) input is flattened, passed through two ReLU hidden layers
    (64 then 32 units) and reduced to one sigmoid output unit.

    Returns:
        A compiled Keras ``Sequential`` model named "MLP" that uses the Nadam
        optimizer, binary cross-entropy loss, and tracks accuracy, AUROC and
        AUPRC.
    """
    hidden_layers = [
        Dense(units, activation="relu")
        for units in (64, 32)
    ]
    network = Sequential(
        [
            Input(shape=(200, 4)),
            Flatten(),
            *hidden_layers,
            Dense(1, activation="sigmoid"),
        ],
        name="MLP",
    )

    # Metric names become the keys of the training history / results records.
    tracked_metrics = [
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ]
    network.compile(
        optimizer="nadam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    return network


# Register the builder (not an instance) so a fresh model can be created on demand.
models.append(build_mlp)

In [5]:
from tensorflow.keras.layers import BatchNormalization, Dropout, Activation

def build_ffnn():
    """Build and compile a deeper feed-forward network with dropout.

    The (200, 4) input is flattened and passed through ReLU dense layers of
    128, 64, 64, 32 and 16 units, with 30% dropout after each 64-unit layer,
    followed by one sigmoid output unit.

    Returns:
        A compiled Keras ``Sequential`` model named "FFNN" that uses the
        Nadam optimizer, binary cross-entropy loss, and tracks accuracy,
        AUROC and AUPRC.
    """
    layers = [
        Input(shape=(200, 4)),
        Flatten(),
        Dense(128, activation="relu"),
        # Two regularised 64-unit stages.
        Dense(64, activation="relu"),
        Dropout(0.3),
        Dense(64, activation="relu"),
        Dropout(0.3),
        # Narrowing head.
        Dense(32, activation="relu"),
        Dense(16, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
    network = Sequential(layers, name="FFNN")

    # Metric names become the keys of the training history / results records.
    tracked_metrics = [
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ]
    network.compile(
        optimizer="nadam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    return network


# Register the builder (not an instance) so a fresh model can be created on demand.
models.append(build_ffnn)

In [7]:
from tensorflow.keras.layers import Conv2D, Reshape

def build_cnn():
    """Build and compile a 2-D convolutional network.

    The (200, 4) input is reshaped to (200, 4, 1) so that ``Conv2D`` layers
    can be applied. Five convolutions (with 30% dropout after the second and
    the fifth) are followed by a dense head ending in one sigmoid unit.

    Returns:
        A compiled Keras ``Sequential`` model named "CNN" that uses the Nadam
        optimizer, binary cross-entropy loss, and tracks accuracy, AUROC and
        AUPRC.
    """
    convolutional_stack = [
        Conv2D(64, kernel_size=(10, 2), activation="relu"),
        Conv2D(64, kernel_size=(10, 2), activation="relu"),
        Dropout(0.3),
        # The only strided convolution: stride 2 along the first axis.
        Conv2D(32, kernel_size=(10, 2), strides=(2, 1), activation="relu"),
        Conv2D(32, kernel_size=(10, 1), activation="relu"),
        Conv2D(32, kernel_size=(10, 1), activation="relu"),
        Dropout(0.3),
    ]
    dense_head = [
        Flatten(),
        Dense(32, activation="relu"),
        Dense(16, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
    network = Sequential(
        [
            Input(shape=(200, 4)),
            # Add a trailing channel axis for the 2-D convolutions.
            Reshape((200, 4, 1)),
            *convolutional_stack,
            *dense_head,
        ],
        name="CNN",
    )

    # Metric names become the keys of the training history / results records.
    tracked_metrics = [
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ]
    network.compile(
        optimizer="nadam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    return network


# Register the builder (not an instance) so a fresh model can be created on demand.
models.append(build_cnn)

In [9]:
from tensorflow.keras.layers import LSTM

def build_lstm():
    """Build and compile a recurrent network based on a single LSTM layer.

    A 256-unit LSTM reads the (200, 4) input and is followed by a dense head
    (256, 32 and 16 ReLU units, with 30% dropout after the first) ending in
    one sigmoid output unit.

    Returns:
        A compiled Keras ``Sequential`` model named "LSTM" that uses the
        Nadam optimizer, binary cross-entropy loss, and tracks accuracy,
        AUROC and AUPRC.
    """
    # Spelled out explicitly: these are the settings the Keras documentation
    # lists as required for the fast cuDNN LSTM kernel to be selected.
    cudnn_settings = {
        "activation": "tanh",
        "recurrent_activation": "sigmoid",
        "recurrent_dropout": 0,
        "unroll": False,
        "use_bias": True,
    }

    layers = [
        Input(shape=(200, 4)),
        LSTM(256, **cudnn_settings),
        # NOTE(review): the LSTM does not return sequences, so this Flatten
        # looks redundant -- kept to preserve the original architecture.
        Flatten(),
        Dense(256, activation="relu"),
        Dropout(0.3),
        Dense(32, activation="relu"),
        Dense(16, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
    network = Sequential(layers, name="LSTM")

    # Metric names become the keys of the training history / results records.
    tracked_metrics = [
        "accuracy",
        AUC(curve="ROC", name="auroc"),
        AUC(curve="PR", name="auprc"),
    ]
    network.compile(
        optimizer="nadam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    return network


# Register the builder (not an instance) so a fresh model can be created on demand.
models.append(build_lstm)

In [10]:
from epigenomic_dataset import load_epigenomes

# Dataset selection: FANTOM enhancer regions for a single cell line.
cell_line = "GM12878"
window_size = 200

epigenomes, labels = load_epigenomes(
    cell_line=cell_line,
    dataset="fantom",
    regions="enhancers",
    window_size=window_size,
)

# Collapse the labels into a flat 1-D array so they can be indexed with the
# positional train/test indices produced by the splitter.
labels = labels.values.ravel()

# Turn the index levels of the epigenomic frame into ordinary columns and keep
# only those columns (presumably the genomic coordinates of each region --
# TODO confirm against the epigenomic_dataset documentation).
bed = epigenomes.reset_index()[epigenomes.index.names]

In [11]:
from sklearn.model_selection import StratifiedShuffleSplit

# Number of stratified train/test resamplings; also used as the progress-bar
# total in the training loop.
splits = 2

# 80/20 stratified shuffle splits with a fixed seed for reproducibility.
holdouts = StratifiedShuffleSplit(
    n_splits=splits,
    test_size=0.2,
    random_state=42,
)

In [12]:
from ucsc_genomes_downloader import Genome
from keras_bed_sequence import BedSequence
from keras_mixed_sequence import MixedSequence
from tensorflow.keras.utils import Sequence

# Reference genome handle. NOTE: judging by the cell output, instantiating it
# may download the chromosomes on first use, which can take a while.
genome = Genome("hg19")


def get_holdout(
    train: np.ndarray,
    test: np.ndarray,
    bed: pd.DataFrame,
    labels: np.ndarray,
    genome: Genome,
    batch_size: int = 1024
) -> Tuple[Sequence, Sequence]:
    """Build the training and test Keras sequences for one holdout.

    Args:
        train: Positional row indices of the training samples.
        test: Positional row indices of the test samples.
        bed: Frame of regions; rows are selected with ``bed.iloc[...]``.
        labels: Label array aligned row-by-row with ``bed``.
        genome: Genome object handed to ``BedSequence`` to resolve the
            regions (presumably into nucleotide windows -- TODO confirm).
        batch_size: Batch size used by both the input and label sequences.

    Returns:
        A ``(train_sequence, test_sequence)`` tuple of ``MixedSequence``
        objects pairing the region inputs with their labels.
    """
    def _build_sequence(indices: np.ndarray) -> Sequence:
        # One place to assemble a split keeps train and test construction
        # guaranteed identical apart from the selected rows.
        return MixedSequence(
            x=BedSequence(genome, bed.iloc[indices], batch_size=batch_size),
            y=labels[indices],
            batch_size=batch_size
        )

    return _build_sequence(train), _build_sequence(test)

HBox(children=(FloatProgress(value=0.0, description='Downloading chromosomes for genome hg19', layout=Layout(f…

Process ForkPoolWorker-6:
Process ForkPoolWorker-5:
Process ForkPoolWorker-3:
Process ForkPoolWorker-4:
Process ForkPoolWorker-2:
Process ForkPoolWorker-12:
Process ForkPoolWorker-11:
Process ForkPoolWorker-10:
Process ForkPoolWorker-9:
Process ForkPoolWorker-8:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/anaconda3/li

KeyboardInterrupt: 

In [None]:
def precomputed(results, model:str, holdout:int)->bool:
    """Tell whether a (model, holdout) pair already appears in the results.

    Args:
        results: Records accepted by ``pd.DataFrame`` (here a list of dicts
            carrying at least the ``model`` and ``holdout`` keys).
        model: Name of the model to look for.
        holdout: Index of the holdout to look for.

    Returns:
        ``False`` when there are no results at all; otherwise whether any
        record matches both the model name and the holdout index.
    """
    records = pd.DataFrame(results)
    if not records.empty:
        same_model = records.model == model
        same_holdout = records.holdout == holdout
        return (same_model & same_holdout).any()
    return False

In [None]:
# Resume from a previous run when a results file is present.
# NOTE(review): os.path.exists checks relative to the working directory, while
# compress_json's local_* helpers may resolve paths relative to the calling
# file -- confirm both point at the same location.
results = (
    compress_json.local_load("sequence.json")
    if os.path.exists("sequence.json")
    else []
)

for i, (train_index, test_index) in tqdm(
    enumerate(holdouts.split(bed, labels)),
    total=splits,
    desc="Computing holdouts",
    dynamic_ncols=True
):
    # The data sequences are built once per holdout and shared by all models.
    train, test = get_holdout(train_index, test_index, bed, labels, genome)

    for build_model in tqdm(
        models,
        total=len(models),
        desc="Training models",
        leave=False,
        dynamic_ncols=True
    ):
        # The model has to be instantiated to learn its name, even when it
        # then turns out that this (model, holdout) pair is already stored.
        model = build_model()
        if precomputed(results, model.name, i):
            continue

        # The test split doubles as validation data for early stopping.
        history = model.fit(
            train,
            steps_per_epoch=train.steps_per_epoch,
            validation_data=test,
            validation_steps=test.steps_per_epoch,
            epochs=1000,
            shuffle=True,
            verbose=False,
            callbacks=[
                EarlyStopping(monitor="val_loss", mode="min", patience=50),
            ]
        ).history

        # Only the metrics of the last epoch that was run are recorded.
        scores = pd.DataFrame(history).iloc[-1].to_dict()

        # Keys without the "val_" prefix are training metrics; prefixed ones
        # are test metrics and are stored under the un-prefixed name so both
        # records share the same columns.
        train_metrics = {
            key: value
            for key, value in scores.items()
            if not key.startswith("val_")
        }
        test_metrics = {
            key[len("val_"):]: value
            for key, value in scores.items()
            if key.startswith("val_")
        }

        results.append({
            "model": model.name,
            "run_type": "train",
            "holdout": i,
            **train_metrics
        })
        results.append({
            "model": model.name,
            "run_type": "test",
            "holdout": i,
            **test_metrics
        })

        # Persist after every model so an interrupted run can be resumed.
        compress_json.local_dump(results, "sequence.json")

In [None]:
# Tabulate the collected score records, discarding the holdout index column.
df = (
    pd.DataFrame(results)
    .drop(columns="holdout")
)

In [None]:
# Show the scores table (one row per model, run type and holdout).
df

In [None]:
# Plot the scores grouped by model and run type (train vs. test).
# The images are expected under barplots/sequence/, where the next cell looks
# for *.png files; presumably one file is written per metric, with {feature}
# replaced by the metric name -- TODO confirm against the barplots docs.
barplots(
    df,
    groupby=["model", "run_type"],
    show_legend=False,
    height=5,
    orientation="horizontal",
    path='barplots/sequence/{feature}.png',
)

In [None]:
from PIL import Image
from glob import glob

# glob() returns paths in arbitrary, filesystem-dependent order; sorting makes
# the figures appear in the same order on every run of the notebook.
for x in sorted(glob("barplots/sequence/*.png")):
    display(Image.open(x))

In [None]:
# Mean of the label vector; presumably the fraction of positive (active)
# samples, assuming the labels are binary 0/1 -- TODO confirm. Useful context
# for judging accuracy and AUPRC on a possibly imbalanced dataset.
labels.mean()