In [1]:
from repairing_genomic_gaps import build_denoiser, build_dataset
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [2]:
# Length (in positions) of each sequence window; passed to build_denoiser.
window_size = 200

In [3]:
# Build the denoiser for windows of `window_size` positions. The summary shown
# in this cell's output lists an "encoder" model with input shape (None, 200, 4).
model = build_denoiser(window_size)

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (InputLayer)   [(None, 200, 4)]          0         
_________________________________________________________________
reshape (Reshape)            (None, 200, 4, 1)         0         
_________________________________________________________________
conv2d (Conv2D)              (None, 100, 4, 64)        2368      
_________________________________________________________________
batch_normalization (BatchNo (None, 100, 4, 64)        256       
_________________________________________________________________
activation (Activation)      (None, 100, 4, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 50, 2, 32)         49184     
_________________________________________________________________
batch_normalization_1 (Batch (None, 50, 2, 32)         128 

In [4]:
# Load saved weights into `model` from an HDF5 file; the path is relative to
# the notebook's working directory, so the file must be present there.
model.load_weights("best_small_model.hdf5")

In [5]:
# Read a space-separated table, using its first column as the index.
# NOTE(review): `df` is not referenced by any other cell visible here, and the
# file name suggests an hg19/hg38 alignment -- confirm the contents and whether
# this load is still needed.
df = pd.read_csv("out_alignment_19_38.txt", sep=" ", index_col=0)

In [6]:
# Dataset and run configuration for this notebook.
max_gap_size = 3
window_size = 200
batch_size = 250
epochs = 1000

# Split by chromosome: chr17, chr18 and chrM are held out for testing, every
# other listed chromosome is used for training. The seed is fixed at 42.
train, test = build_dataset(
    assembly="hg19",
    # Expands to chr1..chr16, chr19..chr22, chrX, chrY (same order as before).
    training_chromosomes=[
        f"chr{label}"
        for label in (*range(1, 17), *range(19, 23), "X", "Y")
    ],
    testing_chromosomes=["chr17", "chr18", "chrM"],
    max_gap_size=max_gap_size,
    window_size=window_size,
    gaps_threshold=0.4,
    batch_size=batch_size,
    seed=42,
)

Loading cache at ./cache/build_dataset/195e4115b26c44b488fc638fc4faf33eacdbf73b464f3c0d1d218fe8316a3b08.pkl


In [87]:
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, classification_report, multilabel_confusion_matrix

In [76]:
# Boolean mask over the 200 window positions that selects the five central
# ones (indices 98 through 102 inclusive).
center_mask = np.zeros(200, dtype=bool)
center_mask[98:103] = True

In [105]:
# Score the denoiser on the first batches of `test`, collecting one row of
# metrics per batch in `data`.
#
# The number of batches is capped up front so the progress bar total matches
# the work actually done. 102 reproduces the batches covered by the earlier
# `if step > 100: break` (steps 0..101), keeping recorded results comparable.
max_eval_batches = 102
n_eval_batches = min(test.steps_per_epoch, max_eval_batches)

data = []

for step in tqdm(range(n_eval_batches)):
    X, y = test[step]
    predictions = model.predict(X)
    # True for positions whose input vector (along axis 2) is NOT uniformly 0.25.
    # NOTE(review): presumably an all-0.25 vector encodes a gap/unknown base, in
    # which case these metrics cover only the non-gap positions -- confirm that
    # this is the intended evaluation target.
    n_mask = np.logical_not(np.isclose(X, 0.25).all(axis=2))
    y_true, y_pred = y[n_mask], predictions[n_mask]
    # AUROC/AUPRC are computed over every (position, class) pair, so flatten
    # labels and scores once and reuse them for both metrics.
    flat_true, flat_pred = y_true.ravel(), y_pred.ravel()
    data.append({
        "accuracy": accuracy_score(y_true.argmax(-1), y_pred.argmax(-1)),
        "auroc": roc_auc_score(flat_true, flat_pred),
        "auprc": average_precision_score(flat_true, flat_pred),
    })

HBox(children=(IntProgress(value=0, max=3015), HTML(value='')))

In [106]:
# Unweighted mean of the per-batch metrics collected in `data`: every batch
# counts equally, regardless of how many positions it contributed.
pd.DataFrame(data).mean()

accuracy    0.946511
auroc       0.996598
auprc       0.990919
dtype: float64