In [1]:
from keras_synthetic_genome_sequence import MultivariateGapWindowsSequence
from keras_synthetic_genome_sequence.utils import get_gaps_statistics
from ucsc_genomes_downloader import Genome
from ucsc_genomes_downloader.utils import tessellate_bed
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import wilcoxon, pearsonr, ks_2samp, chisquare

In [2]:
# Genome object for the hg19 assembly; every later cell reads from this
# handle (gap statistics, filled regions, synthetic gap sequence, plot labels).
assembly = Genome("hg19")

HBox(children=(IntProgress(value=0, description='Loading chromosomes for genome hg19', layout=Layout(flex='2')…



In [5]:
# Tunable parameters shared by the cells below.
# Window length: forwarded to get_gaps_statistics and tessellate_bed.
window_size = 1000
# Number of windows per batch of the synthetic gap sequence.
batch_size = 100_000
# Upper bound on gap size, forwarded to get_gaps_statistics.
max_gap_size = 3

In [6]:
# Measure the gaps of the assembly. The call yields three values: the
# number of gaps, plus the mean and covariance that later parameterise the
# synthetic gap generator (`mean` is also plotted as the biological reference).
gaps_statistics = get_gaps_statistics(
    window_size=window_size,
    max_gap_size=max_gap_size,
    genome=assembly,
)
number, mean, covariance = gaps_statistics

HBox(children=(IntProgress(value=0, description='Rendering gaps in hg19', layout=Layout(flex='2'), max=25, sty…



HBox(children=(IntProgress(value=0, description='Rendering sequences in hg19', layout=Layout(flex='2'), max=8,…



In [7]:
# Take the regions reported by `assembly.filled()` and tessellate them into
# windows of `window_size`; the result is the bed handed to the gap sequence.
filled_regions = assembly.filled()
ground_truth = tessellate_bed(filled_regions, window_size=window_size)

HBox(children=(IntProgress(value=0, description='Rendering gaps in hg19', layout=Layout(flex='2'), max=25, sty…



HBox(children=(IntProgress(value=0, description='Tessellating windows', layout=Layout(flex='2'), max=385, styl…



In [None]:
# Sequence of batches over the tessellated windows. Going by the argument
# names, synthetic gaps are driven by the measured gap mean and covariance.
gap_sequence = MultivariateGapWindowsSequence(
    batch_size=batch_size,
    gaps_covariance=covariance,
    gaps_mean=mean,
    bed=ground_truth,
    assembly=assembly,
)

HBox(children=(IntProgress(value=0, description='Rendering sequences in hg19', layout=Layout(flex='2'), max=69…



HBox(children=(IntProgress(value=0, description='Converting nucleotides to numeric classes', layout=Layout(fle…



HBox(children=(IntProgress(value=0, description='Generating synthetic gaps', layout=Layout(flex='2'), max=58, …

In [None]:
# Only the first batch of the sequence is inspected.
first_batch = gap_sequence[0]
X, y = first_batch

# A position is flagged when every value along the last axis is (close to)
# 0.25; averaging the flags over axis 0 gives one frequency per position.
# NOTE(review): presumably X is (batch, position, nucleotide) and an
# all-0.25 vector encodes a synthetic gap; confirm against the library.
synthetic_gap_mask = np.isclose(X, 0.25).all(axis=-1)
synthetic_mean = synthetic_gap_mask.mean(axis=0)

In [None]:
def _save_gaps_barplot(frequencies, kind, assembly):
    """Save a bar plot of per-position gap frequencies to a PNG file.

    Parameters
    ----------
    frequencies:
        Sequence of frequencies, one bar per element.
    kind:
        Capitalised label used in the title and y-axis label (for example
        "Synthetic"); its lower-case form prefixes the output file name.
    assembly:
        Object interpolated into the title and the file name via an f-string.

    The figure is written to ``{kind.lower()}_gaps_{assembly}.png`` in the
    current working directory and then closed.
    """
    # Explicit figure/axes instead of the implicit pyplot "current figure",
    # so the plot cannot land on a figure left open by another cell.
    fig, ax = plt.subplots()
    try:
        ax.bar(range(len(frequencies)), frequencies, width=1)
        ax.set_title(f"{kind} gaps frequencies ({assembly})")
        ax.set_ylabel(f"{kind} gaps frequency")
        ax.set_xlabel("Nucleotides position")
        fig.savefig(f"{kind.lower()}_gaps_{assembly}.png")
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)


# One figure per source of gap frequencies: the first batch of the synthetic
# sequence, then the statistics measured on the assembly.
_save_gaps_barplot(synthetic_mean, "Synthetic", assembly)
_save_gaps_barplot(mean, "Biological", assembly)

print(assembly)
print("Gaps number", number)
# Compare the measured and synthetic per-position frequencies with each test.
for test in (wilcoxon, ks_2samp):
    print(test.__name__, test(mean, synthetic_mean))

HBox(children=(IntProgress(value=0, description='Rendering gaps in hg19', layout=Layout(flex='2'), max=25, sty…



HBox(children=(IntProgress(value=0, description='Rendering sequences in hg19', layout=Layout(flex='2'), max=8,…



HBox(children=(IntProgress(value=0, description='Rendering gaps in hg19', layout=Layout(flex='2'), max=25, sty…



HBox(children=(IntProgress(value=0, description='Tessellating windows', layout=Layout(flex='2'), max=385, styl…



HBox(children=(IntProgress(value=0, description='Rendering sequences in hg19', layout=Layout(flex='2'), max=69…



HBox(children=(IntProgress(value=0, description='Converting nucleotides to numeric classes', layout=Layout(fle…



HBox(children=(IntProgress(value=0, description='Generating synthetic gaps', layout=Layout(flex='2'), max=58, …