In [1]:
from crr_labels import fantom, roadmap, roadmap_available_cell_lines, fantom_available_cell_lines
import os
from tqdm.auto import tqdm
import pandas as pd
from tabulate import tabulate

In [2]:
# Cell lines requested from both the fantom() and the roadmap() calls below.
common_cell_lines = [
    "GM12878",
    "HelaS3",
    "HepG2",
    "K562",
    "A549",
    "H1",
    "H9",
]

# Extra cell lines appended to the common ones only for the fantom() call.
fantom_cell_lines = [
    "MCF7",
    "HEK293",
    "Caco2",
    "HL60",
    "PC3",
    "JURKAT",
]

# Extra cell lines appended to the common ones only for the roadmap() call.
roadmap_cell_lines = [
    "DND41",
    "HUES48",
    "HUES6",
    "HUES64",
    "IMR90",
]

In [3]:
# Genome assemblies to iterate over; each one gets its own output directory.
assemblies = ("hg19", "hg38")

# Window sizes (in nucleotides) for which the label files are generated.
windows = (64, 128, 256, 512, 1024)

# Values forwarded as `center_enhancers` to fantom(); one enhancers file is
# written per value.
centers = ("peak", "center")

# Values forwarded as `states` to roadmap() (the 15- and 18-states models).
states = (15, 18)

# Roadmap generation is switched off for now: the model states to be used
# are still being chosen.
compute_roadmap = False

In [4]:
# reStructuredText hyperlink template used for every table cell; `{url}` is
# filled with the repository-relative path of a generated file.
url_pattern = (
    "`Download <https://raw.githubusercontent.com/LucaCappelletti94/crr_labels/master/"
    "{url}>`__"
)

In [5]:
# Generate the preprocessed label files for every (assembly, window)
# combination and write reStructuredText tables of download links:
#   * fantom.rst  - always written;
#   * roadmap.rst - written only when `compute_roadmap` is enabled.
#
# Files that already exist on disk are not recomputed.
# NOTE(review): promoters files produced by the earlier revision of this cell
# contain the enhancers table (enhancers were written to the promoters path).
# The cache checks below look only at the enhancers file / output directory,
# so such stale promoters files must be deleted by hand to be regenerated.
fantom_preprocessed = []
roadmap_preprocessed = []
for assembly in tqdm(assemblies, desc="Assembly"):
    for window in tqdm(windows, desc="Window", dynamic_ncols=True):
        # Paths shared by every enhancer centering of this assembly/window,
        # bound here so the table row below never depends on leaked loop state.
        path = f"preprocessed/fantom/window_size_{window}/{assembly}"
        promoter_path = f"{path}/promoters.bed.xz"
        for center_enhancers in tqdm(centers, desc="Fantom", leave=False, dynamic_ncols=True):
            enhancer_path = f"{path}/enhancers_{center_enhancers}.bed.xz"
            if os.path.exists(enhancer_path):
                continue
            # NOTE(review): `assembly` only selects the output directory; it is
            # not forwarded to fantom(). Confirm that the library default
            # matches each assembly, or pass it explicitly.
            enhancers, promoters = fantom(
                common_cell_lines+fantom_cell_lines,
                window,
                center_enhancers=center_enhancers
            )
            os.makedirs(path, exist_ok=True)
            enhancers.to_csv(enhancer_path, index=False, sep="\t")
            # Write the promoters table (not the enhancers one) to the promoters file.
            promoters.to_csv(promoter_path, index=False, sep="\t")
        # Each column links to the file generated with the matching
        # `center_enhancers` value, instead of reusing the last loop path.
        fantom_preprocessed.append({
            "Nucleotides window": window,
            "Genome": assembly,
            "Region-centered enhancers": url_pattern.format(url=f"{path}/enhancers_center.bed.xz"),
            "Peak-centered enhancers": url_pattern.format(url=f"{path}/enhancers_peak.bed.xz"),
            "Promoters": url_pattern.format(url=promoter_path)
        })
        if compute_roadmap:
            # One table row per assembly/window, with a pair of columns for
            # every states model.
            new_roadmap = {}
            for state in tqdm(states, desc="Roadmap", leave=False, dynamic_ncols=True):
                path = f"preprocessed/roadmap/window_size_{window}/state_{state}/{assembly}"
                enhancer_path = f"{path}/enhancers.bed.xz"
                promoter_path = f"{path}/promoters.bed.xz"
                # Record the links before the cache check so cached entries
                # still appear in the table.
                new_roadmap.update({
                    "Nucleotides window": window,
                    "Genome": assembly,
                    f"{state}-states model enhancers": url_pattern.format(url=enhancer_path),
                    f"{state}-states model promoters": url_pattern.format(url=promoter_path)
                })
                if os.path.exists(path):
                    continue
                # NOTE(review): as with fantom(), `assembly` is not forwarded
                # to roadmap() - confirm this is intended.
                enhancers, promoters = roadmap(common_cell_lines+roadmap_cell_lines, window, states=state)
                os.makedirs(path, exist_ok=True)
                enhancers.to_csv(enhancer_path, index=False, sep="\t")
                # Write the promoters table (not the enhancers one) to the promoters file.
                promoters.to_csv(promoter_path, index=False, sep="\t")
            roadmap_preprocessed.append(new_roadmap)

if compute_roadmap:
    # Fix the column order of the Roadmap table before rendering it.
    df = pd.DataFrame(roadmap_preprocessed)[[
        "Nucleotides window",
        "Genome",
        "15-states model enhancers",
        "15-states model promoters",
        "18-states model enhancers",
        "18-states model promoters",
    ]]
    with open("roadmap.rst", "w") as f:
        f.write(tabulate(df.values, headers=df.columns, tablefmt="grid"))

# Fix the column order of the FANTOM table before rendering it.
df = pd.DataFrame(fantom_preprocessed)[[
    "Nucleotides window",
    "Genome",
    "Region-centered enhancers",
    "Peak-centered enhancers",
    "Promoters"
]]
with open("fantom.rst", "w") as f:
    f.write(tabulate(df.values, headers=df.columns, tablefmt="grid"))

HBox(children=(FloatProgress(value=0.0, description='Assembly', max=2.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='Window', layout=Layout(flex='2'), max=5.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Fantom', layout=Layout(flex='2'), max=2.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Fantom', layout=Layout(flex='2'), max=2.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Fantom', layout=Layout(flex='2'), max=2.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Fantom', layout=Layout(flex='2'), max=2.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Fantom', layout=Layout(flex='2'), max=2.0, style=Progress…




HBox(children=(FloatProgress(value=0.0, description='Window', layout=Layout(flex='2'), max=5.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Fantom', layout=Layout(flex='2'), max=2.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Fantom', layout=Layout(flex='2'), max=2.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Fantom', layout=Layout(flex='2'), max=2.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Fantom', layout=Layout(flex='2'), max=2.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Fantom', layout=Layout(flex='2'), max=2.0, style=Progress…



