In [1]:
import h5torch
import numpy as np
import torch
import pytorch_lightning as pl
import warnings
from pytorch_lightning.callbacks import Callback

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
datafolder = "/data/home/natant/Negatives/Data/Encode690/ENCODE_hg38_subset_101bp_celltypes_ATAC_H5_all_chr"

In [3]:
import os
import h5torch

# Initialize an empty dictionary to store cell types and their corresponding TFs
cell_types_dict = {}

# Iterate through all files in the datafolder
for file_name in os.listdir(datafolder):
    if file_name.endswith(".h5t"):  # Check if the file is an .h5t file
        cell_type = file_name.split(".")[0]  # Extract the cell type from the file name
        file_path = os.path.join(datafolder, file_name)
        
        # Open the .h5t file and extract TFs
        with h5torch.File(file_path, 'r') as h5_file:
            prot_names = h5_file["0/prot_names"][:]
            tf_list = [name.decode('utf-8') for name in prot_names if name.decode('utf-8') != "ATAC_peak"]
        
        # Add the cell type and its TFs to the dictionary
        cell_types_dict[cell_type] = tf_list

print(cell_types_dict)

{'MCF-7': ['CTCF', 'TCF7L2', 'ZNF217', 'GATA3_(SC-268)'], 'GM12878': ['CTCF', 'YY1_(SC-281)', 'TBP', 'Egr-1', 'Mxi1_(AF4185)', 'SRF', 'MAZ_(ab85725)', 'ELK1_(1277-1)', 'SIX5', 'USF-1', 'SP1', 'RFX5_(200-401-194)', 'ELF1_(SC-631)', 'ATF2_(SC-81188)', 'NF-YB', 'USF2', 'Znf143_(16618-1-AP)', 'ZEB1_(SC-25388)', 'Pbx3', 'MEF2A', 'TCF12', 'Max', 'STAT5A_(SC-74442)', 'NFIC_(SC-81335)', 'Nrf1', 'CEBPB_(SC-150)', 'FOXM1_(SC-502)', 'RXRA', 'ZBTB33', 'ETS1', 'ATF3', 'NF-YA', 'IKZF1_(IkN)_(UCLA)', 'JunD', 'ZZZ3', 'ZNF274'], 'HepG2': ['ZBTB33', 'USF-1', 'SP1', 'FOXA1_(SC-101058)', 'CTCF', 'MafK_(ab50322)', 'MafF_(M8194)', 'FOSL2', 'YY1_(SC-281)', 'JunD', 'ELF1_(SC-631)', 'Mxi1_(AF4185)', 'ATF3', 'RFX5_(200-401-194)', 'Max', 'RXRA', 'ZBTB7A_(SC-34508)', 'MAZ_(ab85725)', 'TBP', 'TEAD4_(SC-101184)', 'CEBPB_(SC-150)', 'USF2', 'SRF', 'MYBL2_(SC-81192)', 'NFIC_(SC-81335)', 'ARID3A_(NB100-279)', 'CEBPD_(SC-636)', 'Nrf1', 'HSF1', 'TCF12', 'TCF7L2', 'BHLHE40', 'IRF3', 'ZNF274'], 'IMR90': ['MafK_(ab50322)'],

In [33]:
file = h5torch.File("/data/home/natant/Negatives/Data/Encode690/ENCODE_hg38_subset_101bp_celltypes_ATAC_H5_all_chr/A549.h5t", 'r')
central = file["central"][:]
peak_ix_to_pos = file["1/peak_ix_to_pos"][:]
peak_ix_to_len = file["1/peak_ix_to_len"][:]
peak_ix_to_chr = file["1/peak_ix_to_chr"][:]
prot_names = file["0/prot_names"][:]
prot_mask = prot_names!= b"ATAC_peak" # filter out the ATAC peaks entry

subset = ["chr1", "chr2", "chr3"] # STILL FILTER OUT THE ATAC PEAKS ENTRY
peak_mask = np.isin(peak_ix_to_chr.astype(str), subset)

In [34]:
keys = list(file['unstructured'].keys())

In [35]:
list(keys)

['chr1',
 'chr10',
 'chr10_GL383545v1_alt',
 'chr10_GL383546v1_alt',
 'chr10_KI270824v1_alt',
 'chr10_KI270825v1_alt',
 'chr11',
 'chr11_GL383547v1_alt',
 'chr11_JH159136v1_alt',
 'chr11_JH159137v1_alt',
 'chr11_KI270721v1_random',
 'chr11_KI270826v1_alt',
 'chr11_KI270827v1_alt',
 'chr11_KI270829v1_alt',
 'chr11_KI270830v1_alt',
 'chr11_KI270831v1_alt',
 'chr11_KI270832v1_alt',
 'chr11_KI270902v1_alt',
 'chr11_KI270903v1_alt',
 'chr11_KI270927v1_alt',
 'chr12',
 'chr12_GL383549v1_alt',
 'chr12_GL383550v2_alt',
 'chr12_GL383551v1_alt',
 'chr12_GL383552v1_alt',
 'chr12_GL383553v2_alt',
 'chr12_GL877875v1_alt',
 'chr12_GL877876v1_alt',
 'chr12_KI270833v1_alt',
 'chr12_KI270834v1_alt',
 'chr12_KI270835v1_alt',
 'chr12_KI270836v1_alt',
 'chr12_KI270837v1_alt',
 'chr12_KI270904v1_alt',
 'chr13',
 'chr13_KI270838v1_alt',
 'chr13_KI270839v1_alt',
 'chr13_KI270840v1_alt',
 'chr13_KI270841v1_alt',
 'chr13_KI270842v1_alt',
 'chr13_KI270843v1_alt',
 'chr14',
 'chr14_GL000009v2_random',
 'chr14_GL

In [13]:
import py2bit
tb = py2bit.open("/data/home/natant/plmBind/Data/hg38.2bit")
chromosomes = tb.chroms()
chr_allow_list = [chrom for chrom in chromosomes.keys() if 'chrUn' not in chrom and 'chrM' not in chrom]

In [14]:
chr_allow_list

['chr1',
 'chr10',
 'chr11',
 'chr11_KI270721v1_random',
 'chr12',
 'chr13',
 'chr14',
 'chr14_GL000009v2_random',
 'chr14_GL000225v1_random',
 'chr14_KI270722v1_random',
 'chr14_GL000194v1_random',
 'chr14_KI270723v1_random',
 'chr14_KI270724v1_random',
 'chr14_KI270725v1_random',
 'chr14_KI270726v1_random',
 'chr15',
 'chr15_KI270727v1_random',
 'chr16',
 'chr16_KI270728v1_random',
 'chr17',
 'chr17_GL000205v2_random',
 'chr17_KI270729v1_random',
 'chr17_KI270730v1_random',
 'chr18',
 'chr19',
 'chr1_KI270706v1_random',
 'chr1_KI270707v1_random',
 'chr1_KI270708v1_random',
 'chr1_KI270709v1_random',
 'chr1_KI270710v1_random',
 'chr1_KI270711v1_random',
 'chr1_KI270712v1_random',
 'chr1_KI270713v1_random',
 'chr1_KI270714v1_random',
 'chr2',
 'chr20',
 'chr21',
 'chr22',
 'chr22_KI270731v1_random',
 'chr22_KI270732v1_random',
 'chr22_KI270733v1_random',
 'chr22_KI270734v1_random',
 'chr22_KI270735v1_random',
 'chr22_KI270736v1_random',
 'chr22_KI270737v1_random',
 'chr22_KI270738v1_ra

In [15]:
genome = {}
mapping = {"A": 0, "T": 1, "C": 2, "G": 3, "N": 4}
for chr_ in chr_allow_list:
        genome[chr_] = np.array([mapping[bp] for bp in tb.sequence(chr_)], dtype="int8")

In [19]:
genome_keys = list(genome.keys())

In [20]:
all_in_genome_keys = all(chr_ in genome_keys for chr_ in chr_allow_list)
print(all_in_genome_keys)

True


In [26]:
all_in_keys = all(chr_ in keys for chr_ in chr_allow_list)
print(all_in_keys)

True


In [29]:
all(chr_ in genome_keys for chr_ in keys)


False

In [31]:
chrs_in_keys_not_in_genome_keys = [chr_ for chr_ in keys if chr_ not in genome_keys]
chrs_in_keys_not_in_genome_keys

['dinucl_ATF3_chrs',
 'dinucl_ATF3_seqs',
 'dinucl_BHLHE40_chrs',
 'dinucl_BHLHE40_seqs',
 'dinucl_CREB1_(SC-240)_chrs',
 'dinucl_CREB1_(SC-240)_seqs',
 'dinucl_CTCF_chrs',
 'dinucl_CTCF_seqs',
 'dinucl_ELF1_(SC-631)_chrs',
 'dinucl_ELF1_(SC-631)_seqs',
 'dinucl_ETS1_chrs',
 'dinucl_ETS1_seqs',
 'dinucl_FOSL2_chrs',
 'dinucl_FOSL2_seqs',
 'dinucl_FOXA1_(SC-101058)_chrs',
 'dinucl_FOXA1_(SC-101058)_seqs',
 'dinucl_Max_chrs',
 'dinucl_Max_seqs',
 'dinucl_SIX5_chrs',
 'dinucl_SIX5_seqs',
 'dinucl_TCF12_chrs',
 'dinucl_TCF12_seqs',
 'dinucl_USF-1_chrs',
 'dinucl_USF-1_seqs',
 'dinucl_YY1_(SC-281)_chrs',
 'dinucl_YY1_(SC-281)_seqs',
 'dinucl_ZBTB33_chrs',
 'dinucl_ZBTB33_seqs',
 'sampled_negs_ATF3_chr',
 'sampled_negs_ATF3_len',
 'sampled_negs_ATF3_pos',
 'sampled_negs_BHLHE40_chr',
 'sampled_negs_BHLHE40_len',
 'sampled_negs_BHLHE40_pos',
 'sampled_negs_CREB1_chr',
 'sampled_negs_CREB1_len',
 'sampled_negs_CREB1_pos',
 'sampled_negs_CTCF_chr',
 'sampled_negs_CTCF_len',
 'sampled_negs_CTCF_

In [32]:
problematic_chroms = ["chr1_GL383518v1_alt", "chr20_KI270870v1_alt", "chr6_GL000251v2_alt", "chr10_KI270825v1_alt", "chr16_KI270728v1_random"]

In [36]:
problematic_in_keys = all(chrom in keys for chrom in problematic_chroms)
print(problematic_in_keys)

True


In [37]:
genome = {k : file["unstructured"][k] for k in list(file["unstructured"]) if k.startswith("chr")}

In [39]:
all(chrom in list(genome.keys()) for chrom in problematic_chroms)

True