In [1]:
import os
import h5torch
import numpy as np
import requests

In [8]:
# Count, for every TF in every cell-type file, how many positive examples and
# how many sampled negatives are stored.
# Result: dicts[celltype][TF] = (num_positives, num_negatives)
data_folder = "/data/home/natant/Negatives/Data/Encode690/ENCODE_hg38_subset_101bp_celltypes_ATAC_H5_all_chr"
h5t_files = [f for f in os.listdir(data_folder) if f.endswith('.h5t')]
dicts = {}
for h5t_file in h5t_files:
    file_path = os.path.join(data_folder, h5t_file)
    # The cell type is the part of the file name before the first dot.
    celltype = h5t_file.split('.')[0]
    tf_stats = {}
    # Open through a context manager so the HDF5 handle is closed for every
    # file, including when a lookup below raises (it was previously leaked).
    with h5torch.File(file_path, 'r') as file:
        # Read the name column once per file instead of once per TF.
        prot_names = file["0/prot_names"][:]
        for name in prot_names:
            # The ATAC peak track is not a TF; skip it.
            if name == b'ATAC_peak':
                continue
            TF = name.decode()
            prot_mask = prot_names == TF.encode()
            central = file["central"][prot_mask]
            # Entries equal to 1 in the selected row(s) are counted as
            # positives. count_nonzero is vectorised and works for any shape,
            # unlike the builtin sum() on a squeezed array.
            num_positives = np.count_nonzero(central == 1)
            # Number of negatives = length of the first axis of the sampled
            # negative positions dataset for this TF.
            neg_pos = file["unstructured/sampled_negs_"+TF+"_pos"]
            num_negatives = neg_pos.shape[0]

            tf_stats[TF] = (int(num_positives), int(num_negatives))
    dicts[celltype] = tf_stats

In [10]:
# Show the collected counts: {celltype: {TF: (num_positives, num_negatives)}}.
dicts

{'MCF-7': {'CTCF': (21001, 15921),
  'TCF7L2': (10297, 10291),
  'ZNF217': (9945, 9938),
  'GATA3_(SC-268)': (6083, 6083)},
 'GM12878': {'CTCF': (40107, 23347),
  'YY1_(SC-281)': (30998, 16081),
  'TBP': (14890, 12770),
  'Egr-1': (16324, 6557),
  'Mxi1_(AF4185)': (17747, 10944),
  'SRF': (8546, 6228),
  'MAZ_(ab85725)': (18972, 8941),
  'ELK1_(1277-1)': (5584, 4075),
  'SIX5': (4843, 3696),
  'USF-1': (9779, 6507),
  'SP1': (18248, 12275),
  'RFX5_(200-401-194)': (4340, 3885),
  'ELF1_(SC-631)': (22998, 7270),
  'ATF2_(SC-81188)': (23490, 22351),
  'NF-YB': (13305, 11134),
  'USF2': (9028, 7511),
  'Znf143_(16618-1-AP)': (20018, 13344),
  'ZEB1_(SC-25388)': (4842, 3264),
  'Pbx3': (9935, 8655),
  'MEF2A': (17612, 16883),
  'TCF12': (20437, 13228),
  'Max': (12553, 6947),
  'STAT5A_(SC-74442)': (7432, 7171),
  'NFIC_(SC-81335)': (29066, 28238),
  'Nrf1': (5689, 2505),
  'CEBPB_(SC-150)': (5798, 5788),
  'FOXM1_(SC-502)': (22935, 19943),
  'RXRA': (1705, 1588),
  'ZBTB33': (2144, 1996),

In [12]:
# Negative-to-positive ratio for each TF, grouped by cell type.
# A TF without any positives is assigned an infinite ratio rather than
# triggering a division by zero.
ratios = {
    cell: {
        tf_name: (n_neg / n_pos if n_pos > 0 else float('inf'))
        for tf_name, (n_pos, n_neg) in counts.items()
    }
    for cell, counts in dicts.items()
}

ratios

{'MCF-7': {'CTCF': 0.7581067568211037,
  'TCF7L2': 0.9994173060114596,
  'ZNF217': 0.9992961287078934,
  'GATA3_(SC-268)': 1.0},
 'GM12878': {'CTCF': 0.5821178347919316,
  'YY1_(SC-281)': 0.51877540486483,
  'TBP': 0.857622565480188,
  'Egr-1': 0.4016785101690762,
  'Mxi1_(AF4185)': 0.6166676057925283,
  'SRF': 0.728761993915282,
  'MAZ_(ab85725)': 0.4712734556188067,
  'ELK1_(1277-1)': 0.7297636103151862,
  'SIX5': 0.763163328515383,
  'USF-1': 0.6654054606810512,
  'SP1': 0.6726764576939939,
  'RFX5_(200-401-194)': 0.8951612903225806,
  'ELF1_(SC-631)': 0.3161144447343247,
  'ATF2_(SC-81188)': 0.9515112813963389,
  'NF-YB': 0.8368282600526118,
  'USF2': 0.8319672131147541,
  'Znf143_(16618-1-AP)': 0.6666000599460485,
  'ZEB1_(SC-25388)': 0.6741016109045849,
  'Pbx3': 0.8711625566180171,
  'MEF2A': 0.9586077674312968,
  'TCF12': 0.6472574252581103,
  'Max': 0.5534135266470166,
  'STAT5A_(SC-74442)': 0.964881593110872,
  'NFIC_(SC-81335)': 0.9715131080988096,
  'Nrf1': 0.44032343118298

In [13]:
# Average the negative/positive ratio over every (cell type, TF) pair,
# leaving out entries whose ratio is infinite.
all_ratios = [
    value
    for per_tf in ratios.values()
    for value in per_tf.values()
    if value != float('inf')
]

mean_ratio = np.mean(all_ratios)
print(f"Mean negative/positive ratio: {mean_ratio:.4f}")

Mean negative/positive ratio: 0.7830


In [4]:
# Cell types (top-level keys) currently stored in `dicts`.
dicts.keys()

dict_keys(['MCF-7', 'GM12878', 'HepG2', 'IMR90', 'HCT-116', 'PANC-1', 'K562', 'HEK293', 'A549'])

In [5]:
# Same per-TF count as for the sampled negatives, but restricted to a subset
# of cell types and reading the "ct_sampled_<TF>_neg_indices" datasets.
# Result: dicts[celltype][TF] = (num_positives, num_negatives)
data_folder = "/data/home/natant/Negatives/Data/Encode690/ENCODE_hg38_subset_101bp_celltypes_ATAC_H5_all_chr copy"
h5t_files = [f for f in os.listdir(data_folder) if f.endswith('.h5t')]
dicts = {}
for h5t_file in h5t_files:
    # The cell type is the part of the file name before the first dot.
    celltype = h5t_file.split('.')[0]
    # Only these cell types are processed in this cell.
    if celltype not in ['GM12878', 'HepG2', 'K562', 'A549']:
        continue
    file_path = os.path.join(data_folder, h5t_file)
    tf_stats = {}
    # Open through a context manager so the HDF5 handle is closed for every
    # file, including when a lookup below raises (it was previously leaked).
    with h5torch.File(file_path, 'r') as file:
        # Read the name column once per file instead of once per TF.
        prot_names = file["0/prot_names"][:]
        for name in prot_names:
            # The ATAC peak track is not a TF; skip it.
            if name == b'ATAC_peak':
                continue
            TF = name.decode()
            prot_mask = prot_names == TF.encode()
            central = file["central"][prot_mask]
            # Entries equal to 1 in the selected row(s) are counted as
            # positives. count_nonzero is vectorised and works for any shape,
            # unlike the builtin sum() on a squeezed array.
            num_positives = np.count_nonzero(central == 1)
            # Number of negatives = length of the first axis of the sampled
            # negative index dataset for this TF.
            neg_pos = file["unstructured/ct_sampled_"+TF+"_neg_indices"]
            num_negatives = neg_pos.shape[0]

            tf_stats[TF] = (int(num_positives), int(num_negatives))
    dicts[celltype] = tf_stats

In [6]:
# Show the collected counts: {celltype: {TF: (num_positives, num_negatives)}}.
dicts

{'GM12878': {'CTCF': (40107, 40107),
  'YY1_(SC-281)': (30998, 30998),
  'TBP': (14890, 14890),
  'Egr-1': (16324, 16324),
  'Mxi1_(AF4185)': (17747, 17747),
  'SRF': (8546, 8546),
  'MAZ_(ab85725)': (18972, 18972),
  'ELK1_(1277-1)': (5584, 5584),
  'SIX5': (4843, 4843),
  'USF-1': (9779, 9779),
  'SP1': (18248, 18248),
  'RFX5_(200-401-194)': (4340, 4340),
  'ELF1_(SC-631)': (22998, 22998),
  'ATF2_(SC-81188)': (23490, 23490),
  'NF-YB': (13305, 13305),
  'USF2': (9028, 9028),
  'Znf143_(16618-1-AP)': (20018, 20018),
  'ZEB1_(SC-25388)': (4842, 4842),
  'Pbx3': (9935, 9935),
  'MEF2A': (17612, 17612),
  'TCF12': (20437, 20437),
  'Max': (12553, 12553),
  'STAT5A_(SC-74442)': (7432, 7432),
  'NFIC_(SC-81335)': (29066, 29066),
  'Nrf1': (5689, 5689),
  'CEBPB_(SC-150)': (5798, 5798),
  'FOXM1_(SC-502)': (22935, 22935),
  'RXRA': (1705, 1705),
  'ZBTB33': (2144, 2144),
  'ETS1': (4114, 4114),
  'ATF3': (1675, 1675),
  'NF-YA': (1841, 1841),
  'IKZF1_(IkN)_(UCLA)': (9066, 9066),
  'JunD'