In [1]:
import os

import h5torch
import numpy as np
import pandas as pd
import requests

In [25]:
# Build dicts[celltype][tf_name] = (num_positives, num_negatives) for every
# .h5t file in data_folder. Counts are taken over the columns of `central`
# that remain after dropping the ATAC_peak row and every column in which all
# remaining rows are 0.
data_folder = "/data/home/natant/Negatives/Data/Encode690/ENCODE_hg38_subset_101bp_celltypes_ATAC_H5_all_chr copy"
h5t_files = [f for f in os.listdir(data_folder) if f.endswith('.h5t')]
dicts = {}
for h5t_file in h5t_files:
    file_path = os.path.join(data_folder, h5t_file)
    # Read both datasets fully into memory, then let the context manager
    # close the HDF5 handle instead of leaking one handle per file.
    with h5torch.File(file_path, 'r') as file:
        prot_names = file["0/prot_names"][:]
        central = file["central"][:]
    # The cell type is the part of the filename before the first '.'.
    celltype = h5t_file.split('.')[0]

    # Drop the ATAC_peak entry from the data rows AND from the names using the
    # same indices, so names and rows stay aligned wherever ATAC_peak sits
    # (the previous `prot_names[1:]` was only correct when it was at index 0).
    atac_idx = np.where(prot_names == b"ATAC_peak")[0]
    central_no_atac = np.delete(central, atac_idx, axis=0)
    tf_names_no_atac = np.delete(prot_names, atac_idx, axis=0)

    # Remove columns where every remaining row is 0.
    mask_all_zeros = (central_no_atac == 0).all(axis=0)
    filtered_no_ATAC = central_no_atac[:, ~mask_all_zeros]

    # Count 0's and 1's for each row
    zeros_per_row = (filtered_no_ATAC == 0).sum(axis=1)
    ones_per_row = (filtered_no_ATAC == 1).sum(axis=1)

    tf_stats = {}
    for name, num_positives, num_negatives in zip(tf_names_no_atac, ones_per_row, zeros_per_row):
        # Names are stored as bytes; convert numpy ints to plain Python ints.
        tf_stats[name.decode()] = (int(num_positives), int(num_negatives))
    dicts[celltype] = tf_stats








In [26]:
# Display the nested mapping {celltype: {tf_name: (num_positives, num_negatives)}}.
dicts

{'MCF-7': {'CTCF': (21001, 36689),
  'TCF7L2': (10297, 405259),
  'ZNF217': (9945, 404761),
  'GATA3_(SC-268)': (6083, 415270)},
 'GM12878': {'CTCF': (40107, 398210),
  'YY1_(SC-281)': (30998, 349533),
  'TBP': (14890, 432228),
  'Egr-1': (16324, 424484),
  'Mxi1_(AF4185)': (17747, 403510),
  'SRF': (8546, 468662),
  'MAZ_(ab85725)': (18972, 395102),
  'ELK1_(1277-1)': (5584, 483172),
  'SIX5': (4843, 489275),
  'USF-1': (9779, 475641),
  'SP1': (18248, 376855),
  'RFX5_(200-401-194)': (4340, 504861),
  'ELF1_(SC-631)': (22998, 374019),
  'ATF2_(SC-81188)': (23490, 372754),
  'NF-YB': (13305, 474014),
  'USF2': (9028, 477902),
  'Znf143_(16618-1-AP)': (20018, 418516),
  'ZEB1_(SC-25388)': (4842, 506221),
  'Pbx3': (9935, 480355),
  'MEF2A': (17612, 408335),
  'TCF12': (20437, 394834),
  'Max': (12553, 438130),
  'STAT5A_(SC-74442)': (7432, 476836),
  'NFIC_(SC-81335)': (29066, 367313),
  'Nrf1': (5689, 496726),
  'CEBPB_(SC-150)': (5798, 484441),
  'FOXM1_(SC-502)': (22935, 363802),
  

In [27]:
# Create one table per celltype: rows are TF names, columns are the
# positive and negative counts taken from `dicts`.
# (pandas is imported as `pd` in the notebook's top import cell.)
tf_tables = {}
for cell, stats in dicts.items():
    # Each value in `stats` is a (positives, negatives) tuple; a single pass
    # over the values avoids looking every TF up twice by key.
    counts = list(stats.values())
    tf_tables[cell] = pd.DataFrame(
        {
            'Positives': [pos for pos, _ in counts],
            'Negatives': [neg for _, neg in counts],
        },
        index=list(stats.keys()),
    )

# Example: display the table for A549
tf_tables['A549']

Unnamed: 0,Positives,Negatives
CTCF,45735,175687
YY1_(SC-281),10277,234169
CREB1_(SC-240),15907,210554
Max,9891,231598
TCF12,20906,190336
FOSL2,28763,183745
ELF1_(SC-631),8618,232142
BHLHE40,3125,260466
ATF3,6588,237017
USF-1,8448,209389


In [32]:
# Print every per-celltype table as plain text, followed by a dashed rule.
separator = "\n" + "-"*40 + "\n"
for celltype, frame in tf_tables.items():
    print(f"Celltype: {celltype}")
    print(frame)
    print(separator)

Celltype: MCF-7
                Positives  Negatives
CTCF                21001      36689
TCF7L2              10297     405259
ZNF217               9945     404761
GATA3_(SC-268)       6083     415270

----------------------------------------

Celltype: GM12878
                     Positives  Negatives
CTCF                     40107     398210
YY1_(SC-281)             30998     349533
TBP                      14890     432228
Egr-1                    16324     424484
Mxi1_(AF4185)            17747     403510
SRF                       8546     468662
MAZ_(ab85725)            18972     395102
ELK1_(1277-1)             5584     483172
SIX5                      4843     489275
USF-1                     9779     475641
SP1                      18248     376855
RFX5_(200-401-194)        4340     504861
ELF1_(SC-631)            22998     374019
ATF2_(SC-81188)          23490     372754
NF-YB                    13305     474014
USF2                      9028     477902
Znf143_(16618-1-AP)     

In [31]:
# Collect every (celltype, TF, positives, negatives) entry whose positive
# count is strictly greater than its negative count.
more_positives = [
    (cell_name, tf_name, n_pos, n_neg)
    for cell_name, per_tf in dicts.items()
    for tf_name, (n_pos, n_neg) in per_tf.items()
    if n_pos > n_neg
]

more_positives

[('IMR90', 'MafK_(ab50322)', 40790, 0),
 ('HCT-116', 'CTCF', 50289, 46588),
 ('PANC-1', 'TCF7L2', 13382, 0),
 ('HEK293', 'CTCF', 47319, 12222)]

Of those, HEK293 is the only one used in training, I think?