In [1]:
import os
import h5torch
import numpy as np
import requests

In [2]:
data_folder = "/data/home/natant/Negatives/Data/Encode690/ENCODE_hg38_subset_101bp_celltypes_ATAC_H5_all_chr copy"
h5t_files = [f for f in os.listdir(data_folder) if f.endswith('.h5t')]


In [15]:
import pandas as pd
used_celltypes = ["GM12878", "K562", "HepG2", "A549", "HEK293", "IMR90", "PANC-1"]
celltype_tfs = {}
for h5t_file in h5t_files:
    file_path = os.path.join(data_folder, h5t_file)
    file = h5torch.File(file_path, 'r')
    celltype = h5t_file.replace('.h5t', '')
    if celltype in used_celltypes:
        TFs = file["0/prot_names"][:]
        TFs = [tf.decode() for tf in TFs if tf != b'ATAC_peak']
        celltype_tfs[celltype] = TFs

In [19]:
celltype_tfs_df = pd.DataFrame([
    {"Cell Type": ct, "TFs": ", ".join(tfs), "count": len(tfs)}
    for ct, tfs in celltype_tfs.items()
])
celltype_tfs_df = celltype_tfs_df.sort_values("count", ascending=False).reset_index(drop=True)
celltype_tfs_df

Unnamed: 0,Cell Type,TFs,count
0,GM12878,"CTCF, YY1_(SC-281), TBP, Egr-1, Mxi1_(AF4185),...",36
1,K562,"ZBTB33, CTCF, Egr-1, MAZ_(ab85725), MafK_(ab50...",36
2,HepG2,"ZBTB33, USF-1, SP1, FOXA1_(SC-101058), CTCF, M...",34
3,A549,"CTCF, YY1_(SC-281), CREB1_(SC-240), Max, TCF12...",14
4,HEK293,"CTCF, TCF7L2",2
5,IMR90,MafK_(ab50322),1
6,PANC-1,TCF7L2,1


In [22]:
latex_code = celltype_tfs_df.to_latex(index=False, caption="Transcription factor (TF) counts for each cell type used in this study.", label="tab:celltype_tf_counts")
print(latex_code)

\begin{table}
\caption{Transcription factor (TF) counts for each cell type used in this study.}
\label{tab:celltype_tf_counts}
\begin{tabular}{llr}
\toprule
Cell Type & TFs & count \\
\midrule
GM12878 & CTCF, YY1_(SC-281), TBP, Egr-1, Mxi1_(AF4185), SRF, MAZ_(ab85725), ELK1_(1277-1), SIX5, USF-1, SP1, RFX5_(200-401-194), ELF1_(SC-631), ATF2_(SC-81188), NF-YB, USF2, Znf143_(16618-1-AP), ZEB1_(SC-25388), Pbx3, MEF2A, TCF12, Max, STAT5A_(SC-74442), NFIC_(SC-81335), Nrf1, CEBPB_(SC-150), FOXM1_(SC-502), RXRA, ZBTB33, ETS1, ATF3, NF-YA, IKZF1_(IkN)_(UCLA), JunD, ZZZ3, ZNF274 & 36 \\
K562 & ZBTB33, CTCF, Egr-1, MAZ_(ab85725), MafK_(ab50322), MafF_(M8194), Max, YY1_(SC-281), TBP, ATF3, JunD, RFX5_(200-401-194), SRF, ATF1_(06-325), SIX5, SP1, NF-YA, NF-YB, USF-1, Znf143_(16618-1-AP), ELF1_(SC-631), TEAD4_(SC-101184), CEBPB_(SC-150), FOSL1_(SC-183), ZNF274, SETDB1, ETS1, ZBTB7A_(SC-34508), NR2F2_(SC-271940), MEF2A, STAT5A_(SC-74442), Nrf1, Mxi1_(AF4185), ELK1_(1277-1), USF2, Bach1_(sc-14700) & 

In [13]:
celltype_tfs

{'GM12878': ['ATAC_peak',
  'CTCF',
  'YY1_(SC-281)',
  'TBP',
  'Egr-1',
  'Mxi1_(AF4185)',
  'SRF',
  'MAZ_(ab85725)',
  'ELK1_(1277-1)',
  'SIX5',
  'USF-1',
  'SP1',
  'RFX5_(200-401-194)',
  'ELF1_(SC-631)',
  'ATF2_(SC-81188)',
  'NF-YB',
  'USF2',
  'Znf143_(16618-1-AP)',
  'ZEB1_(SC-25388)',
  'Pbx3',
  'MEF2A',
  'TCF12',
  'Max',
  'STAT5A_(SC-74442)',
  'NFIC_(SC-81335)',
  'Nrf1',
  'CEBPB_(SC-150)',
  'FOXM1_(SC-502)',
  'RXRA',
  'ZBTB33',
  'ETS1',
  'ATF3',
  'NF-YA',
  'IKZF1_(IkN)_(UCLA)',
  'JunD',
  'ZZZ3',
  'ZNF274'],
 'HepG2': ['ATAC_peak',
  'ZBTB33',
  'USF-1',
  'SP1',
  'FOXA1_(SC-101058)',
  'CTCF',
  'MafK_(ab50322)',
  'MafF_(M8194)',
  'FOSL2',
  'YY1_(SC-281)',
  'JunD',
  'ELF1_(SC-631)',
  'Mxi1_(AF4185)',
  'ATF3',
  'RFX5_(200-401-194)',
  'Max',
  'RXRA',
  'ZBTB7A_(SC-34508)',
  'MAZ_(ab85725)',
  'TBP',
  'TEAD4_(SC-101184)',
  'CEBPB_(SC-150)',
  'USF2',
  'SRF',
  'MYBL2_(SC-81192)',
  'NFIC_(SC-81335)',
  'ARID3A_(NB100-279)',
  'CEBPD_(SC-636)

In [4]:
used_celltypes = ["GM12878", "K562", "HepG2", "A549", "HEK293", "IMR90", "PANC-1"]

In [5]:
celltype_tf_df = celltype_tf_df[celltype_tf_df['Celltype'].isin(used_celltypes)]

In [6]:
celltype_tf_df

Unnamed: 0,Celltype,TF_count
1,GM12878,36
2,HepG2,34
3,IMR90,1
5,PANC-1,1
6,K562,36
7,HEK293,2
8,A549,14
