In [1]:
datafolder = "/data/home/natant/Negatives/Data/Encode690/ENCODE_hg38_subset_101bp_celltypes_ATAC_H5"

In [2]:
import os
import h5torch

# Initialize an empty dictionary to store cell types and their corresponding TFs
cell_types_dict = {}

# Iterate through all files in the datafolder
for file_name in os.listdir(datafolder):
    if file_name.endswith(".h5t"):  # Check if the file is an .h5t file
        cell_type = file_name.split(".")[0]  # Extract the cell type from the file name
        file_path = os.path.join(datafolder, file_name)
        
        # Open the .h5t file and extract TFs
        with h5torch.File(file_path, 'r') as h5_file:
            prot_names = h5_file["0/prot_names"][:]
            tf_list = [name.decode('utf-8') for name in prot_names if name.decode('utf-8') != "ATAC_peak"]
        
        # Add the cell type and its TFs to the dictionary
        cell_types_dict[cell_type] = tf_list

print(cell_types_dict)

{'MCF-7': ['CTCF', 'TCF7L2', 'ZNF217', 'GATA3_(SC-268)'], 'GM12878': ['CTCF', 'YY1_(SC-281)', 'TBP', 'Egr-1', 'Mxi1_(AF4185)', 'SRF', 'MAZ_(ab85725)', 'ELK1_(1277-1)', 'SIX5', 'USF-1', 'SP1', 'RFX5_(200-401-194)', 'ELF1_(SC-631)', 'ATF2_(SC-81188)', 'NF-YB', 'USF2', 'Znf143_(16618-1-AP)', 'ZEB1_(SC-25388)', 'Pbx3', 'MEF2A', 'TCF12', 'Max', 'STAT5A_(SC-74442)', 'NFIC_(SC-81335)', 'Nrf1', 'CEBPB_(SC-150)', 'FOXM1_(SC-502)', 'RXRA', 'ZBTB33', 'ETS1', 'ATF3', 'NF-YA', 'IKZF1_(IkN)_(UCLA)', 'JunD', 'ZZZ3', 'ZNF274'], 'HepG2': ['ZBTB33', 'USF-1', 'SP1', 'FOXA1_(SC-101058)', 'CTCF', 'MafK_(ab50322)', 'MafF_(M8194)', 'FOSL2', 'YY1_(SC-281)', 'JunD', 'ELF1_(SC-631)', 'Mxi1_(AF4185)', 'ATF3', 'RFX5_(200-401-194)', 'Max', 'RXRA', 'ZBTB7A_(SC-34508)', 'MAZ_(ab85725)', 'TBP', 'TEAD4_(SC-101184)', 'CEBPB_(SC-150)', 'USF2', 'SRF', 'MYBL2_(SC-81192)', 'NFIC_(SC-81335)', 'ARID3A_(NB100-279)', 'CEBPD_(SC-636)', 'Nrf1', 'HSF1', 'TCF12', 'TCF7L2', 'BHLHE40', 'IRF3', 'ZNF274'], 'IMR90': ['MafK_(ab50322)'],

In [3]:
total_combos = sum(len(tfs) for tfs in cell_types_dict.values())
print(f"Total TF-cell type combinations: {total_combos}")

Total TF-cell type combinations: 132


In [4]:
cell_types_dict

{'MCF-7': ['CTCF', 'TCF7L2', 'ZNF217', 'GATA3_(SC-268)'],
 'GM12878': ['CTCF',
  'YY1_(SC-281)',
  'TBP',
  'Egr-1',
  'Mxi1_(AF4185)',
  'SRF',
  'MAZ_(ab85725)',
  'ELK1_(1277-1)',
  'SIX5',
  'USF-1',
  'SP1',
  'RFX5_(200-401-194)',
  'ELF1_(SC-631)',
  'ATF2_(SC-81188)',
  'NF-YB',
  'USF2',
  'Znf143_(16618-1-AP)',
  'ZEB1_(SC-25388)',
  'Pbx3',
  'MEF2A',
  'TCF12',
  'Max',
  'STAT5A_(SC-74442)',
  'NFIC_(SC-81335)',
  'Nrf1',
  'CEBPB_(SC-150)',
  'FOXM1_(SC-502)',
  'RXRA',
  'ZBTB33',
  'ETS1',
  'ATF3',
  'NF-YA',
  'IKZF1_(IkN)_(UCLA)',
  'JunD',
  'ZZZ3',
  'ZNF274'],
 'HepG2': ['ZBTB33',
  'USF-1',
  'SP1',
  'FOXA1_(SC-101058)',
  'CTCF',
  'MafK_(ab50322)',
  'MafF_(M8194)',
  'FOSL2',
  'YY1_(SC-281)',
  'JunD',
  'ELF1_(SC-631)',
  'Mxi1_(AF4185)',
  'ATF3',
  'RFX5_(200-401-194)',
  'Max',
  'RXRA',
  'ZBTB7A_(SC-34508)',
  'MAZ_(ab85725)',
  'TBP',
  'TEAD4_(SC-101184)',
  'CEBPB_(SC-150)',
  'USF2',
  'SRF',
  'MYBL2_(SC-81192)',
  'NFIC_(SC-81335)',
  'ARID3A_(NB

In [6]:
import subprocess
import itertools
import os

# Define the cell types
cell_types = ["MCF-7", "HCT-116"]  # Add more cell types as needed
negative_sampling_modes = ["dinucl_sampled", "dinucl_shuffled", "shuffled", "neighbors"]

In [7]:
# Create a list to hold all combinations of cell types and TFs
combinations = list(itertools.product(cell_types, negative_sampling_modes))

In [8]:
# Generate combinations of cell types, their TFs, and negative sampling modes
cell_tf_neg_combinations = [
    (cell_type, tf, neg_mode, i)
    for cell_type, tfs in cell_types_dict.items()
    if cell_type in cell_types
    for tf in tfs
    for neg_mode in negative_sampling_modes
    for i in range(6)
]
cell_tf_neg_combinations

[('MCF-7', 'CTCF', 'dinucl_sampled', 0),
 ('MCF-7', 'CTCF', 'dinucl_sampled', 1),
 ('MCF-7', 'CTCF', 'dinucl_sampled', 2),
 ('MCF-7', 'CTCF', 'dinucl_sampled', 3),
 ('MCF-7', 'CTCF', 'dinucl_sampled', 4),
 ('MCF-7', 'CTCF', 'dinucl_sampled', 5),
 ('MCF-7', 'CTCF', 'dinucl_shuffled', 0),
 ('MCF-7', 'CTCF', 'dinucl_shuffled', 1),
 ('MCF-7', 'CTCF', 'dinucl_shuffled', 2),
 ('MCF-7', 'CTCF', 'dinucl_shuffled', 3),
 ('MCF-7', 'CTCF', 'dinucl_shuffled', 4),
 ('MCF-7', 'CTCF', 'dinucl_shuffled', 5),
 ('MCF-7', 'CTCF', 'shuffled', 0),
 ('MCF-7', 'CTCF', 'shuffled', 1),
 ('MCF-7', 'CTCF', 'shuffled', 2),
 ('MCF-7', 'CTCF', 'shuffled', 3),
 ('MCF-7', 'CTCF', 'shuffled', 4),
 ('MCF-7', 'CTCF', 'shuffled', 5),
 ('MCF-7', 'CTCF', 'neighbors', 0),
 ('MCF-7', 'CTCF', 'neighbors', 1),
 ('MCF-7', 'CTCF', 'neighbors', 2),
 ('MCF-7', 'CTCF', 'neighbors', 3),
 ('MCF-7', 'CTCF', 'neighbors', 4),
 ('MCF-7', 'CTCF', 'neighbors', 5),
 ('MCF-7', 'TCF7L2', 'dinucl_sampled', 0),
 ('MCF-7', 'TCF7L2', 'dinucl_samp

In [9]:
len(cell_tf_neg_combinations)

192

In [10]:
cell_tf_neg_combinations

[('MCF-7', 'CTCF', 'dinucl_sampled', 0),
 ('MCF-7', 'CTCF', 'dinucl_sampled', 1),
 ('MCF-7', 'CTCF', 'dinucl_sampled', 2),
 ('MCF-7', 'CTCF', 'dinucl_sampled', 3),
 ('MCF-7', 'CTCF', 'dinucl_sampled', 4),
 ('MCF-7', 'CTCF', 'dinucl_sampled', 5),
 ('MCF-7', 'CTCF', 'dinucl_shuffled', 0),
 ('MCF-7', 'CTCF', 'dinucl_shuffled', 1),
 ('MCF-7', 'CTCF', 'dinucl_shuffled', 2),
 ('MCF-7', 'CTCF', 'dinucl_shuffled', 3),
 ('MCF-7', 'CTCF', 'dinucl_shuffled', 4),
 ('MCF-7', 'CTCF', 'dinucl_shuffled', 5),
 ('MCF-7', 'CTCF', 'shuffled', 0),
 ('MCF-7', 'CTCF', 'shuffled', 1),
 ('MCF-7', 'CTCF', 'shuffled', 2),
 ('MCF-7', 'CTCF', 'shuffled', 3),
 ('MCF-7', 'CTCF', 'shuffled', 4),
 ('MCF-7', 'CTCF', 'shuffled', 5),
 ('MCF-7', 'CTCF', 'neighbors', 0),
 ('MCF-7', 'CTCF', 'neighbors', 1),
 ('MCF-7', 'CTCF', 'neighbors', 2),
 ('MCF-7', 'CTCF', 'neighbors', 3),
 ('MCF-7', 'CTCF', 'neighbors', 4),
 ('MCF-7', 'CTCF', 'neighbors', 5),
 ('MCF-7', 'TCF7L2', 'dinucl_sampled', 0),
 ('MCF-7', 'TCF7L2', 'dinucl_samp

In [None]:
from queue import Queue
from threading import Thread

# Limit the number of concurrent processes
max_concurrent_models = 4


# Create a queue to hold the combinations
queue = Queue()

# Populate the queue with cell_tf_neg_combinations
for combination in cell_tf_neg_combinations:
    queue.put(combination)

# Function to process combinations from the queue
def worker():
    while not queue.empty():
        cell_type, tf, neg_mode, set = queue.get()
        command = [
            "python", 
            "/data/home/natant/Negatives/TFBS_negatives/utils/train_model.py", 
            "--TF", tf, 
            "--celltype", cell_type, 
            "--neg_mode", neg_mode, 
            "--devices", "3",
            "--cross_val_set", set
        ]
        subprocess.run(command)
        queue.task_done()

# Create and start threads
threads = []
for _ in range(max_concurrent_models):
    t = Thread(target=worker)
    t.start()
    threads.append(t)

# Wait for all threads to finish
for t in threads:
    t.join()



Exception in thread Thread-6 (worker):
Traceback (most recent call last):
  File "/data/home/natant/anaconda3/envs/Negs/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
Exception in thread Thread-7 (worker):
Traceback (most recent call last):
  File "/data/home/natant/anaconda3/envs/Negs/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
Exception in thread Thread-8 (worker):
Traceback (most recent call last):
  File "/data/home/natant/anaconda3/envs/Negs/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/data/home/natant/anaconda3/envs/Negs/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
Exception in thread Thread-9 (worker):
Traceback (most recent call last):
  File "/data/home/natant/anaconda3/envs/Negs/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/data/home/natant/anaconda3/envs/Negs/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 766, in run_closur