In [None]:
# OLD CODE
import subprocess
import itertools
import os
import h5torch
from queue import Queue
from threading import Thread
import csv

# ---------------------------------------------------------------------------
# USER SETTINGS - edit only the values in this section
# ---------------------------------------------------------------------------

# Folder that is scanned for per-cell-type .h5t files
datafolder = "/data/home/natant/Negatives/Data/Encode690/ENCODE_hg38_subset_101bp_celltypes_ATAC_H5_all_chr"

# Cell types for which run combinations are generated
cell_types = [
    "K562",
    "GM12878",
    "HepG2",
    "A549",
]

# Negative sampling modes; every mode is combined with every cell type / TF pair
negative_sampling_modes = [
    "dinucl_sampled",
    "dinucl_shuffled",
    "shuffled",
    "neighbors",
    "celltype",
]

# Number of worker threads, i.e. training subprocesses running at the same time
max_concurrent_models = 4

# Where the combinations CSV is written; also forwarded to the training script as --output_dir
output_dir = "/data/home/natant/Negatives/Runs/full_run_3"

# Forwarded to the training script as --group_name
group_name = "full_run_3_DEBUG"
# ---------------------------------------------------------------------------



# Maps each cell type to the list of TF names found in its .h5t file
cell_types_dict = {}

for file_name in os.listdir(datafolder):
    # Only .h5t files contribute; every other entry in the folder is skipped
    if not file_name.endswith(".h5t"):
        continue

    # The cell type is the part of the file name before the first "."
    cell_type = file_name.split(".")[0]
    file_path = os.path.join(datafolder, file_name)

    # Read the stored protein names and decode each one once while the file is open
    with h5torch.File(file_path, 'r') as h5_file:
        prot_names = h5_file["0/prot_names"][:]
        decoded_names = [raw_name.decode('utf-8') for raw_name in prot_names]

    # Every name except the "ATAC_peak" entry is kept as a TF of this cell type
    tf_list = [decoded for decoded in decoded_names if decoded != "ATAC_peak"]
    cell_types_dict[cell_type] = tf_list




# Number of (cell type, TF) pairs across every .h5t file that was read
total_combos = sum(map(len, cell_types_dict.values()))
print(f"Total TF-cell type combinations: {total_combos}")

# Requested cell types without a matching .h5t file are reported in red
unused_cell_types = [ct for ct in cell_types if ct not in cell_types_dict]
if len(unused_cell_types) > 0:
    unused_listing = ', '.join(unused_cell_types)
    print(f"\033[91mWarning: The following cell types are not used in this debugging mode: {unused_listing}\033[0m")



# One (cell type, TF, negative sampling mode, cross-validation index) tuple per run.
# Only cell types listed in `cell_types` are kept; for each of them the product runs
# over its TFs, then the sampling modes, then the six cross-validation indices 0-5.
cell_tf_neg_combinations = [
    (cell_type, tf, neg_mode, fold)
    for cell_type in cell_types_dict
    if cell_type in cell_types
    for tf, neg_mode, fold in itertools.product(
        cell_types_dict[cell_type], negative_sampling_modes, range(6)
    )
]


# Define the CSV file path
csv_file_path = os.path.join(output_dir, "model_combinations.csv")

# Make sure the output directory exists: open() below raises FileNotFoundError
# when the directory has not been created yet.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the header followed by one row per combination
with open(csv_file_path, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["Cell Type", "TF", "Negative Sampling Mode", "Cross Val Fold"])
    csv_writer.writerows(cell_tf_neg_combinations)

print(f"CSV file with model combinations written to: {csv_file_path}")

# Create the queue the worker threads pull their combinations from
queue = Queue()

# Populate the queue in the same order the combinations were generated
for combination in cell_tf_neg_combinations:
    queue.put(combination)

# Function to process combinations from the queue
def worker():
    """Run queued training jobs one after another until the shared queue is empty.

    Each queue item is a ``(cell_type, tf, neg_mode, cross_val_set)`` tuple. For every
    item the training script is launched as a subprocess and this thread waits for it
    to exit before taking the next item. A non-zero exit code is reported and the
    worker moves on to the next item.

    Reads the module-level names ``queue``, ``datafolder``, ``output_dir`` and
    ``group_name``.
    """
    # Imported locally so the function is self-contained; the import statement resolves
    # the stdlib module even though the global name `queue` is bound to a Queue instance.
    from queue import Empty

    while True:
        # Checking empty() and then calling a blocking get() is racy with several
        # workers: another thread can take the last item in between, which would leave
        # this thread blocked forever. A non-blocking get avoids that.
        try:
            cell_type, tf, neg_mode, cross_val_set = queue.get_nowait()
        except Empty:
            return

        try:
            command = [
                "python",
                "/data/home/natant/Negatives/TFBS_negatives/utils/train_model.py",
                "--datafolder", datafolder,
                "--TF", tf,
                "--celltype", cell_type,
                "--neg_mode", neg_mode,
                "--devices", "1",
                "--cross_val_set", str(cross_val_set),
                "--learning_rate", "0.0001",
                "--n_blocks", "2",
                "--target_hsize", "128",
                "--batch_size", "256",
                "--output_dir", output_dir,
                "--early_stop_patience", "30",
                "--early_stop_metric", "AUROC",
                "--early_stop_mode", "max",
                "--group_name", group_name,
                "--test"
            ]
            completed = subprocess.run(command)
            # Surface failed runs instead of dropping them silently
            if completed.returncode != 0:
                print(
                    f"Run failed (exit code {completed.returncode}): "
                    f"{cell_type}, {tf}, {neg_mode}, {cross_val_set}"
                )
        finally:
            # Always mark the item as handled, even if launching the subprocess raised
            queue.task_done()

# Build one worker thread per allowed concurrent model
threads = [Thread(target=worker) for _ in range(max_concurrent_models)]

# Launch all of them
for t in threads:
    t.start()

# Block until every worker has returned
for t in threads:
    t.join()



In [3]:
import pandas as pd
import subprocess
import itertools
import os
import h5torch
from queue import Queue
from threading import Thread
import csv

# Load the missing runs from the CSV file.
# missing_runs_path is the location of the CSV; missing_runs_df holds its contents
# as a pandas DataFrame (read with the default pd.read_csv settings).
missing_runs_path = "/data/home/natant/Negatives/Runs/full_run_3_missing/missing_runs.csv"
missing_runs_df = pd.read_csv(missing_runs_path)

In [4]:
# Display the loaded table (its column names are what later cells must index by)
missing_runs_df

Unnamed: 0,TF,celltype,neg_mode,cross_val_set,group_name
0,FOXA1_(SC-101058),A549,dinucl_shuffled,5,full_run_3
1,FOXA1_(SC-101058),A549,shuffled,3,full_run_3
2,Znf143_(16618-1-AP),GM12878,neighbors,1,full_run_3
3,Znf143_(16618-1-AP),GM12878,neighbors,2,full_run_3
4,Znf143_(16618-1-AP),GM12878,shuffled,1,full_run_3
5,Znf143_(16618-1-AP),GM12878,shuffled,2,full_run_3
6,Znf143_(16618-1-AP),GM12878,shuffled,3,full_run_3
7,Znf143_(16618-1-AP),GM12878,shuffled,4,full_run_3
8,Znf143_(16618-1-AP),GM12878,shuffled,5,full_run_3
9,MYBL2_(SC-81192),HepG2,dinucl_shuffled,0,full_run_3


In [None]:
import pandas as pd
import subprocess
import itertools
import os
import h5torch
from queue import Queue
from threading import Thread
import csv

# Load the missing runs from the CSV file
missing_runs_path = "/data/home/natant/Negatives/Runs/full_run_3_missing/missing_runs.csv"
missing_runs_df = pd.read_csv(missing_runs_path)

# Column names as they appear in missing_runs.csv, listed in the order
# missing_worker() unpacks them: cell type, TF, negative sampling mode, cross-val set.
# (The headers "Cell Type" / "Negative Sampling Mode" / "Cross Val Fold" belong to
# model_combinations.csv, not to this file, and raise KeyError here.)
missing_run_columns = ["celltype", "TF", "neg_mode", "cross_val_set"]
absent_columns = [col for col in missing_run_columns if col not in missing_runs_df.columns]
if absent_columns:
    raise KeyError(f"{missing_runs_path} is missing expected column(s): {absent_columns}")

# Create a queue for the missing runs
missing_queue = Queue()

# Add one plain (celltype, TF, neg_mode, cross_val_set) tuple per CSV row
for combination in missing_runs_df[missing_run_columns].itertuples(index=False, name=None):
    missing_queue.put(combination)

print(f"Loaded {missing_queue.qsize()} missing runs to process")

# Define worker function specifically for missing runs
def missing_worker():
    """Run queued missing training jobs one after another until the queue is empty.

    Each item of ``missing_queue`` is a ``(cell_type, tf, neg_mode, set_idx)`` tuple.
    For every item the training script is launched as a subprocess with
    ``--group_name`` set to ``f"{group_name}_recovery"``, and this thread waits for it
    to exit before taking the next item. A non-zero exit code is reported and the
    worker moves on to the next item.

    Reads the module-level names ``missing_queue``, ``datafolder``, ``output_dir`` and
    ``group_name``; the last three are not assigned in this cell and must already be
    defined by an earlier cell.
    """
    # Imported locally so the function does not depend on an import outside this block
    from queue import Empty

    while True:
        # Checking empty() and then calling a blocking get() is racy with several
        # workers: another thread can take the last item in between, which would leave
        # this thread blocked forever. A non-blocking get avoids that.
        try:
            cell_type, tf, neg_mode, set_idx = missing_queue.get_nowait()
        except Empty:
            return

        print(f"Processing missing run: {cell_type}, {tf}, {neg_mode}, {set_idx}")
        try:
            command = [
                "python",
                "/data/home/natant/Negatives/TFBS_negatives/utils/train_model.py",
                "--datafolder", datafolder,
                "--TF", tf,
                "--celltype", cell_type,
                "--neg_mode", neg_mode,
                "--devices", "1",
                "--cross_val_set", str(set_idx),
                "--learning_rate", "0.0001",
                "--n_blocks", "2",
                "--target_hsize", "128",
                "--batch_size", "256",
                "--output_dir", output_dir,
                "--early_stop_patience", "30",
                "--early_stop_metric", "AUROC",
                "--early_stop_mode", "max",
                "--group_name", f"{group_name}_recovery",
                "--test"
            ]
            completed = subprocess.run(command)
        finally:
            # Always mark the item as handled, even if launching the subprocess raised
            missing_queue.task_done()

        # Report the outcome instead of announcing success unconditionally
        if completed.returncode == 0:
            print(f"Completed missing run: {cell_type}, {tf}, {neg_mode}, {set_idx}")
        else:
            print(
                f"Failed missing run (exit code {completed.returncode}): "
                f"{cell_type}, {tf}, {neg_mode}, {set_idx}"
            )

# Build one daemon worker thread per allowed concurrent model
missing_threads = [
    Thread(target=missing_worker, daemon=True)
    for _ in range(max_concurrent_models)
]

# Launch all of them
for t in missing_threads:
    t.start()

# Block until every worker has returned
for t in missing_threads:
    t.join()

print("All missing runs have been processed!")