In [1]:
import pandas as pd
import numpy as np
import h5torch
import py2bit
from tqdm import tqdm
from tqdm import trange
import os

In [4]:
# Open the H5torch file in read mode
h5t_file = "/data/home/natant/Negatives/testing_ground/20250402_test.h5t"


In [4]:
# temp func ins
h5t_loc = "/data/home/natant/Negatives/testing_ground/20250404_temp"
out_folder = "/data/home/natant/Negatives/testing_ground/20250404_temp"


if not os.path.exists(h5t_loc):
    raise FileNotFoundError(f"The folder {h5t_loc} does not exist.")

h5t_files = [os.path.join(h5t_loc, file) for file in os.listdir(h5t_loc) if file.endswith(".h5t")]

for h5t_file in h5t_files:
    celltype = os.path.splitext(os.path.basename(h5t_file))[0]
    print(f"Processing file: {h5t_file}")
    with h5torch.File(h5t_file, "r") as f:
        genome = {k : f["unstructured"][k] for k in list(f["unstructured"]) if k.startswith("chr")}

        prot_names = [name.decode("utf-8") for name in f["0/prot_names"]]
        if "ATAC_peak" not in prot_names:
            raise ValueError("ATAC_peak not found in prot_names.")

        # Exclude "ATAC_peak" explicitly
        for i, TF in enumerate(tqdm(prot_names)):
            if TF == "ATAC_peak":
                continue  # Skip ATAC_peak

            index = i
            pos_indices = np.where(f["central"][index, :] == 1)[0]

            # Write the positive sequences to a BED file
            output_path = os.path.join(out_folder, f"{celltype}_{TF}_positives.bed")
            with open(output_path, "w") as bed_file:
                for j in tqdm(pos_indices):
                    chr = f["1/peak_ix_to_chr"][:][j].astype(str)
                    pos = f["1/peak_ix_to_pos"][:][j]
                    start = pos - 50
                    end = pos + 51
                    bed_file.write(f"{chr}\t{start}\t{end}\t{TF}\n")




Processing file: /data/home/natant/Negatives/testing_ground/20250404_temp/MCF-7.h5t


  0%|          | 0/5 [00:00<?, ?it/s]

 20%|█▉        | 4102/20998 [00:11<00:46, 365.59it/s]
 20%|██        | 1/5 [00:11<00:44, 11.23s/it]


KeyboardInterrupt: 

chr1    101
chr1    101
chr1    101
chr1    101
chr1    101
       ... 
chrX    101
chrX    101
chrX    101
chrX    101
chrY    101
Length: 6588, dtype: int64

In [None]:
import pandas as pd

# The bed files with the negative samples have the following naming scheme: # {celltype}_{TF}_negatives.bed
negative_bed_files = [file for file in os.listdir(out_folder) if file.endswith("_negatives.bed")]

for h5t_file in h5t_files:
    celltype = os.path.splitext(os.path.basename(h5t_file))[0]
    print(f"Checking for negative bed files for cell type: {celltype}")
    
    matching_files = [file for file in negative_bed_files if file.startswith(celltype)]
    for bed_file_name in matching_files:
        bed_file_path = os.path.join(out_folder, bed_file_name)
        print(f"Processing negative bed file: {bed_file_path}")
        
        chromosomes = []
        centers = []
        lengths = []
        
        with open(bed_file_path, "r") as bed_file:
            temp_file = pd.read_csv(bed_file_path, sep="\t", header=None, names=["chromosome", "start", "end"])
            chromosomes = temp_file["chromosome"].tolist()
            #! filter out the special chromosomes??????? I think it's quite a large precentage....? Should I just keep them in?
            centers = ((temp_file["start"] + temp_file["end"]) // 2).tolist()
            lengths = (temp_file["end"] - temp_file["start"]).tolist() #! the weird R script returns 100bp negatives for 101bp positives for some reason??????
            
            with h5torch.File(h5t_file, "a") as f:   
                f.register(
                    np.stack(centers),
                    axis="unstructured",
                    name=f"sampled_negs_{TF}_pos",
                    mode="N-D",
                    dtype_save="int8",
                    dtype_load="int8",
                )
                f.register(
                    np.array(chromosomes).astype(bytes),
                    axis="unstructured",
                    name=f"sampled_negs_{TF}_chr",
                    mode="N-D",
                    dtype_save="bytes",
                    dtype_load="str",
                )
                f.register(
                    np.array(lengths),
                    axis="unstructured",
                    name=f"sampled_negs_{TF}_len",
                    mode="N-D",
                    dtype_save="int8",
                    dtype_load="int8",
                )


Checking for negative bed files for cell type: MCF-7
Checking for negative bed files for cell type: IMR90
Processing negative bed file: /data/home/natant/Negatives/testing_ground/20250404_temp/IMR90_MafK_(ab50322)_negatives.bed


TypeError: cannot convert the series to <class 'int'>

In [10]:
temp_file["end"] - temp_file["start"]

0        100
1        100
2        100
3        100
4        100
        ... 
39758    100
39759    100
39760    100
39761    100
39762    100
Length: 39763, dtype: int64

In [15]:
os.path.splitext(os.path.basename(h5t_file))[0]

'20250402_test_longer'

In [9]:
"".join([rev_mapping[l] for l in DNA_region_pos])

'TAATCTATAAAGTACTCTTACAAGTCCATATTGCCCAATAAATAAATAAGCAGTGGATATGAACAGGCTGTTCATATAGAAAGAAATTCCAACAAACACAT'

In [6]:
prot_names

['ATAC_peak',
 'CTCF',
 'YY1_(SC-281)',
 'CREB1_(SC-240)',
 'Max',
 'TCF12',
 'FOSL2',
 'ELF1_(SC-631)',
 'BHLHE40',
 'ATF3',
 'USF-1',
 'ETS1',
 'SIX5',
 'ZBTB33',
 'FOXA1_(SC-101058)']