In [2]:
import pandas as pd
import numpy as np
import h5torch
import py2bit
from tqdm import tqdm
from tqdm import trange

In [3]:
# Open the H5torch file in read mode
# NOTE(review): absolute, user-specific path; the notebook only runs where this file exists.
file_path = "/data/home/natant/Negatives/testing_ground/20250402_test.h5t"
f =  h5torch.File(file_path, "r")

In [4]:
# The first entry is the ATAC peaks; those obviously don't need dinucleotide-shuffled sequences.
f["0/prot_names"][:]

array([b'ATAC_peak', b'CTCF', b'YY1_(SC-281)', b'CREB1_(SC-240)', b'Max',
       b'TCF12', b'FOSL2', b'ELF1_(SC-631)', b'BHLHE40', b'ATF3',
       b'USF-1', b'ETS1', b'SIX5', b'ZBTB33', b'FOXA1_(SC-101058)'],
      dtype='|S17')

In [6]:
import sys,string,random

############################################################################################################### TODO: ANY PROBLEMS GOING FROM PYTHON 2 TO PYTHON 3
### ALSO TODO: I ADDED THE "N" TO THE CODE? NO EXTRA PROBLEMS??? DOUBLE CHECK!!!

##### CODE FROM https://github.com/wassermanlab/BiasAway/blob/master/altschulEriksonDinuclShuffle.py 

def computeCountAndLists(s):
  """Count nucleotides and dinucleotides of `s` and collect successor lists.

  The sequence is upper-cased first; the supported alphabet is A, C, G, T, N.

  Args:
    s: nucleotide sequence (str). May be empty.

  Returns:
    (nuclCnt, dinuclCnt, List) where
      nuclCnt[x]      -- number of occurrences of nucleotide x,
      dinuclCnt[x][y] -- number of (overlapping) occurrences of dinucleotide xy,
      List[x]         -- the nucleotides that follow x, in order of occurrence.
    For an empty sequence all counts are 0 and all lists are empty.

  Raises:
    KeyError: if a character outside the alphabet is hit while walking
      the sequence.
  """
  #WARNING: str.count('UU') returns 1 on word UUU since it only counts
  #nonoverlapping words. For this reason, we walk over adjacent pairs.
  nuclList = ["A","C","G","T","N"]
  s = s.upper()

  #Initialize successor lists and mono- and dinucleotide dictionaries
  List = {x: [] for x in nuclList}  #List is a dictionary of lists
  nuclCnt = {x: 0 for x in nuclList}
  dinuclCnt = {x: {y: 0 for y in nuclList} for x in nuclList}

  #Nothing to count; previously this case raised IndexError on s[0]
  if not s:
    return nuclCnt,dinuclCnt,List

  #The first letter is nobody's successor, so it is counted separately
  nuclCnt[s[0]] = 1
  for x, y in zip(s, s[1:]):
    List[x].append( y )
    nuclCnt[y] += 1
    dinuclCnt[x][y] += 1
  return nuclCnt,dinuclCnt,List
 
 
def chooseEdge(x,dinuclCnt):
  """Draw a successor of nucleotide `x`, weighted by the dinucleotide counts.

  One random number is drawn and compared against the cumulative
  fractions of the successors in the order A, C, N, G; T is the fallback.
  The count of the chosen dinucleotide in `dinuclCnt[x]` is decremented
  in place.

  Args:
    x: the nucleotide whose outgoing edge is chosen.
    dinuclCnt: dinuclCnt[x][y] = count of dinucleotide xy (mutated).

  Returns:
    The chosen successor nucleotide.
  """
  successors = dinuclCnt[x]
  total = (successors['A'] + successors['C'] + successors['G']
           + successors['T'] + successors['N'])
  draw = random.random()
  cumulative = 0
  for candidate in ('A','C','N','G'):
    cumulative += successors[candidate]
    if draw < float(cumulative)/float(total):
      successors[candidate] -= 1
      return candidate
  #None of the other successors was selected
  successors['T'] -= 1
  return 'T'


def connectedToLast(edgeList,nuclList,lastCh):
  """Check that every nucleotide can reach `lastCh` by following `edgeList`.

  Args:
    edgeList: list of [a, b] edges meaning "a is followed by b".
    nuclList: the nucleotides that occur in the sequence.
    lastCh:   the last nucleotide of the sequence.

  Returns:
    1 if every nucleotide of `nuclList` other than `lastCh` reaches
    `lastCh` through the edges, 0 otherwise.
  """
  reaches = {}
  for x in nuclList: reaches[x]=0

  #Nucleotides with a direct edge to lastCh
  for edge in edgeList:
    a = edge[0]; b = edge[1]
    if b==lastCh: reaches[a]=1

  #Propagate reachability until nothing changes. A fixed number of two
  #passes is only enough for a 4-letter alphabet; with five letters
  #(A,C,G,T,N) a chain such as A->C->G->T->N needs a third pass.
  changed = True
  while changed:
    changed = False
    for edge in edgeList:
      a = edge[0]; b = edge[1]
      if reaches[b]==1 and reaches[a]==0:
        reaches[a]=1
        changed = True

  for x in nuclList:
    if x!=lastCh and reaches[x]==0: return 0
  return 1
 

def eulerian(s):
  """Draw one candidate "last edge" per nucleotide and test its connectivity.

  For every nucleotide occurring in `s` except the final one, a successor
  is drawn with chooseEdge. The resulting edge list is then checked with
  connectedToLast.

  Args:
    s: nucleotide sequence (non-empty str). It is upper-cased here so that
       the membership test below agrees with the case-insensitive
       counting done by computeCountAndLists.

  Returns:
    (ok, edgeList, nuclList, lastCh):
      ok       -- result of connectedToLast for the drawn edges (1 or 0),
      edgeList -- list of [x, successor] pairs, one per nucleotide != lastCh,
      nuclList -- nucleotides occurring in `s`, in the order A,C,G,T,N,
      lastCh   -- last character of the (upper-cased) sequence.
  """
  s = s.upper()
  #Only the dinucleotide counts are needed here; chooseEdge decrements them
  _,dinuclCnt,_ = computeCountAndLists(s)
  #compute nucleotides appearing in s
  nuclList = [x for x in ["A","C","G","T","N"] if x in s]
  lastCh = s[-1]
  #one outgoing edge for every vertex except the last one
  edgeList = [[x,chooseEdge(x,dinuclCnt)] for x in nuclList if x!=lastCh]
  ok = connectedToLast(edgeList,nuclList,lastCh)
  return ok,edgeList,nuclList,lastCh


def shuffleEdgeList(L):
  """Randomly permute the list `L` in place and return it.

  Works from the back: a random element of the not-yet-fixed prefix is
  swapped into the last free slot, then the prefix shrinks by one.
  """
  remaining = len(L)
  while remaining > 1:
    pick = int(random.random() * remaining)
    L[pick], L[remaining-1] = L[remaining-1], L[pick]
    remaining -= 1
  return L


def dinuclShuffle(s):
  """Return a random shuffle of `s` that keeps its dinucleotide counts.

  The first and last characters stay in place. Candidate "last edges" are
  drawn with eulerian() until they are connected to the final character;
  the remaining successors of every nucleotide are shuffled and the
  sequence is rebuilt by following the successor lists.

  Args:
    s: nucleotide sequence over A, C, G, T, N (any case).

  Returns:
    The shuffled sequence, same length as `s`. Sequences shorter than
    three characters have no interior to permute and are returned
    unchanged; longer sequences are returned upper-cased.
  """
  #With fewer than 3 characters first and last are fixed and nothing is left
  #to shuffle. Without this guard "" raised IndexError and a single
  #character came back doubled (first char + last char).
  if len(s) < 3:
    return s

  #The successor lists are keyed by upper-case letters, so walk an
  #upper-case copy; lower-case input would otherwise raise KeyError below.
  s = s.upper()

  ok = 0
  while not ok:
    ok,edgeList,nuclList,lastCh = eulerian(s)
  nuclCnt,dinuclCnt,List = computeCountAndLists(s)

  #remove last edges from each vertex list, shuffle, then add back
  #the removed edges at end of vertex lists.
  for [x,y] in edgeList: List[x].remove(y)
  for x in nuclList: shuffleEdgeList(List[x])
  for [x,y] in edgeList: List[x].append(y)

  #construct the eulerian path
  L = [s[0]]; prevCh = s[0]
  for i in range(len(s)-2):
    ch = List[prevCh].pop(0)
    L.append( ch )
    prevCh = ch
  L.append(s[-1])
  return "".join(L)

In [8]:
# Integer encoding used for the stored genome and its inverse
mapping = {"A": 0, "T": 1, "C": 2, "G": 3, "N": 4}
rev_mapping = {v : k for k, v in mapping.items()}

genome = {k : f["unstructured"][k] for k in list(f["unstructured"]) if k.startswith("chr")}

# Read the peak -> chromosome / position lookups once. Slicing them with [:]
# inside the inner loop re-read both full datasets for every single peak.
peak_chrs = f["1/peak_ix_to_chr"][:].astype(str)
peak_positions = f["1/peak_ix_to_pos"][:]

# Skip entry 0 (the ATAC peaks); `index` is the row of `central` for this TF
for i, TF in enumerate(tqdm(f["0/prot_names"][1:])):
    index=i+1
    TF = TF.decode("utf-8")
    pos_indices = np.where(f["central"][index,:]==1)[0]

    # NOTE: re-created for every TF, so after the loop these only hold the last TF
    dinucl_shuffled_seqs = []
    chr_list = []

    for j in tqdm(pos_indices):
        chrom = peak_chrs[j]  # not named `chr`, which would shadow the builtin
        chr_list.append(chrom)
        pos = peak_positions[j]
        # 101-base window; NOTE(review): assumes `pos` is the peak centre and >= 50 -- confirm
        DNA_region_pos = genome[chrom][pos-50:pos+51] #! Is this correct???
        shuffled = dinuclShuffle("".join([rev_mapping[l] for l in DNA_region_pos]))
        encoded_shuffled = np.array([mapping[bp] for bp in shuffled], dtype="int8")
        dinucl_shuffled_seqs.append(encoded_shuffled)

    

100%|██████████| 5506/5506 [00:05<00:00, 1055.29it/s]
100%|██████████| 1158/1158 [00:01<00:00, 1068.55it/s]
100%|██████████| 1708/1708 [00:01<00:00, 1064.15it/s]
100%|██████████| 953/953 [00:00<00:00, 1032.97it/s]
100%|██████████| 2045/2045 [00:02<00:00, 979.14it/s] 
100%|██████████| 3189/3189 [00:02<00:00, 1080.03it/s]
100%|██████████| 892/892 [00:00<00:00, 1049.73it/s]
100%|██████████| 284/284 [00:00<00:00, 1084.81it/s]
100%|██████████| 650/650 [00:00<00:00, 1084.62it/s]
100%|██████████| 821/821 [00:00<00:00, 1072.57it/s]
100%|██████████| 561/561 [00:00<00:00, 1080.16it/s]
100%|██████████| 471/471 [00:00<00:00, 1075.68it/s]
100%|██████████| 754/754 [00:00<00:00, 982.48it/s]
100%|██████████| 771/771 [00:00<00:00, 916.61it/s]
100%|██████████| 14/14 [00:19<00:00,  1.36s/it]


In [11]:
# Sanity check: one row per shuffled sequence, one column per base of the window.
np.stack(dinucl_shuffled_seqs).shape

(771, 101)

In [12]:
# There should be one chromosome label per shuffled sequence.
np.array(chr_list).shape

(771,)

In [None]:
# Integer encoding used for the stored genome and its inverse
mapping = {"A": 0, "T": 1, "C": 2, "G": 3, "N": 4}
rev_mapping = {v : k for k, v in mapping.items()}

genome = {k : f["unstructured"][k] for k in list(f["unstructured"]) if k.startswith("chr")}

# Read the peak -> chromosome / position lookups once. Slicing them with [:]
# inside the loop re-read both full datasets for every single peak.
peak_chrs = f["1/peak_ix_to_chr"][:].astype(str)
peak_positions = f["1/peak_ix_to_pos"][:]

dinucl_shuffled_seqs = []
chr_list = []

# NOTE(review): `pos_indices` is not defined in this cell; it is whatever an
# earlier cell left in the kernel.
for j in tqdm(pos_indices):
    chrom = peak_chrs[j]  # not named `chr`, which would shadow the builtin
    chr_list.append(chrom)
    pos = peak_positions[j]
    # 101-base window; NOTE(review): assumes `pos` is the peak centre and >= 50 -- confirm
    DNA_region_pos = genome[chrom][pos-50:pos+51] #! Is this correct???
    shuffled = dinuclShuffle("".join([rev_mapping[l] for l in DNA_region_pos]))
    encoded_shuffled = np.array([mapping[bp] for bp in shuffled], dtype="int8")
    dinucl_shuffled_seqs.append(encoded_shuffled)




100%|██████████| 771/771 [00:00<00:00, 1046.38it/s]


In [45]:
# Show the encoded shuffles collected so far.
# NOTE(review): this prints every array in the list; a slice would keep the output readable.
dinucl_shuffled_seqs

[array([1, 0, 0, 2, 1, 3, 3, 2, 3, 0, 0, 3, 3, 1, 1, 3, 1, 3, 1, 0, 0, 0,
        2, 2, 1, 3, 0, 1, 3, 0, 2, 0, 3, 1, 1, 3, 3, 0, 3, 3, 3, 3, 3, 1,
        3, 1, 0, 1, 0, 1, 1, 3, 1, 3, 1, 3, 0, 2, 1, 1, 1, 1, 3, 0, 1, 3,
        3, 0, 2, 1, 1, 1, 2, 1, 0, 1, 3, 1, 3, 1, 2, 1, 2, 0, 3, 0, 0, 1,
        1, 2, 0, 3, 3, 0, 3, 3, 0, 3, 0, 0, 2], dtype=int8),
 array([3, 3, 3, 2, 2, 0, 3, 3, 0, 0, 3, 0, 0, 0, 1, 0, 2, 2, 1, 0, 0, 3,
        0, 2, 2, 1, 3, 0, 2, 3, 3, 1, 0, 0, 1, 3, 2, 0, 3, 1, 0, 1, 0, 1,
        2, 0, 0, 2, 2, 0, 0, 0, 1, 3, 0, 0, 0, 1, 0, 2, 3, 0, 2, 0, 1, 3,
        0, 0, 3, 1, 3, 1, 1, 1, 1, 3, 2, 0, 0, 1, 1, 0, 0, 2, 1, 3, 1, 2,
        0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 3], dtype=int8),
 array([0, 3, 2, 2, 0, 3, 1, 0, 0, 3, 1, 3, 3, 1, 3, 0, 2, 1, 3, 2, 1, 0,
        3, 2, 0, 3, 0, 2, 0, 3, 2, 2, 0, 3, 0, 3, 1, 0, 3, 1, 3, 3, 2, 0,
        1, 1, 2, 2, 1, 0, 2, 1, 0, 2, 2, 0, 0, 0, 3, 0, 2, 1, 2, 1, 3, 0,
        3, 0, 3, 2, 2, 0, 1, 3, 0, 3, 1, 0, 3, 2, 1, 1, 1, 3, 1,

In [42]:
# Integer-encoded genomic window left over from the last iteration of the extraction loop.
DNA_region_pos

array([1, 0, 0, 1, 2, 1, 0, 1, 0, 0, 0, 3, 1, 0, 2, 1, 2, 1, 1, 0, 2, 0,
       0, 3, 1, 2, 2, 0, 1, 0, 1, 1, 3, 2, 2, 2, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 3, 2, 0, 3, 1, 3, 3, 0, 1, 0, 1, 3, 0, 0, 2, 0, 3,
       3, 2, 1, 3, 1, 1, 2, 0, 1, 0, 1, 0, 3, 0, 0, 0, 3, 0, 0, 0, 1, 1,
       2, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 1], dtype=int8)

In [39]:
# NOTE(review): the recorded output below is (100,), but the slice `pos-50:pos+51`
# yields 101 values (and the stacked shuffles are 101 wide). This output presumably
# comes from an earlier version of the slice -- re-run to confirm.
DNA_region_pos.shape

(100,)

In [None]:
#! OLD code -- kept for reference only; not runnable as written:
#   - `tqdm()` is called without an iterable to unpack `chr_, pos_` from,
#   - `shuffled_negatives` is not initialised in this cell,
#   - the indentation of the final line does not match the loop above it.
# It used a 2048-base window (pos_-1024:pos_+1024) instead of the 101-base one.
for chr_, pos_ in tqdm():
        DNA_region_pos = genome[chr_][pos_-1024:pos_+1024]
        shuffled = dinuclShuffle("".join([rev_mapping[l] for l in DNA_region_pos]))
        encoded_shuffled = np.array([mapping[bp] for bp in shuffled], dtype="int8")
        shuffled_negatives.append(encoded_shuffled)

    shuffled_negatives = np.stack(shuffled_negatives)

In [29]:
# Chromosome datasets picked up from the "unstructured" group (keys starting with "chr").
genome

{'chr1': <HDF5 dataset "chr1": shape (248956422,), type "|i1">,
 'chrX': <HDF5 dataset "chrX": shape (156040895,), type "|i1">}

In [21]:
# Datasets stored under group "1": the per-peak chromosome, length and position lookups.
f["1"].keys()

<KeysViewHDF5 ['peak_ix_to_chr', 'peak_ix_to_len', 'peak_ix_to_pos']>

In [27]:
# Datasets stored under "unstructured"; the "chr*" entries are used as the genome.
f["unstructured"].keys()

<KeysViewHDF5 ['chr1', 'chrX']>

In [16]:
# Column indices where row 1 of `central` equals 1, i.e. the positives used for
# the first non-ATAC entry of prot_names in the extraction loop.
np.where(f["central"][1,:]==1)[0]

array([    8,     9,    23, ..., 48265, 48266, 48274])

In [15]:
# Close the read-only handle opened at the top of the notebook.
# NOTE(review): every cell that uses `f` must run before this one.
f.close()

In [None]:
import os
def create_dinucl_shuffled_negatives(h5t_loc, num_negs):
    """Add dinucleotide-shuffled negatives to every .h5t file in a folder.

    For each file, and for each entry of ``0/prot_names`` other than
    ``"ATAC_peak"``, the columns of ``central`` that equal 1 in that entry's
    row are taken as positives. For every positive the window
    ``pos - 50 : pos + 51`` is read from the matching chromosome dataset in
    ``unstructured`` and shuffled ``num_negs`` times with ``dinuclShuffle``.
    The encoded shuffles and their chromosome labels are registered in the
    file under ``unstructured`` as ``dinucl_{TF}_seqs`` and
    ``dinucl_{TF}_chrs``. Entries without any positive are skipped.

    Args:
        h5t_loc: path of the folder containing the ``.h5t`` files.
        num_negs: number of shuffled sequences to create per positive (>= 1).

    Raises:
        FileNotFoundError: if the folder does not exist or holds no ``.h5t`` file.
        ValueError: if ``num_negs`` is smaller than 1, or a file has no
            ``"ATAC_peak"`` entry in ``0/prot_names``.

    NOTE(review): files are opened in append mode; behaviour when the
    ``dinucl_*`` datasets already exist (a re-run) depends on
    ``h5torch.File.register`` -- confirm before re-running on the same file.
    """
    # Ensure the h5t_loc folder exists
    if not os.path.exists(h5t_loc):
        raise FileNotFoundError(f"The folder {h5t_loc} does not exist.")

    # Get all .h5t files in the folder (sorted so the processing order is reproducible)
    h5t_files = sorted(
        os.path.join(h5t_loc, file) for file in os.listdir(h5t_loc) if file.endswith(".h5t")
    )

    if not h5t_files:
        raise FileNotFoundError(f"No .h5t files found in the folder {h5t_loc}.")

    # Fail before any file is opened for writing; with num_negs < 1 nothing would be
    # generated and np.stack would raise on the empty list part-way through a file.
    if num_negs < 1:
        raise ValueError(f"num_negs must be at least 1, got {num_negs}.")

    print(f"Found {len(h5t_files)} .h5t files in the folder {h5t_loc}.")

    # Integer encoding used for the stored genome and its inverse
    mapping = {"A": 0, "T": 1, "C": 2, "G": 3, "N": 4}
    rev_mapping = {v : k for k, v in mapping.items()}

    for h5t_file in h5t_files:
        print(f"Processing file: {h5t_file}")
        with h5torch.File(h5t_file, "a") as f:
            genome = {k : f["unstructured"][k] for k in list(f["unstructured"]) if k.startswith("chr")}

            prot_names = [name.decode("utf-8") for name in f["0/prot_names"]]
            if "ATAC_peak" not in prot_names:
                raise ValueError("ATAC_peak not found in prot_names.")

            # Read the peak lookups once per file instead of once per peak
            peak_chrs = f["1/peak_ix_to_chr"][:].astype(str)
            peak_positions = f["1/peak_ix_to_pos"][:]

            # `index` is the row of `central` belonging to this TF
            for index, TF in enumerate(tqdm(prot_names)):
                if TF == "ATAC_peak":
                    continue  # Skip ATAC_peak

                pos_indices = np.where(f["central"][index, :] == 1)[0]
                if len(pos_indices) == 0:
                    # np.stack([]) would raise and abort the whole file
                    print(f"No positive peaks for {TF}; nothing registered.")
                    continue

                dinucl_shuffled_seqs = []
                chr_list = []

                for j in tqdm(pos_indices):
                    chrom = peak_chrs[j]
                    pos = peak_positions[j]
                    # 101-base window; NOTE(review): assumes `pos` is the peak centre and >= 50 -- confirm
                    DNA_region_pos = genome[chrom][pos - 50 : pos + 51]  #! Is this correct???
                    # Decode once; every shuffle of this peak starts from the same sequence
                    sequence = "".join([rev_mapping[l] for l in DNA_region_pos])
                    for _ in range(num_negs):
                        shuffled = dinuclShuffle(sequence)
                        encoded_shuffled = np.array([mapping[bp] for bp in shuffled], dtype="int8")
                        dinucl_shuffled_seqs.append(encoded_shuffled)
                        chr_list.append(chrom)

                f.register(
                    np.stack(dinucl_shuffled_seqs),
                    axis="unstructured",
                    name=f"dinucl_{TF}_seqs",
                    mode="N-D",
                    dtype_save="int8",
                    dtype_load="int8",
                )

                f.register(
                    np.array(chr_list).astype(bytes),
                    axis="unstructured",
                    name=f"dinucl_{TF}_chrs",
                    mode="N-D",
                    dtype_save="bytes",
                    dtype_load="str",
                )

In [16]:
# Process every .h5t file in the test folder, one shuffled negative per positive peak.
# NOTE(review): absolute, user-specific path.
create_dinucl_shuffled_negatives('/data/home/natant/Negatives/testing_ground/20250402_test', 1)

Found 1 .h5t files in the folder /data/home/natant/Negatives/testing_ground/20250402_test.
Processing file: /data/home/natant/Negatives/testing_ground/20250402_test/20250402_test_longer.h5t


100%|██████████| 11876/11876 [00:13<00:00, 861.90it/s]
100%|██████████| 2461/2461 [00:02<00:00, 859.11it/s]
100%|██████████| 3690/3690 [00:04<00:00, 868.73it/s]
100%|██████████| 2219/2219 [00:02<00:00, 861.71it/s]
100%|██████████| 5149/5149 [00:06<00:00, 795.73it/s]
100%|██████████| 7686/7686 [00:08<00:00, 854.61it/s]
100%|██████████| 1970/1970 [00:02<00:00, 770.61it/s]
100%|██████████| 688/688 [00:00<00:00, 821.97it/s]
100%|██████████| 1596/1596 [00:01<00:00, 803.16it/s]
100%|██████████| 2034/2034 [00:02<00:00, 841.90it/s]
100%|██████████| 1240/1240 [00:01<00:00, 848.92it/s]
100%|██████████| 1082/1082 [00:01<00:00, 859.55it/s]
100%|██████████| 1712/1712 [00:02<00:00, 844.96it/s]
100%|██████████| 2084/2084 [00:02<00:00, 838.21it/s]
100%|██████████| 14/14 [00:54<00:00,  3.87s/it]
