In [1]:
from typing import Any
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

def one_hot_encode(sequence: str,
                   alphabet: str = 'ACGT',
                   neutral_alphabet: str = 'N',
                   neutral_value: Any = 0,
                   dtype=np.float32) -> np.ndarray:
  """Encode an ASCII sequence as a (len(sequence), len(alphabet)) array.

  Args:
    sequence: String to encode; each character becomes one output row.
    alphabet: Characters that receive a one-hot row. The column order of the
      output follows the order of this string.
    neutral_alphabet: Characters whose whole row is set to `neutral_value`.
    neutral_value: Fill value used for `neutral_alphabet` characters.
    dtype: dtype of the returned array.

  Returns:
    Array of shape (len(sequence), len(alphabet)). Any character that is in
    neither `alphabet` nor `neutral_alphabet` maps to an all-zero row; with
    the defaults this includes lowercase bases such as 'a' or 'n'.

  Raises:
    UnicodeEncodeError: if a string argument contains non-ASCII characters.
  """
  def ascii_codes(text):
    # View the ASCII bytes of `text` as uint8 indices into the lookup table.
    return np.frombuffer(text.encode('ascii'), dtype=np.uint8)

  n_symbols = len(alphabet)
  # One table row per byte value; rows default to zeros.
  lookup = np.zeros((np.iinfo(np.uint8).max, n_symbols), dtype=dtype)
  lookup[ascii_codes(alphabet)] = np.eye(n_symbols, dtype=dtype)
  lookup[ascii_codes(neutral_alphabet)] = neutral_value
  lookup = lookup.astype(dtype)

  # Fancy indexing turns the per-character codes into their table rows.
  codes = ascii_codes(sequence)
  return lookup[codes]

2026-01-06 13:22:21.139147: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Ask TensorFlow which physical GPUs it can see and report the result.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# An empty list means TensorFlow found no GPU device.
if not gpu_devices:
    print("TensorFlow is NOT using the GPU.")
else:
    print("GPU Details:", gpu_devices)

Num GPUs Available:  1
GPU Details: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2026-01-06 13:22:27.537329: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2026-01-06 13:22:27.565620: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:3b:00.0 name: NVIDIA RTX A6000 computeCapability: 8.6
coreClock: 1.8GHz coreCount: 84 deviceMemorySize: 47.54GiB deviceMemoryBandwidth: 715.34GiB/s
2026-01-06 13:22:27.565657: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2026-01-06 13:22:27.774236: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2026-01-06 13:22:27.774326: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2026-01-06 13:22:27.842019: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10

In [37]:
import pandas as pd
import json


species = 'mouse'
narrowPeak_file = species + '_liver_pos_ALL.narrowPeak'

chrom_dir = '/home/azstephe/liverRegression/regression_liver/data/chrom_sizes/'

# Read the tab-separated chrom.sizes table and turn it into {chrom: chrom_len}.
chrom_sizes = pd.read_csv(chrom_dir + 'mm10.chrom.sizes', sep='\t', names=['chrom', 'chrom_len'])
chrom_map = dict(zip(chrom_sizes['chrom'], chrom_sizes['chrom_len']))

# Write <species>_chrom_sizes.json next to the input table. chrom_dir is reused
# (instead of re-typing the directory) so the read and write locations cannot
# drift apart.
with open(chrom_dir + species + '_chrom_sizes.json', 'w') as f:
    json.dump(chrom_map, f)

# narrowPeak_dir = '/home/azstephe/liverRegression/regression_liver/data/raw/'

# cols = ['chrom', 'start', 'end', 'name', 'score', 'strand', 'signal', 'p', 'q', 'peak_offset']
# df = pd.read_csv(narrowPeak_dir + narrowPeak_file, sep='\t', names=cols)

# extension = 196608
# df['summit'] = df['start'] + df['peak_offset']
# df['new_start'] = df['summit'] - extension
# df['new_end'] = df['summit'] + extension

# # 4. Filter for boundaries
# # Get the max length for the specific chromosome of each peak
# df['max_len'] = df['chrom'].map(chrom_map)

# # Keep only if start >= 0 AND end <= chromosome length
# valid_sections = df[(df['new_start'] >= 0) & (df['new_end'] <= df['max_len'])].copy()

# print(f"Original peaks: {len(df)}")
# print(f"Peaks kept: {len(valid_sections)}")
# print(f"Peaks lost: {len(df) - len(valid_sections)}")

# outdir = '/home/azstephe/liverRegression/regression_liver/data/enformer_inputs/'
# valid_sections[['chrom', 'new_start', 'new_end', 'name', 'signal']].to_csv(outdir + narrowPeak_file, sep='\t', index=False, header=False)

In [38]:
species = 'macaque'

chrom_dir = '/home/azstephe/liverRegression/regression_liver/data/chrom_sizes/'

# Read the tab-separated chrom.sizes table and turn it into {chrom: chrom_len}.
chrom_sizes = pd.read_csv(chrom_dir + 'rheMac8.chrom.sizes', sep='\t', names=['chrom', 'chrom_len'])
chrom_map = dict(zip(chrom_sizes['chrom'], chrom_sizes['chrom_len']))

# Write <species>_chrom_sizes.json next to the input table. chrom_dir is reused
# (instead of re-typing the directory) so the read and write locations cannot
# drift apart.
with open(chrom_dir + species + '_chrom_sizes.json', 'w') as f:
    json.dump(chrom_map, f)

In [39]:
import pandas as pd

species = 'pig'
narrowPeak_file = species + '_liver_pos_ALL.narrowPeak'

chrom_dir = '/home/azstephe/liverRegression/regression_liver/data/chrom_sizes/'

# Read the tab-separated chrom.sizes table and turn it into {chrom: chrom_len}.
chrom_sizes = pd.read_csv(chrom_dir + 'susScr3.chrom.sizes', sep='\t', names=['chrom', 'chrom_len'])
chrom_map = dict(zip(chrom_sizes['chrom'], chrom_sizes['chrom_len']))

# Write <species>_chrom_sizes.json next to the input table. chrom_dir is reused
# (instead of re-typing the directory) so the read and write locations cannot
# drift apart.
with open(chrom_dir + species + '_chrom_sizes.json', 'w') as f:
    json.dump(chrom_map, f)
# narrowPeak_dir = '/home/azstephe/liverRegression/regression_liver/data/raw/'

# cols = ['chrom', 'start', 'end', 'name', 'score', 'strand', 'signal', 'p', 'q', 'peak_offset']
# df = pd.read_csv(narrowPeak_dir + narrowPeak_file, sep='\t', names=cols)

# extension = 196608
# df['summit'] = df['start'] + df['peak_offset']
# df['new_start'] = df['summit'] - extension
# df['new_end'] = df['summit'] + extension

# # 4. Filter for boundaries
# # Get the max length for the specific chromosome of each peak
# df['max_len'] = df['chrom'].map(chrom_map)

# # Keep only if start >= 0 AND end <= chromosome length
# valid_sections = df[(df['new_start'] >= 0) & (df['new_end'] <= df['max_len'])].copy()

# print(f"Original peaks: {len(df)}")
# print(f"Peaks kept: {len(valid_sections)}")
# print(f"Peaks lost: {len(df) - len(valid_sections)}")

# outdir = '/home/azstephe/liverRegression/regression_liver/data/enformer_inputs/'
# valid_sections[['chrom', 'new_start', 'new_end', 'name', 'signal']].to_csv(outdir + narrowPeak_file, sep='\t', index=False, header=False)

In [40]:
import pandas as pd

species = 'rat'
narrowPeak_file = species + '_liver_pos_ALL.narrowPeak'

chrom_dir = '/home/azstephe/liverRegression/regression_liver/data/chrom_sizes/'

# Read the tab-separated chrom.sizes table and turn it into {chrom: chrom_len}.
chrom_sizes = pd.read_csv(chrom_dir + 'rn6.chrom.sizes', sep='\t', names=['chrom', 'chrom_len'])
chrom_map = dict(zip(chrom_sizes['chrom'], chrom_sizes['chrom_len']))

# Write <species>_chrom_sizes.json next to the input table. chrom_dir is reused
# (instead of re-typing the directory) so the read and write locations cannot
# drift apart.
with open(chrom_dir + species + '_chrom_sizes.json', 'w') as f:
    json.dump(chrom_map, f)
# narrowPeak_dir = '/home/azstephe/liverRegression/regression_liver/data/raw/'

# cols = ['chrom', 'start', 'end', 'name', 'score', 'strand', 'signal', 'p', 'q', 'peak_offset']
# df = pd.read_csv(narrowPeak_dir + narrowPeak_file, sep='\t', names=cols)

# extension = 196608
# df['summit'] = df['start'] + df['peak_offset']
# df['new_start'] = df['summit'] - extension
# df['new_end'] = df['summit'] + extension

# # 4. Filter for boundaries
# # Get the max length for the specific chromosome of each peak
# df['max_len'] = df['chrom'].map(chrom_map)

# # Keep only if start >= 0 AND end <= chromosome length
# valid_sections = df[(df['new_start'] >= 0) & (df['new_end'] <= df['max_len'])].copy()

# print(f"Original peaks: {len(df)}")
# print(f"Peaks kept: {len(valid_sections)}")
# print(f"Peaks lost: {len(df) - len(valid_sections)}")

# outdir = '/home/azstephe/liverRegression/regression_liver/data/enformer_inputs/'
# valid_sections[['chrom', 'new_start', 'new_end', 'name', 'signal']].to_csv(outdir + narrowPeak_file, sep='\t', index=False, header=False)

In [41]:
import pandas as pd

species = 'cow'
narrowPeak_file = species + '_liver_pos_ALL_refSeqName.bed'

chrom_dir = '/home/azstephe/liverRegression/regression_liver/data/chrom_sizes/'

# Read the tab-separated chrom.sizes table and turn it into {chrom: chrom_len}.
chrom_sizes = pd.read_csv(chrom_dir + 'Btau_5.0.1.chrom.sizes', sep='\t', names=['chrom', 'chrom_len'])
chrom_map = dict(zip(chrom_sizes['chrom'], chrom_sizes['chrom_len']))

# Write <species>_chrom_sizes.json next to the input table. chrom_dir is reused
# (instead of re-typing the directory) so the read and write locations cannot
# drift apart.
# NOTE(review): the RefSeq-to-chromosome-name cell later in this notebook reads
# cow_chrom_sizes_RefSeq.json and then overwrites cow_chrom_sizes.json, i.e. the
# file written here. Nothing visible in the notebook creates the *_RefSeq.json
# file -- confirm whether this cell was meant to write to that name.
with open(chrom_dir + species + '_chrom_sizes.json', 'w') as f:
    json.dump(chrom_map, f)
# narrowPeak_dir = '/home/azstephe/liverRegression/regression_liver/data/raw/'

# cols = ['chrom', 'start', 'end', 'name', 'score', 'strand', 'signal', 'p', 'q', 'peak_offset']
# df = pd.read_csv(narrowPeak_dir + narrowPeak_file, sep='\t', names=cols)

# extension = 196608
# df['summit'] = df['start'] + df['peak_offset']
# df['new_start'] = df['summit'] - extension
# df['new_end'] = df['summit'] + extension

# # 4. Filter for boundaries
# # Get the max length for the specific chromosome of each peak
# df['max_len'] = df['chrom'].map(chrom_map)

# # Keep only if start >= 0 AND end <= chromosome length
# valid_sections = df[(df['new_start'] >= 0) & (df['new_end'] <= df['max_len'])].copy()

# print(f"Original peaks: {len(df)}")
# print(f"Peaks kept: {len(valid_sections)}")
# print(f"Peaks lost: {len(df) - len(valid_sections)}")

# outdir = '/home/azstephe/liverRegression/regression_liver/data/enformer_inputs/'
# valid_sections[['chrom', 'new_start', 'new_end', 'name', 'signal']].to_csv(outdir + species + '_liver_pos_ALL_refSeqName.narrowPeak', sep='\t', index=False, header=False)

In [47]:
# Imported here because this cell runs before the notebook's only other
# `import os`; without it a fresh-kernel run fails with NameError.
import json
import os

mapping_file_path = '/home/azstephe/liverRegression/regression_liver/data/genomes/ChromNameToRefSeqName_Btau_5.0.1.txt'
input_json_path = '/home/azstephe/liverRegression/regression_liver/data/chrom_sizes/cow_chrom_sizes_RefSeq.json'
output_json_path = '/home/azstephe/liverRegression/regression_liver/data/chrom_sizes/cow_chrom_sizes.json'
output_txt_path = '/home/azstephe/liverRegression/regression_liver/data/chrom_sizes/cow_chrom_names.sizes'

# NOTE(review): output_json_path is the same file the `species = 'cow'` cell
# writes, and nothing visible in this notebook creates input_json_path --
# confirm how cow_chrom_sizes_RefSeq.json is produced.

# {RefSeqName: ChromName}, filled from the mapping file below.
mapping = {}

if not os.path.exists(mapping_file_path):
    # Previously a missing mapping file silently left `mapping` empty and the
    # outputs were then overwritten with empty content.
    print(f"❌ Error: Mapping file {mapping_file_path} not found.")
elif not os.path.exists(input_json_path):
    print(f"❌ Error: Input JSON {input_json_path} not found.")
else:
    # Load the mapping file.
    # Assumption based on filename: Col 0 = ChromName, Col 1 = RefSeqName
    with open(mapping_file_path, 'r') as f:
        for line in f:
            stripped = line.strip()
            # Skip blank lines and comments (also when the '#' is indented).
            if not stripped or stripped.startswith('#'):
                continue
            parts = stripped.split()
            if len(parts) >= 2:
                chrom_name, refseq_id = parts[0], parts[1]
                mapping[refseq_id] = chrom_name

    with open(input_json_path, 'r') as f:
        old_data = json.load(f)
    print(f"Loaded {len(old_data)} entries from original JSON.")

    # Re-key the sizes by chromosome name; entries whose RefSeq name has no
    # mapping are dropped.
    new_data = {mapping[k]: v for k, v in old_data.items() if k in mapping}
    print(f"Kept {len(new_data)} entries that have a chromosome-name mapping.")

    if not new_data:
        # Do not clobber existing outputs with empty files.
        print("❌ Error: No entries matched the mapping; nothing written.")
    else:
        # Save as JSON
        with open(output_json_path, 'w') as f:
            json.dump(new_data, f, indent=4)

        # Save as standard tab-separated .sizes file
        with open(output_txt_path, 'w') as f:
            for k, v in sorted(new_data.items()):
                f.write(f"{k}\t{v}\n")

        print(f"✅ Created {output_json_path}")
        print(f"✅ Created {output_txt_path}")

        # Preview the first 5 entries
        print("\n--- Preview of New Mapping ---")
        for k, v in list(new_data.items())[:5]:
            print(f"Chromosome {k}: {v} bp")

Loaded 3143 entries from original JSON.
✅ Created /home/azstephe/liverRegression/regression_liver/data/chrom_sizes/cow_chrom_sizes.json
✅ Created /home/azstephe/liverRegression/regression_liver/data/chrom_sizes/cow_chrom_names.sizes

--- Preview of New Mapping ---
Chromosome chr1: 158972876 bp
Chromosome chr2: 137479425 bp
Chromosome chr3: 121888057 bp
Chromosome chr4: 121284772 bp
Chromosome chr5: 121584146 bp


In [50]:
import os

keyword = '_liver_TEST_500bp.bed'
test_dir = '/home/azstephe/liverRegression/regression_liver/data/test_splits/'

out_dir = '/home/azstephe/liverRegression/regression_liver/data/enformer_inputs/'
chrom_dir = '/home/azstephe/liverRegression/regression_liver/data/chrom_sizes/'

# Half of the 393,216 bp input length used by the Enformer cells in this
# notebook (2 * 196608 == 393216).
extension = 196608

for d in ['log_pos_LiuAll', 'neg', 'log_LiuAll_test1', 'log_test2', 'log_test3']:
    dr = os.path.join(test_dir, d)
    for file in os.listdir(dr):
        # Filter on the file name only, so that directory names can never
        # influence which files are selected.
        if 'non' in file or keyword not in file:
            continue

        filepath = os.path.join(dr, file)
        # File names start with the species, e.g. "cow_liver_TEST_500bp.bed".
        species = file.split('_')[0]
        cols = ['chrom', 'start', 'end', 'peak', 'signal']
        df = pd.read_csv(filepath, sep='\t', names=cols)

        # Centre a fixed-length window on each region. For a 500 bp region this
        # equals the previous `start - extension + 250` / `end + extension - 250`,
        # but it still yields exactly 2 * extension bp if a region is not 500 bp.
        center = (df['start'] + df['end']) // 2
        df['new_start'] = center - extension
        df['new_end'] = center + extension

        with open(chrom_dir + species + '_chrom_sizes.json', 'r') as f:
            chrom_map = json.load(f)

        # Chromosomes missing from chrom_map become NaN here; NaN fails the
        # `<=` comparison below, so those rows are dropped as well.
        df['max_len'] = df['chrom'].map(chrom_map)

        # Keep only windows that lie fully inside their chromosome.
        valid_sections = df[(df['new_start'] >= 0) & (df['new_end'] <= df['max_len'])].copy()

        print(filepath)
        print(f"Original peaks: {len(df)}")
        print(f"Peaks kept: {len(valid_sections)}")
        print(f"Peaks lost: {len(df) - len(valid_sections)}")

        out_dr = os.path.join(out_dir, d)
        os.makedirs(out_dr, exist_ok=True)
        valid_sections[['chrom', 'new_start', 'new_end', 'peak', 'signal']].to_csv(os.path.join(out_dr, file), sep='\t', index=False, header=False)


/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos_LiuAll/cow_liver_TEST_500bp.bed
Original peaks: 1284
Peaks kept: 1280
Peaks lost: 4
/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos_LiuAll/macaque_liver_TEST_500bp.bed
Original peaks: 2624
Peaks kept: 2619
Peaks lost: 5
/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos_LiuAll/rat_liver_TEST_500bp.bed
Original peaks: 2576
Peaks kept: 2575
Peaks lost: 1
/home/azstephe/liverRegression/regression_liver/data/test_splits/log_pos_LiuAll/pig_liver_TEST_500bp.bed
Original peaks: 1627
Peaks kept: 1624
Peaks lost: 3
/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/pig_liver_TEST_500bp.bed
Original peaks: 3057
Peaks kept: 2860
Peaks lost: 197
/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/rat_liver_TEST_500bp.bed
Original peaks: 4405
Peaks kept: 4404
Peaks lost: 1
/home/azstephe/liverRegression/regression_liver/data/test_splits/neg/macaq

In [36]:
# Displays the current value of chrom_map.
# NOTE(review): the recorded output below is the string 'cow_chrom_map', while
# every visible assignment in this notebook makes chrom_map a dict, and this
# cell's execution count (36) is lower than the cells above it -- the output
# looks stale from an earlier session; confirm whether this cell is still needed.
chrom_map

'cow_chrom_map'

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

# Load the Enformer SavedModel with the TF2 API. `hub.Module` is the TF1
# interface and does not provide `predict_on_batch`; the loaded object's
# `.model` attribute does.
enformer = hub.load('https://tfhub.dev/deepmind/enformer/1').model

SEQ_LENGTH = 393_216

# Input is a float array [batch_size, SEQ_LENGTH, 4], one-hot encoded in the
# order 'ACGT'. The `one_hot_encode` function defined at the top of this
# notebook produces one sequence; outputs can be stacked to form a batch.
inputs = tf.zeros((1, SEQ_LENGTH, 4), dtype=tf.float32)
predictions = enformer.predict_on_batch(inputs)

# Show both output heads; only the last expression of a cell is displayed, so
# the shapes are collected into one dict.
# Expected: 'human' -> [batch_size, 896, 5313], 'mouse' -> [batch_size, 896, 1643]
{head: predictions[head].shape for head in ('human', 'mouse')}

In [None]:
# Fetch the Enformer SavedModel from TF Hub, then pick its default serving
# signature as the callable used for inference.
_enformer_saved_model = hub.load("https://tfhub.dev/deepmind/enformer/1")
enformer_model = _enformer_saved_model.signatures['serving_default']

def prepare_input(sequence: str, target_length: int = 393216):
    """One-hot encode `sequence` and add a leading batch dimension.

    Args:
        sequence: Sequence string; must be exactly `target_length` characters.
        target_length: Required sequence length. Defaults to 393216, the input
            length used by the Enformer cells in this notebook.

    Returns:
        Array of shape (1, target_length, 4) when `one_hot_encode` is called
        with its default four-letter alphabet.

    Raises:
        ValueError: if `len(sequence) != target_length`. The previous version
            printed a message and returned None, which only failed later when
            the caller tried to convert None to a tensor.
    """
    if len(sequence) != target_length:
        raise ValueError(
            f"input length != {target_length} (got {len(sequence)})")

    # Encode with the notebook's one_hot_encode helper: (target_length, 4).
    encoded = one_hot_encode(sequence)

    # Add the batch dimension: (target_length, 4) -> (1, target_length, 4)
    return encoded[np.newaxis, ...]

# Run inference.
# Placeholder input: prepare_input only accepts sequences of exactly 393216
# characters, so the 7-character literal "ATCG..." could never be encoded.
# An all-'N' sequence has the right length; replace it with a real sequence.
example_sequence = 'N' * 393216

input_tensor = tf.convert_to_tensor(prepare_input(example_sequence), dtype=tf.float32)
outputs = enformer_model(input_tensor)

# The outputs are a dictionary: 'human' and 'mouse'
# NOTE(review): these key names are taken from the original comment; confirm
# they match the keys returned by the 'serving_default' signature.
human_predictions = outputs['human'].numpy()

In [None]:
bedtools getfasta -fi GCF_000003205.7_Btau_5.0.1_genomic.fna -bed enformer_inputs.bed -fo enformer_sequences.fa