# Upgrade data (from previous hg19 to hg38)
Mode: Pull directly from ensemble use gene_id

### Defines rules for pulling sequences from Ensemble

In [None]:
import torch
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm  
import time

def fetch_ensembl_gene_info(gene_id):
    url = f"http://rest.ensembl.org/lookup/id/{gene_id}?content-type=application/json"
    response = requests.get(url)
    if response.ok:
        return gene_id, response.json()
    else:
        print(f"Error fetching data for gene ID {gene_id} from Ensembl: {response.text}")
        return gene_id, None

def fetch_genomic_sequence(chromosome, start, end):
    sequence_url = f"http://rest.ensembl.org/sequence/region/human/{chromosome}:{start}-{end}?content-type=text/plain"
    response = requests.get(sequence_url)
    if response.ok:
        return response.text
    else:
        print(f"Error fetching sequence for region {chromosome}:{start}-{end} from Ensembl: {response.text}")
        return None

def process_gene_id(gene_id, label):
    gene_id, gene_info = fetch_ensembl_gene_info(gene_id)
    if gene_info:
        try:
            chromosome = gene_info['seq_region_name']
            start = gene_info['start'] - 80000  # 80,000 bases upstream
            end = gene_info['start'] + 80000    # 80,000 bases downstream
            if start < 0: 
                print(f"Invalid start position for {gene_id}. Skipping.")
                return None
            sequence = fetch_genomic_sequence(chromosome, start, end)
            if sequence:
                return {'gene_id': gene_id, 'sequence': sequence, 'label': label}
        except Exception as e:
            print(f"Error processing gene ID {gene_id}: {e}")
    else:
        print(f"Gene ID {gene_id} not found on Ensembl.")
    return None

### Execute the pull rule

In [None]:
input_data = torch.load('../../data/train.pt')

results = []
missing_gene_ids = []


with ThreadPoolExecutor(max_workers=6) as executor:
    futures = [
        executor.submit(process_gene_id, gene_id.decode('utf-8'), label)
        for gene_id, label in zip(input_data['gene_id'], input_data['labels'])
    ]
    

    for i, future in enumerate(tqdm(as_completed(futures), total=len(futures), desc="Processing gene IDs"), start=1):
        result = future.result()
        if result:
            results.append(result)
        else:
            missing_gene_ids.append(result['gene_id'] if result else None)
        
        if i % 1000 == 0:
            partial_save_path = f'C10_hyena_160kbp_train_dataset_part_{i // 1000}.pt'
            torch.save({
                'gene_id': [r['gene_id'] for r in results],
                'labels': [r['label'] for r in results],
                'sequences': [r['sequence'] for r in results]
            }, partial_save_path)
            print(f"\nProgress saved at {i} entries in {partial_save_path}. Sleeping for 1 second.")
            time.sleep(0.1)  # Take breaks to reduce API stress

torch.save({
    'gene_id': [result['gene_id'] for result in results if result],
    'labels': [result['label'] for result in results if result],
    'sequences': [result['sequence'] for result in results if result]
}, 'C10_hyena_160kbp_train_dataset.pt')

# Print a list of gene_id not found
print("Processing complete.")
print(f"Total gene IDs processed: {len(input_data['gene_id'])}")
print(f"Missing gene IDs: {len(missing_gene_ids)}")
if missing_gene_ids:
    print("The following gene IDs could not be found in Ensembl:")
    for gene_id in missing_gene_ids:
        print(gene_id)


  input_data = torch.load('../../data/train.pt')


Error fetching data for gene ID ENSG00000215271 from Ensembl: {"error":"ID 'ENSG00000215271' not found"}
Gene ID ENSG00000215271 not found on Ensembl.
Error fetching data for gene ID ENSG00000167945 from Ensembl: {"error":"ID 'ENSG00000167945' not found"}
Gene ID ENSG00000167945 not found on Ensembl.
Error fetching data for gene ID ENSG00000213865 from Ensembl: {"error":"ID 'ENSG00000213865' not found"}
Gene ID ENSG00000213865 not found on Ensembl.
Error fetching sequence for region 16:-2993-157007 from Ensembl: {"error":"-2993 is not a valid start"}
Error fetching data for gene ID ENSG00000256222 from Ensembl: {"error":"ID 'ENSG00000256222' not found"}
Gene ID ENSG00000256222 not found on Ensembl.
Error processing gene ID ENSG00000138061: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
Error fetching data for gene ID ENSG00000116957 from Ensembl: {"error":"ID 'ENSG00000116957' not found"}
Gene ID ENSG00000116957 not found on Ensembl.
Error fetching data f

#### Check if there are any duplicates in the sequence and delete them if there are

In [None]:
import torch

# Load the dataset
file_path = 'C10_hyena_160kbp_train_dataset_final.pt'
data = torch.load(file_path)

# Step 1: Check for duplicate gene_ids and remove duplicates
unique_data = {}
for i, gene_id in enumerate(data['gene_id']):
    if gene_id not in unique_data:
        # Keep only the first occurrence of each unique gene_id
        unique_data[gene_id] = {
            'gene_id': gene_id,
            'labels': data['labels'][i],
            'sequences': data['sequences'][i]
        }

processed_data = {
    'gene_id': [],
    'labels': [],
    'sequences': []
}
for entry in unique_data.values():
    processed_data['gene_id'].append(entry['gene_id'])
    processed_data['labels'].append(entry['labels'])
    processed_data['sequences'].append(entry['sequences'])

filtered_data = {
    'gene_id': [],
    'labels': [],
    'sequences': []
}
for i, sequence in enumerate(processed_data['sequences']):
    if sequence:  # Only keep non-empty sequences
        filtered_data['gene_id'].append(processed_data['gene_id'][i])
        filtered_data['labels'].append(processed_data['labels'][i])
        filtered_data['sequences'].append(sequence)

# Save the filtered dataset
torch.save(filtered_data, 'filtered_C10_hyena_160kbp_train_dataset_final.pt')

# Verify results
print(f"Total entries after filtering: {len(filtered_data['gene_id'])}")
print(f"Sample entry: {filtered_data['gene_id'][0]}, {filtered_data['labels'][0]}, {filtered_data['sequences'][0]}")


  data = torch.load(file_path)


Total entries after filtering: 15181
Sample entry: ENSG00000187545, 0, ATCACTGCAACCTCCACATCCTGGGTTCAAGCAATTCTCCTCCCTCAGCCTCTCCAGTAGCTGGGACTACAGGTACATGCCACCATGCCTGGTTAATTTTTTTTATTATACTTTAAGTTTTAGGGTACATGTGTACAATGTGCAGGTTACTTACATATGAATACATGTGCCATGTTGGTGTGCTGCACCCAGTAACTCGCCATTTAACATTAGGTATATCTCCAAATGATATCCCTCCCCCCTCCCCCCACCCCACAACAGGCCCCTCTGTGTGATGTTCCCCTTCCTGTGTCCCTGTGTTCTCATTGTTCAATTCCCACCTATGAGTGAGAACATGCAGTGTTCGGTTTTATGTCCTTGCGATAGTTTGCTGAGAATGATGGTTTTCAGCTTCATCCATGTCCCTACAAAGGACATGAACTCATCTTTTTTATGGTTGCATAGTATTCCATGTTGTATATGTGCCACATTTTCTTAATCCAGTCTATCATTGTTGGACATTTGGGTTGGTTCTAAGTCTTTGCTATTGTGAATAGTGCCGCAATAAACATACTTGTGCATGTGTCTTTATAGCAGCATGATTTATAATCCTTTGGGTATATACCCAGTAATGGGATGGCTGGGTCAAATGGTATTTCTAGTTCTAGATCCCTGACGAATTGCCACACTTACTTCCACAATGGTTGAACTAGTTTACAGTCCCACCAACTGTGTAAAACTGTTCCTATTTCTCCACATCCTCTCCAGCACCCCTTGTTTCCTGACTTTTTAATGATCACCATTCTAATTGGTATGAGATGGTATCTCATTGTGGTTTTGATTTGAATTTCTCTGATGGCCGGTGATGATGAGCATTTTTTCATGTGATGATGAGCATTTTTTCATGTCTCTTTTGGCTGCATAAATGTCTTCTTTTGAGAAGTGTCTGT

## dataset process
The file from the 160kbp sequence, divided into 3 files, plus 160kbp this file itself, a total of four, the role are as follows
1. 160kbp: Hyena to train
2. 20kbp: Hyena to train
3. 12kbp: ntv2 to train
4. 1kbp: cdgpt to train
Fields in the 
##### file content
gene_id: Ensemble gene id
sequences: DNA Sequences (GTAC)
labels: labels of the documents given by the organizing committee (unchanged)

In [21]:
import torch

# 1. 加载数据
data = torch.load('C10_hyena_160kbp_train_dataset.pt')

# 2. 初始化三个新的数据字典
hyena_data = {'gene_id': [], 'sequences': [], 'labels': []}
ntv2_data = {'gene_id': [], 'sequences': [], 'labels': []}
cdgpt_data = {'gene_id': [], 'sequences': [], 'labels': []}

# 3. 遍历数据并截取指定范围的序列
for i in range(len(data['sequences'])):
    sequence = data['sequences'][i]
    gene_id = data['gene_id'][i]
    label = data['labels'][i]
    
    # 截取每个指定范围的序列并保存到对应数据字典
    hyena_data['gene_id'].append(gene_id)
    hyena_data['sequences'].append(sequence[70000:90000])
    hyena_data['labels'].append(label)
    
    ntv2_data['gene_id'].append(gene_id)
    ntv2_data['sequences'].append(sequence[70000:82288])
    ntv2_data['labels'].append(label)
    
    cdgpt_data['gene_id'].append(gene_id)
    cdgpt_data['sequences'].append(sequence[79027:80050])
    cdgpt_data['labels'].append(label)

# 4. 保存分割后的数据到新文件
torch.save(hyena_data, 'C10_hyena_20kbp_train_dataset.pt')
torch.save(ntv2_data, 'C10_ntv2_12kbp_train_dataset.pt')
torch.save(cdgpt_data, 'C10_cdgpt_1kbp_train_dataset.pt')

print("Data has been successfully split and saved.")


  data = torch.load('C10_hyena_160kbp_train_dataset.pt')


Data has been successfully split and saved.


## File verification
confirm that these 4 files are in line with the training requirements

In [26]:
import torch
# file_path = 'C10_hyena_160kbp_valid_dataset.pt'
# file_path = 'C10_hyena_160kbp_test_dataset.pt'
file_path = 'C10_hyena_160kbp_train_dataset.pt'
data = torch.load(file_path)
print(f"sequences len is:{len(data['sequences'][0])}")
print(f"datasets len is: {len(data['labels'])}")
print(f"table head is :{data.keys()}")

for i in range(3):
    row_data = {key: value[i] for key, value in data.items()}
    print(f"Row {i + 1}: {row_data}")

  data = torch.load(file_path)


sequences len is:160001
datasets len is: 15181
table head is :dict_keys(['gene_id', 'labels', 'sequences'])
Row 1: {'gene_id': 'ENSG00000187545', 'labels': 0, 'sequences': 'ATCACTGCAACCTCCACATCCTGGGTTCAAGCAATTCTCCTCCCTCAGCCTCTCCAGTAGCTGGGACTACAGGTACATGCCACCATGCCTGGTTAATTTTTTTTATTATACTTTAAGTTTTAGGGTACATGTGTACAATGTGCAGGTTACTTACATATGAATACATGTGCCATGTTGGTGTGCTGCACCCAGTAACTCGCCATTTAACATTAGGTATATCTCCAAATGATATCCCTCCCCCCTCCCCCCACCCCACAACAGGCCCCTCTGTGTGATGTTCCCCTTCCTGTGTCCCTGTGTTCTCATTGTTCAATTCCCACCTATGAGTGAGAACATGCAGTGTTCGGTTTTATGTCCTTGCGATAGTTTGCTGAGAATGATGGTTTTCAGCTTCATCCATGTCCCTACAAAGGACATGAACTCATCTTTTTTATGGTTGCATAGTATTCCATGTTGTATATGTGCCACATTTTCTTAATCCAGTCTATCATTGTTGGACATTTGGGTTGGTTCTAAGTCTTTGCTATTGTGAATAGTGCCGCAATAAACATACTTGTGCATGTGTCTTTATAGCAGCATGATTTATAATCCTTTGGGTATATACCCAGTAATGGGATGGCTGGGTCAAATGGTATTTCTAGTTCTAGATCCCTGACGAATTGCCACACTTACTTCCACAATGGTTGAACTAGTTTACAGTCCCACCAACTGTGTAAAACTGTTCCTATTTCTCCACATCCTCTCCAGCACCCCTTGTTTCCTGACTTTTTAATGATCACCATTCTAATTGGTATGAGATGGTATCTCATTGTGGTTTTGATTTGAAT

In [27]:
import torch
# file_path = 'C10_hyena_20kbp_valid_dataset.pt'
# file_path = 'C10_hyena_20kbp_test_dataset.pt'
file_path = 'C10_hyena_160kbp_valid_dataset.pt'
data = torch.load(file_path)
print(f"sequences len is:{len(data['sequences'][0])}")
print(f"datasets len is: {len(data['labels'])}")
print(f"table head is :{data.keys()}")

for i in range(3):
    row_data = {key: value[i] for key, value in data.items()}
    print(f"Row {i + 1}: {row_data}")

sequences len is:160001
datasets len is: 988
table head is :dict_keys(['gene_id', 'sequences', 'labels'])
Row 1: {'gene_id': 'ENSG00000212900', 'sequences': 'CCCTCCAGGTGATCCTGATGTGAGCCAAGTTTGAGAACCCCTGCACTAAAGAAACGTCCTCGGGTCTCCTTCTGTTCCCCACCTTTTGCTGCCAGAATTCGAGTCCTTACTCCTTGCTAGACTTTATGACCTCTTCCCTCCACATACTGTTGCTACATTCCTTTAATAATGAATTTTTATGTTTAGTTCCCGATTTACACATCATCAGCAGGGTAGGTGTGTAAAAATAATAGTGAAGACATGAGTCTATAAAGGTGTGCATTTGATTGGATTAACAGTATTAGATGGATCTATCTATCCACTCCACCATTTTATGACACTGTGGGCAAATATGTTAACATAAATCCTAAATTCCAAAGTGATACATTATACTTCTCCGGATTAGCCATCGGTTTGTTTAGTTGGATTTTGGGTTATAGAATGATAATTGCTATTTATTCAAAGTGTTTCTGGAATGAAAACAATAGAACTTATTGCCACCTCCTTTATGTGGGTTGGGTGTATGTTGGTCAAATTCAAATACAAACAAAATCTTCTTTGGTTGGTAATTCACCTGAATGCATAGGCATTAATTGGTTTGTAATGGGCATTTTGAAACATTTTTACGCTGATTTAAGTGTGGGGGAAGATTATTCTGTGTTACCTAGAGCCCAAAATCATGCAATTGTCTCTTTATTCAGGCAAATGGTTTCATTTCTCTCTTCTTTCCTCTCCTTAGTGAAACAGAGGCAATTTATTGATAGGGTGATCAGCCACCCCAGTTTGCCTGGGACTAATGAGTTTCTGGGCACACTGGGATGAATTGGTCACCTTAATTGATACAGCGAAAATTAATAAGACTT

  data = torch.load(file_path)


In [24]:
import torch
# file_path = 'C10_ntv2_12kbp_valid_dataset.pt'
# file_path = 'C10_ntv2_12kbp_test_dataset.pt'
file_path = 'C10_ntv2_12kbp_train_dataset.pt'
data = torch.load(file_path)
print(f"sequences len is:{len(data['sequences'][0])}")
print(f"datasets len is: {len(data['labels'])}")
print(f"table head is :{data.keys()}")

for i in range(10):
    row_data = {key: value[i] for key, value in data.items()}
    print(f"Row {i + 1}: {row_data}")

sequences len is:12288
datasets len is: 15181
table head is :dict_keys(['gene_id', 'sequences', 'labels'])
Row 1: {'gene_id': 'ENSG00000187545', 'sequences': 'TCAGTGCCATTAGAGGAGAGGTTCCTGTTACCTCCATGGACCTTGCGTGGTGAGCAGTGCTTTCCCTGAGGAGCTGGTGAATGGCCAAGTCCTCTCGGCTTCCTCACCACCACCATCCCCCTTGGGCCTCCTCACTTCTCACGACCCAGCTGTTCCTTCAGTTGGACACCTGGGCCCTCCCCACCAGCCCACCTGGGCCACCTCACCTGGGACGAACCCCTAGGTTAAGCAGTGCATCCAGCCCATCGAGCACAGCTTGGAAGGCCTCCAGACAAGGCATCTTTATCAGAGGCCTCAGAGGGAGGCGGCGGAAGGGCCAGGACTGCACCATCAGCTTCAGGGCCTCACAGCGTCTCCTGCTGAAGGCCTCCATGAACAGTGGGGGGAAAAGTTCTGTGGGCAGCTCCTCCAGGGTGGACATGGCCAAAGCTTGGTCCCTTAGCAGGCTCCGCCCTGCAAGCTCCAGGAGTCTGGGTGGAGTCCAGATGCTCATCTTCATGAATCTGCAGGGAAAACTTCCAGAGGACAAACCCAGAGAAAAGGCATCTCTCTCGGGCCAAGCCCATGCAATCTCATCCTCTCCTATGGCCAAACTCACTGCTCTGGCAATGGTGAAAGAGTCCTCAGTTTACTCCAATTCTACTCTGTACTCAGTGGCCATTAAGCCAGCATTCTGCCTCTGCTGCATCAGCATGAGCGTCTCCGAAGCAGTGAGGAAGCAGGGCCACCACGAGCCCTTCCTTTCTATCCAGTGCTCCATCCAGTGACTAGTGAGTGTGGAGGAACCTGAAAGTGAACCCCTCCTACCATTGGGGGAAACTACTAATTACTCAAGGTTCTA

  data = torch.load(file_path)


In [25]:
import torch
# file_path = 'C10_cdgpt_1kbp_valid_dataset.pt'
# file_path = 'C10_cdgpt_1kbp_test_dataset.pt'
file_path = 'C10_cdgpt_1kbp_train_dataset.pt'
data = torch.load(file_path)
print(f"sequences len is:{len(data['sequences'][0])}")
print(f"datasets len is: {len(data['labels'])}")
print(f"table head is :{data.keys()}")

for i in range(10):
    row_data = {key: value[i] for key, value in data.items()}
    print(f"Row {i + 1}: {row_data}")

sequences len is:1023
datasets len is: 15181
table head is :dict_keys(['gene_id', 'sequences', 'labels'])
Row 1: {'gene_id': 'ENSG00000187545', 'sequences': 'AACATTACACATTTTATGCTTATATCAGAATTTCAGGCCAGGTGCAGTGACTAATGTCTACAATCTGAGCACTTTGGGAGGCTGAGGTGGATGGTTTGCCTGAAGTCAGGAGTTCAAGACCAGCCTGGTCAACATGGTGAAACCCCCGTTTCTACAAAAAATACAAAAAATAGCCAGGCATGGTGGCGGGTCCCTGTAGTTCCAGCTACTCAGGAGGCTGAGGCAGGAGAATTGCTTGAACCCAGGAGGCAGATTTCTAGAGACTTCTGATGTATAAATGTCTAAAACAGGTTGATCAATCATGGAAGACACCAGAAAGTTTCCATTCAGGTTCCATTTATTTTTGACATTTTTAAATAACCATCCTTGCAGGGGTAAGTCCTGCATCACTCTAGAACTTCAGGTTCCATTTCTAAGTCTAGGACACAGGTCCCTGAAGGCCTCATTGATGCCAAGTCAGCATTTTTACCCAGTCCTGCCCCTGGCTGAGTCACCTTTGTTTTTCCACTCACAGTGAGCACGTGCCTCAAATACGTGGCTGTGTGCTTCCTTTAAGAAGCGGCTGACCGGGCCCTGCTGCTCACACCTGTAAACCTGGCACTGTGGAAGGCCAAGGTGGTCAGATCACTTGAGGTCAGGAGTTTGAGGTCAGCCTTCGCCAACATCGTGAAGCCCTGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGGGGCATACACCCACAACACCAGCTACTTGGGAGGCTGAGGCAGGAGAATCACTTGAACCCAGGAGGTGGTGCTTGCAGTGAGCTGAGATTGTGCCACTGCACTTCATCCTGAGGGACACAGTGAGACTCT

  data = torch.load(file_path)
