# 看看这些数据里面长啥样

In [None]:
import os
import torch
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.dataset_getters.pytorch_datasets import (
    DemoMouseEnhancers,
    DrosophilaEnhancersStark,
    DemoHumanOrWorm,
    DemoCodingVsIntergenomicSeqs,
    HumanEnhancersCohn,
    HumanOcrEnsembl,
    HumanEnsemblRegulatory,
    HumanEnhancersEnsembl,
    HumanNontataPromoters
)

# 数据集列表及其对应的类
datasets = {
    'demo_mouse_enhancers': DemoMouseEnhancers,
    'drosophila_enhancers_stark': DrosophilaEnhancersStark,
    'demo_human_or_worm': DemoHumanOrWorm,
    'demo_coding_vs_intergenomic_seqs': DemoCodingVsIntergenomicSeqs,
    'human_enhancers_cohn': HumanEnhancersCohn,
    'human_ocr_ensembl': HumanOcrEnsembl,
    'human_ensembl_regulatory': HumanEnsemblRegulatory,
    'human_enhancers_ensembl': HumanEnhancersEnsembl,
    'human_nontata_promoters': HumanNontataPromoters
}

# 保存路径
output_dir = "genomic_benchmark_datasets"
os.makedirs(output_dir, exist_ok=True)

def save_dataset(dataset_name, dataset_class, version=0):
    """
    下载并保存指定数据集的训练和测试数据到对应文件夹。
    """
    print(f"Processing dataset: {dataset_name}")
    
    # 下载数据集
    download_dataset(dataset_name, version=version)
    
    # 加载训练集和测试集
    for split in ['train', 'test']:
        print(f"Loading {split} split for dataset: {dataset_name}")
        dset = dataset_class(split=split, version=version)
        
        # 提取序列和标签
        sequences = [seq for seq, label in dset]
        labels = [label for seq, label in dset]
        
        # 构建保存路径
        save_path = os.path.join(
            output_dir, 
            f"{split}_{dataset_name}.pt"
        )
        
        # 保存为 .pth 文件
        print(f"Saving {split} split to {save_path}")
        torch.save({"sequences": sequences, "labels": labels}, save_path)

# 遍历数据集，逐一处理
for dataset_name, dataset_class in datasets.items():
    try:
        save_dataset(dataset_name, dataset_class)
    except Exception as e:
        print(f"Failed to process dataset {dataset_name}: {e}")

Processing dataset: demo_mouse_enhancers
Failed to process dataset demo_mouse_enhancers: Dataset demo_mouse_enhancers not found.
Processing dataset: drosophila_enhancers_stark


Downloading...
From (original): https://drive.google.com/uc?id=1D8u3m09CNIv8e4-5rOu5wKuwcLl1eejs
From (redirected): https://drive.google.com/uc?id=1D8u3m09CNIv8e4-5rOu5wKuwcLl1eejs&confirm=t&uuid=4888a748-7a36-421a-ba66-94c575d99a8c
To: /Users/jacob/.genomic_benchmarks/drosophila_enhancers_stark.zip
100%|██████████| 6.34M/6.34M [00:00<00:00, 7.61MB/s]


Loading train split for dataset: drosophila_enhancers_stark
Saving train split to genomic_benchmark_datasets/train_drosophila_enhancers_stark.pth
Loading test split for dataset: drosophila_enhancers_stark
Saving test split to genomic_benchmark_datasets/test_drosophila_enhancers_stark.pth
Processing dataset: demo_human_or_worm


Downloading...
From (original): https://drive.google.com/uc?id=1JW0-eTB-rJXvFcglqBo3pFZi1kyIWC3X
From (redirected): https://drive.google.com/uc?id=1JW0-eTB-rJXvFcglqBo3pFZi1kyIWC3X&confirm=t&uuid=27a5a264-be00-496b-85d1-bdbdc7885777
To: /Users/jacob/.genomic_benchmarks/demo_human_or_worm.zip
100%|██████████| 28.9M/28.9M [00:04<00:00, 6.47MB/s]


Loading train split for dataset: demo_human_or_worm
Saving train split to genomic_benchmark_datasets/train_demo_human_or_worm.pth
Loading test split for dataset: demo_human_or_worm
Saving test split to genomic_benchmark_datasets/test_demo_human_or_worm.pth
Processing dataset: demo_coding_vs_intergenomic_seqs


Downloading...
From (original): https://drive.google.com/uc?id=1cpXg0ULuTGF7h1_HTYvc6p8M-ee43t-v
From (redirected): https://drive.google.com/uc?id=1cpXg0ULuTGF7h1_HTYvc6p8M-ee43t-v&confirm=t&uuid=7038edf6-96e8-4336-ba81-59c4ce8a485d
To: /Users/jacob/.genomic_benchmarks/demo_coding_vs_intergenomic_seqs.zip
100%|██████████| 33.9M/33.9M [00:05<00:00, 6.37MB/s]


Loading train split for dataset: demo_coding_vs_intergenomic_seqs
Saving train split to genomic_benchmark_datasets/train_demo_coding_vs_intergenomic_seqs.pth
Loading test split for dataset: demo_coding_vs_intergenomic_seqs
Saving test split to genomic_benchmark_datasets/test_demo_coding_vs_intergenomic_seqs.pth
Processing dataset: human_enhancers_cohn


Downloading...
From (original): https://drive.google.com/uc?id=176563cDPQ5Y094WyoSBF02QjoVQhWuCh
From (redirected): https://drive.google.com/uc?id=176563cDPQ5Y094WyoSBF02QjoVQhWuCh&confirm=t&uuid=9c1fee05-4c26-4b19-9ed2-f11e5f220d59
To: /Users/jacob/.genomic_benchmarks/human_enhancers_cohn.zip
100%|██████████| 11.9M/11.9M [00:01<00:00, 9.52MB/s]


Loading train split for dataset: human_enhancers_cohn
Saving train split to genomic_benchmark_datasets/train_human_enhancers_cohn.pth
Loading test split for dataset: human_enhancers_cohn
Saving test split to genomic_benchmark_datasets/test_human_enhancers_cohn.pth
Processing dataset: human_ocr_ensembl


Downloading...
From (original): https://drive.google.com/uc?id=1y_LInRF2aRXysigpwv_oU3Q67VVxfk18
From (redirected): https://drive.google.com/uc?id=1y_LInRF2aRXysigpwv_oU3Q67VVxfk18&confirm=t&uuid=791c0608-c29d-44fe-a4e3-5325e67fff2e
To: /Users/jacob/.genomic_benchmarks/human_ocr_ensembl.zip
100%|██████████| 59.0M/59.0M [00:06<00:00, 8.77MB/s]


Loading train split for dataset: human_ocr_ensembl
Saving train split to genomic_benchmark_datasets/train_human_ocr_ensembl.pth
Loading test split for dataset: human_ocr_ensembl
Saving test split to genomic_benchmark_datasets/test_human_ocr_ensembl.pth
Processing dataset: human_enhancers_ensembl


Downloading...
From (original): https://drive.google.com/uc?id=1gZBEV_RGxJE8EON5OObdrp5Tp8JL0Fxb
From (redirected): https://drive.google.com/uc?id=1gZBEV_RGxJE8EON5OObdrp5Tp8JL0Fxb&confirm=t&uuid=0eaf8434-8844-4e3f-9f4f-a9892fff7bd3
To: /Users/jacob/.genomic_benchmarks/human_enhancers_ensembl.zip
100%|██████████| 51.1M/51.1M [00:05<00:00, 9.85MB/s]


Loading train split for dataset: human_enhancers_ensembl
Saving train split to genomic_benchmark_datasets/train_human_enhancers_ensembl.pth
Loading test split for dataset: human_enhancers_ensembl
Saving test split to genomic_benchmark_datasets/test_human_enhancers_ensembl.pth
Processing dataset: human_nontata_promoters


Downloading...
From (original): https://drive.google.com/uc?id=1VdUg0Zu8yfLS6QesBXwGz1PIQrTW3Ze4
From (redirected): https://drive.google.com/uc?id=1VdUg0Zu8yfLS6QesBXwGz1PIQrTW3Ze4&confirm=t&uuid=bb3dd3ef-c910-470c-95d7-b454c7e42005
To: /Users/jacob/.genomic_benchmarks/human_nontata_promoters.zip
100%|██████████| 11.8M/11.8M [00:01<00:00, 8.03MB/s]


Loading train split for dataset: human_nontata_promoters
Saving train split to genomic_benchmark_datasets/train_human_nontata_promoters.pth
Loading test split for dataset: human_nontata_promoters
Saving test split to genomic_benchmark_datasets/test_human_nontata_promoters.pth


In [4]:
import os
import torch

# 文件夹路径
dataset_dir = "genomic_benchmark_datasets"

# 获取所有 .pth 文件
pth_files = [f for f in os.listdir(dataset_dir) if f.endswith(".pth")]

# 转换为 .pt 文件
for file in pth_files:
    pth_path = os.path.join(dataset_dir, file)
    pt_path = os.path.join(dataset_dir, file.replace(".pth", ".pt"))
    
    # 加载并重新保存
    data = torch.load(pth_path)
    torch.save(data, pt_path)
    print(f"Converted {file} to {pt_path}")

print("All files converted to .pt format.")

  data = torch.load(pth_path)


Converted test_human_ocr_ensembl.pth to genomic_benchmark_datasets/test_human_ocr_ensembl.pt
Converted test_human_enhancers_ensembl.pth to genomic_benchmark_datasets/test_human_enhancers_ensembl.pt
Converted train_human_ocr_ensembl.pth to genomic_benchmark_datasets/train_human_ocr_ensembl.pt
Converted test_human_ensembl_regulatory.pth to genomic_benchmark_datasets/test_human_ensembl_regulatory.pt
Converted train_human_ensembl_regulatory.pth to genomic_benchmark_datasets/train_human_ensembl_regulatory.pt
Converted test_human_enhancers_cohn.pth to genomic_benchmark_datasets/test_human_enhancers_cohn.pt
Converted test_human_nontata_promoters.pth to genomic_benchmark_datasets/test_human_nontata_promoters.pt
Converted train_human_enhancers_cohn.pth to genomic_benchmark_datasets/train_human_enhancers_cohn.pt
Converted train_drosophila_enhancers_stark.pth to genomic_benchmark_datasets/train_drosophila_enhancers_stark.pt
Converted train_human_enhancers_ensembl.pth to genomic_benchmark_datasets

In [10]:
import os
import torch
import pandas as pd

# 数据集路径
dataset_dir = "./genomic_benchmark_datasets"

# 数据集名称列表
datasets = [
    'drosophila_enhancers_stark',
    'demo_human_or_worm',
    'demo_coding_vs_intergenomic_seqs',
    'human_enhancers_cohn',
    'human_ocr_ensembl',
    'human_ensembl_regulatory',
    'human_enhancers_ensembl',
    'human_nontata_promoters'
]

# 初始化结果列表
dataset_summary = []

# 遍历数据集
for dataset in datasets:
    train_file = os.path.join(dataset_dir, f"train_{dataset}.pt")
    test_file = os.path.join(dataset_dir, f"test_{dataset}.pt")

    print(f"Processing dataset: {dataset}")
    
    train_length, test_length = 0, 0
    train_first_seq_len, test_first_seq_len = 0, 0
    train_positive, train_negative = 0, 0
    test_positive, test_negative = 0, 0

    # 加载训练集
    if os.path.exists(train_file):
        train_data = torch.load(train_file)
        train_length = len(train_data["sequences"])
        train_first_seq_len = len(train_data["sequences"][0]) if train_length > 0 else 0
        train_positive = sum(1 for label in train_data["labels"] if label == 1)
        train_negative = sum(1 for label in train_data["labels"] if label == 0)
        del train_data  # 释放内存

    # 加载测试集
    if os.path.exists(test_file):
        test_data = torch.load(test_file)
        test_length = len(test_data["sequences"])
        test_first_seq_len = len(test_data["sequences"][0]) if test_length > 0 else 0
        test_positive = sum(1 for label in test_data["labels"] if label == 1)
        test_negative = sum(1 for label in test_data["labels"] if label == 0)
        del test_data  # 释放内存

    # 保存到结果列表
    dataset_summary.append({
        "dataset": dataset,
        "train_length": train_length,
        "test_length": test_length,
        "train_first_seq_len": train_first_seq_len,
        "test_first_seq_len": test_first_seq_len,
        "train_positive": train_positive,
        "train_negative": train_negative,
        "test_positive": test_positive,
        "test_negative": test_negative
    })

# 转换为 Pandas DataFrame
summary_df = pd.DataFrame(dataset_summary)

# 打印并保存数据集详情
print(summary_df)
summary_df.to_csv("dataset_summary.csv", index=False)

# import ace_tools as tools; tools.display_dataframe_to_user(name="Dataset Summary Information", dataframe=summary_df)

Processing dataset: drosophila_enhancers_stark
Processing dataset: demo_human_or_worm
Processing dataset: demo_coding_vs_intergenomic_seqs
Processing dataset: human_enhancers_cohn
Processing dataset: human_ocr_ensembl
Processing dataset: human_ensembl_regulatory


  train_data = torch.load(train_file)
  test_data = torch.load(test_file)


Processing dataset: human_enhancers_ensembl
Processing dataset: human_nontata_promoters
                            dataset  train_length  test_length  \
0        drosophila_enhancers_stark          5184         1730   
1                demo_human_or_worm         75000        25000   
2  demo_coding_vs_intergenomic_seqs         75000        25000   
3              human_enhancers_cohn         20843         6948   
4                 human_ocr_ensembl        139804        34952   
5          human_ensembl_regulatory        231348        57713   
6           human_enhancers_ensembl        123872        30970   
7           human_nontata_promoters         27097         9034   

   train_first_seq_len  test_first_seq_len  train_positive  train_negative  \
0                 2205                2077            2592            2592   
1                  200                 200           37500           37500   
2                  200                 200           37500           37500   
3    