In [None]:
import h5py
import pandas as pd

def hdf5_group_to_dataframe(group):
    """Build a DataFrame with one column per entry of ``group``.

    Each entry yielded by ``group.items()`` is read in full with ``item[()]``
    and stored under its key. The entry named ``'_all_tag_keys'`` is left out.
    """
    columns = {
        name: member[()]
        for name, member in group.items()
        if name != '_all_tag_keys'
    }
    return pd.DataFrame(columns)

def read_h5_to_dataframe(file_path):
    """Load the ``'features'`` member of an HDF5 file into a DataFrame.

    The file is opened read-only and the first name returned by its ``keys()``
    is selected. The keys of that entry are printed, then its ``'features'``
    member is converted with ``hdf5_group_to_dataframe``.
    """
    with h5py.File(file_path, 'r') as h5_file:
        first_key = list(h5_file.keys())[0]
        root = h5_file[first_key]
        print(root.keys())
        # The conversion runs while the file handle is still open.
        features_df = hdf5_group_to_dataframe(root['features'])
    return features_df

def byte_to_str(value):
    """Decode ``value`` as UTF-8 when it is ``bytes``; otherwise return it unchanged."""
    return value.decode('utf-8') if isinstance(value, bytes) else value
    

# Load the feature table and keep only the ATAC peak rows on 'chr'-named sequences.
file = '../dataset/GSM5123953.h5'
df = read_h5_to_dataframe(file)
# The comparison uses a bytes literal because decoding only happens in the next step.
df_atac = df[df['feature_type'] == b'Peaks']
# Decode column by column with Series.map: DataFrame.applymap is deprecated
# since pandas 2.1, while this form works on both older and newer versions.
df_atac = df_atac.apply(lambda column: column.map(byte_to_str))
# na=False treats a missing interval as "no match" instead of putting NaN in the mask.
df_atac = df_atac[df_atac['interval'].str.startswith('chr', na=False)]
print(df_atac)

<KeysViewHDF5 ['barcodes', 'data', 'features', 'indices', 'indptr', 'shape']>
       feature_type  genome                      id                interval  \
36601         Peaks  GRCh38        chr1:10002-10473        chr1:10002-10473   
36602         Peaks  GRCh38      chr1:180625-181988      chr1:180625-181988   
36603         Peaks  GRCh38      chr1:267838-268162      chr1:267838-268162   
36604         Peaks  GRCh38      chr1:629773-630119      chr1:629773-630119   
36605         Peaks  GRCh38      chr1:633794-634259      chr1:633794-634259   
...             ...     ...                     ...                     ...   
123932        Peaks  GRCh38  chrY:56841958-56843017  chrY:56841958-56843017   
123933        Peaks  GRCh38  chrY:56843811-56844129  chrY:56843811-56844129   
123934        Peaks  GRCh38  chrY:56847098-56848101  chrY:56847098-56848101   
123935        Peaks  GRCh38  chrY:56849235-56849566  chrY:56849235-56849566   
123936        Peaks  GRCh38  chrY:56850169-56851342  

In [None]:
from pyfaidx import Fasta

# Genome FASTA; download from NCBI.
genome_file = r'GCF_000001405.40_GRCh38.p14_genomic.fna'

# Maps the 'chr'-prefixed names found in the peak intervals to the sequence
# names used to index the FASTA.
dict_chr = {
    'chr1': 'NC_000001.11',
    'chr2': 'NC_000002.12',
    'chr3': 'NC_000003.12',
    'chr4': 'NC_000004.12',
    'chr5': 'NC_000005.10',
    'chr6': 'NC_000006.12',
    'chr7': 'NC_000007.14',
    'chr8': 'NC_000008.11',
    'chr9': 'NC_000009.12',
    'chr10': 'NC_000010.11',
    'chr11': 'NC_000011.10',
    'chr12': 'NC_000012.12',
    'chr13': 'NC_000013.11',
    'chr14': 'NC_000014.9',
    'chr15': 'NC_000015.10',
    'chr16': 'NC_000016.10',
    'chr17': 'NC_000017.11',
    'chr18': 'NC_000018.10',
    'chr19': 'NC_000019.10',
    'chr20': 'NC_000020.11',
    'chr21': 'NC_000021.9',
    'chr22': 'NC_000022.11',
    'chrX': 'NC_000023.11',
    'chrY': 'NC_000024.10',
    'chrM': 'NC_012920.1',
}

genome = Fasta(genome_file)

In [None]:
# Fetch the genomic sequence under every ATAC peak and write the result to CSV.
sequences = []
for peak in df_atac['interval'].values:
    # Intervals have the form '<chromosome>:<start>-<end>'. The chromosome is
    # bound to `chrom` rather than `chr` so the builtin chr() is not
    # overwritten for the rest of the kernel session.
    chrom, loc = peak.split(':')
    start, end = loc.split('-')
    # NOTE(review): start/end are passed straight through as slice bounds —
    # confirm this matches the coordinate convention of the peak intervals.
    sequences.append(str(genome[dict_chr[chrom]][int(start):int(end)]))

df_sequence = pd.DataFrame({'interval': df_atac['interval'].values, 'sequence': sequences})
df_sequence.to_csv('atac_sequence.csv')