In [None]:
import h5py
import numpy as np
import hdf5plugin  # Ensure this is imported to enable the plugins


In [None]:
import numpy as np

def nucleotide_to_index(seq, encode_spec=None):
    """
    Convert a DNA sequence to integer indices.

    Parameters:
    seq (str): A string representing a DNA sequence.
    encode_spec (dict, optional): Mapping from nucleotide character to integer
        index. Defaults to {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}.

    Returns:
    np.array: An int8 array with one index per character of `seq`. Characters
        missing from `encode_spec` are mapped to the index that `encode_spec`
        assigns to 'N', or to 4 when the mapping has no 'N' entry.
    """
    if encode_spec is None:
        encode_spec = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
    # Resolve the fallback once from the mapping itself so that a custom
    # encoding which places 'N' somewhere other than 4 still sends unknown
    # characters to its own 'N' slot.
    unknown_index = encode_spec.get('N', 4)
    return np.array(
        [encode_spec.get(nuc, unknown_index) for nuc in seq],
        dtype=np.int8,
    )

def bitpack_indices(indices):
    """
    Pack nucleotide indices into a 3-bit representation to include `N`.

    Each index is written as three bits, least-significant bit first, into a
    continuous bit stream; the stream is then packed into bytes with the
    first bit of the stream in the lowest bit of the first byte. The final
    byte is zero-padded when the number of bits is not a multiple of eight.

    Parameters:
    indices (np.array): Array of nucleotide indices, each in the range 0-7.

    Returns:
    np.array: 1-D uint8 array holding ceil(3 * len(indices) / 8) bytes.

    Raises:
    ValueError: If any index is negative or greater than 7.
    """
    flat = np.asarray(indices).ravel()
    if flat.size and (flat.min() < 0 or flat.max() > 7):
        raise ValueError("Nucleotide indices must be in the range 0-7 to fit in 3 bits.")
    # np.packbits treats every non-zero element as a single 1 bit, so the
    # indices must be expanded into their individual bits before packing.
    shifts = np.arange(3, dtype=np.uint8)
    bits = (flat.astype(np.uint8)[:, np.newaxis] >> shifts) & 1
    return np.packbits(bits.ravel(), bitorder='little')


def index_to_onehot(indices, encode_spec=None):
    """
    Convert nucleotide indices to one-hot encoding.

    Parameters:
    indices (np.array): Array of nucleotide indices.
    encode_spec (dict, optional): Encoding specification for nucleotides; only
        its length is used, as the number of one-hot classes. Defaults to
        {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}.

    Returns:
    np.array: One-hot encoded representation of the indices, with one extra
        trailing axis of length len(encode_spec).
    """
    if encode_spec is None:
        encode_spec = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
    num_classes = len(encode_spec)
    # Row i of the identity matrix is the one-hot vector for class i.
    return np.eye(num_classes)[indices]


def unpack_bits(packed_data, count=None):
    """
    Unpack 3-bit packed data back to nucleotide indices.

    This reverses the layout in which each index occupies three consecutive
    bits of the stream, least-significant bit first, and the stream is stored
    in bytes starting from the lowest bit of the first byte.

    Parameters:
    packed_data (np.array): Packed array of nucleotide indices (bytes).
    count (int, optional): Number of indices to recover. When omitted, every
        complete 3-bit group in the data is decoded, which can include up to
        two trailing zero indices that come from byte padding.

    Returns:
    np.array: 1-D int8 array of nucleotide indices.

    Raises:
    ValueError: If `count` is negative or exceeds the number of complete
        3-bit groups available in `packed_data`.
    """
    raw = np.asarray(packed_data, dtype=np.uint8).ravel()
    bits = np.unpackbits(raw, bitorder='little')
    available = bits.size // 3
    if count is None:
        count = available
    elif count < 0 or count > available:
        raise ValueError(
            f"count must be between 0 and {available}, got {count}."
        )
    # Regroup the stream into 3-bit rows and weight them LSB-first.
    triples = bits[:count * 3].reshape(-1, 3)
    weights = np.array([1, 2, 4], dtype=np.uint8)
    return triples.dot(weights).astype(np.int8)


def parse_encode_dict(encode_spec):
    """
    Normalise an encoding specification to a nucleotide -> index dictionary.

    Parameters:
    encode_spec (str, list, tuple, or dict): Encoding specification. A falsy
        value selects the default A/C/G/T/N mapping; a sequence is numbered
        by position; a non-empty dict is returned unchanged.

    Returns:
    dict: Parsed encoding specification.

    Raises:
    TypeError: If `encode_spec` is truthy but not a dict, list, tuple or str.
    """
    # Anything falsy (None, "", [], {}, ...) means "use the default".
    if not encode_spec:
        return {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}

    if isinstance(encode_spec, dict):
        return encode_spec

    if isinstance(encode_spec, (str, list, tuple)):
        positions = range(len(encode_spec))
        return dict(zip(encode_spec, positions))

    raise TypeError("Please input as dict, list or string!")

In [None]:

def read_and_verify_h5(file_path):
    """
    Print the 'snp_data' dataset stored under every donor/chromosome group.

    The file is opened read-only and walked as file -> donor -> chromosome;
    for each chromosome the full 'snp_data' dataset is loaded and printed.

    Parameters:
    file_path (str): Path to the HDF5 file to inspect.

    Returns:
    None
    """
    with h5py.File(file_path, 'r') as h5_file:
        for donor, donor_group in h5_file.items():
            for chrom, chrom_group in donor_group.items():
                print(f"Verifying data for Donor: {donor}, Chromosome: {chrom}")

                # Load the whole SNP table into memory.
                # NOTE: unpacking of the 'genotypes' dataset and decoding of
                # REF/ALT indices back to nucleotides are currently disabled;
                # only the SNP table is reported.
                snp_data = chrom_group['snp_data'][:]

                print(f"SNP Data: {snp_data}")
                print("\n")

# Script entry point: print the SNP tables found in one fixed HDF5 file.
if __name__ == "__main__":
    # NOTE(review): hard-coded absolute path, presumably specific to the
    # author's machine -- adjust before running elsewhere.
    file_path = '/iblm/netapp/data4/jjaureguy/vcf_cuda/HaploHyped-VarAwareML/tests/out/ipscs.h5'
    read_and_verify_h5(file_path)
