In [1]:
from __future__ import annotations
from io import BytesIO

import numpy as np
import oxbow as ox
import pandas as pd
import polars as pl
import pyarrow as pa
import pysam

In [4]:
# Lookup table translating the value types declared in a VCF header
# ("Integer", "Float", "String", "Flag") into numpy/pandas dtype names.
TYPE_MAP = dict(
    Integer="int64",
    Float="float64",
    String="object",
    Flag="bool",
)


def read_vcf_as_pandas(vcf_path: str) -> pd.DataFrame:
    """
    Load a VCF into a pandas DataFrame via oxbow.

    Parameters
    ----------
    vcf_path : str
        Path of the VCF, passed unchanged to ``ox.read_vcf``.

    Returns
    -------
    pd.DataFrame
        Result of ``read_pandas()`` on the Arrow IPC file reader opened over
        the buffer returned by ``ox.read_vcf``.
    """
    # Wrap oxbow's in-memory result in a file-like object for the IPC reader.
    buffer = BytesIO(ox.read_vcf(vcf_path))
    reader = pa.ipc.open_file(buffer)
    return reader.read_pandas()


def read_vcf_as_polars(vcf_path: str) -> pl.DataFrame:
    """
    Load a VCF into a polars DataFrame via oxbow.

    Parameters
    ----------
    vcf_path : str
        Path of the VCF, passed unchanged to ``ox.read_vcf``.

    Returns
    -------
    pl.DataFrame
        Result of handing the buffer returned by ``ox.read_vcf`` directly to
        ``pl.read_ipc``.
    """
    return pl.read_ipc(ox.read_vcf(vcf_path))


def read_info_schema(vcf_path: str) -> pd.DataFrame:
    """
    Summarize the INFO field definitions declared in a VCF header.

    The header is read with pysam; one row is produced per INFO definition,
    in the order pysam yields them.

    Parameters
    ----------
    vcf_path : str
        Path to bgzipped vcf, with tabix indexed .tbi file with similar name
        in same folder.

    Returns
    -------
    pd.DataFrame
        Dataframe with [name, number, type] columns.

    Notes
    -----
    Possible values for `type` are: "Integer", "Float", "String", "Flag".

    Possible values for `number` are:
        - An integer (e.g. 0, 1, 2, 3, 4, etc.) - for fields where the number
          of values per VCF record is fixed. 0 means the field is a "Flag".
        - One of the strings "A", "R" or "G" - for fields where the number of
          values per VCF record depends on the record's alleles:
            * "A": one value per alternate allele
            * "R": one value per allele (reference and alternates)
            * "G": one value per possible genotype
        - A dot (".") - for fields where the number of values per VCF record
          varies, is unknown, or is unbounded.
    """
    schema_columns = ["name", "number", "type"]
    # Collect plain Python values while the file is open; the DataFrame itself
    # can then be built after the handle has been closed.
    with pysam.VariantFile(vcf_path) as vcf:
        rows = [
            (field.name, field.number, field.type)
            for field in vcf.header.info.values()
        ]
    return pd.DataFrame(rows, columns=schema_columns)


In [5]:
import pysam

In [6]:
# Open the example VCF with pysam. The handle `f` is reused by the following
# cells to explore the header, so it is kept open instead of using `with`.
f = pysam.VariantFile("DRR.vcf.gz")


<pysam.libcbcf.VariantFile at 0x10b27fa30>

In [7]:
# Display the header object attached to the open VCF handle.
f.header

<pysam.libcbcf.VariantHeader at 0x10b16e9d0>

In [10]:
# List the attributes of the header, hiding double-underscore (dunder) names.
[attr for attr in dir(f.header) if not attr.startswith('__')]


['add_line',
 'add_meta',
 'add_record',
 'add_sample',
 'add_samples',
 'alts',
 'contigs',
 'copy',
 'filters',
 'formats',
 'info',
 'merge',
 'new_record',
 'records',
 'samples',
 'version']

In [14]:
# Inspect the first contig declared in the VCF header.
# (A leading `keys(), values()` expression was dropped: only the last
# expression of a cell is displayed, so its result was built and discarded.)
contig = f.header.contigs.values()[0]
contig.id, contig.length, contig.name

(0, 4857450, 'NC_003197_v2#0#genome')

In [17]:
# Inspect the first FILTER definition declared in the VCF header.
# Bound to `filter_meta` rather than `filter` so the builtin `filter()` is not
# shadowed for the rest of the kernel session.
# (A leading `keys(), values()` expression was dropped: only the last
# expression of a cell is displayed, so its result was built and discarded.)
filter_meta = f.header.filters.values()[0]
filter_meta.description, filter_meta.id, filter_meta.number, filter_meta.name

('All filters passed', 0, None, 'PASS')

In [20]:
# Inspect the first INFO field definition declared in the VCF header.
# (A leading `keys(), values()` expression was dropped: only the last
# expression of a cell is displayed, so its result was built and discarded.)
info_field = f.header.info.values()[0]
info_field.name, info_field.number, info_field.type, info_field.description

('CONFLICT',
 '.',
 'String',
 'Sample names for which there are multiple paths in the graph with conflicting alleles')

In [24]:
# Materialize the sample names declared in the VCF header as a list.
list(f.header.samples)

['DRR452334']