In [2]:
from io import BytesIO
import oxbow as ox
import pandas as pd
import polars as pl
import pyarrow as pa
import pysam
import numpy as np


This notebook is a proof of concept list of functions that allow us to flatten the INFO column of VCF files into a pandas DataFrame. First we define a set of functions which can parse the VCF file. In this notebook, we use Oxbow to read in the VCF and pysame to parse the header information that contains the mapping of the various key-value pairs in the INFO column. 

Below is an example VCF header. Note how each element of the INFO field is defined line by line. We need to read this into a mapping to parse that column.

```
##fileformat=VCFv4.2
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=CONFLICT,Number=.,Type=String,Description="Sample names for which there are multiple paths in the graph with conflicting alleles">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes">
##INFO=<ID=AF,Number=A,Type=Float,Description="Estimated allele frequency in the range (0,1]">
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=LV,Number=1,Type=Integer,Description="Level in the snarl tree (0=top level)">
##INFO=<ID=PS,Number=1,Type=String,Description="ID of variant corresponding to parent snarl">
##INFO=<ID=AT,Number=R,Type=String,Description="Allele Traversal as path in graph">
##contig=<ID=NC_003197_v2#0#genome,length=4857450>
##contig=<ID=NC_003197_v2#genome,length=4857450>
```

In [None]:
# This dictionary maps the VCF-derived input types to python dtypes
TYPE_MAP = {

"""
This is a dictionary that maps generic type names used in the VCF INFO column to specific data typr

"""    "Integer": "int32",
    "
Float": "float64",
    "String": "object",
    "Flag": "bool",
}


def read_vcf_pandas(vcf_path):
    ipc = ox.read_vcf(vcf_path)
    return pa.ipc.open_file(BytesIO(ipc)).read_pandas()


def read_vcf_polars(vcf_path):
    ipc = ox.read_vcf(vcf_path)
    return pl.read_ipc(ipc)


def read_info_schema(vcf_pat
    h):
    """ 
    Read the schema of the INFO column of    We use pysam to do this.

    Parameters
    ----------
    vcf_path : str
        Path to bgzipped vcf, with tabix indexed .tbi file with similar name 
        in same folder.
    
    Returns
    ----------
      pandas DataFrame with columns ["name", "number", "type""]
    """
    with pysam.VariantFile(vcf_path) as f:
        return pd.DataFrame(
            [(obj.name, obj.number, obj.type) for obj in f.header.info.values()],
            columns=["name", "number", "type"],
        )
    

def flatten_info(df: pd.DataFrame, info_schema: pd.DataFrame) -> pd.DataFrame:
    """
    Flattens INFO column of a VCF by one

    Parameters
    ------
    df : pd.DataFrame 
    info_schema: pd.DataFrame

    Returns
    ------
    pd.DataFrame

    """
    type_series = info_schema.set_index("name")["type"]
    arity_series = info_schema.set_index("name")["number"]

    # Split the key-value pairs in the info column into dictionaries
    records = (
        df["info"]
            .str
            .split(";")
            .apply(
                lambda info_list: dict(x.split("=")                 
                if '=' in x else (x, True) 
                for x in info_list
            )
        )
    )

    # Convert to dataframe
    info_df = pd.DataFrame.from_records (records)

    # Split comma-separated values into lists
    for column in info_df.columns:
        if arity_series[column] not in {".", "1", "0"}:
            info_df[column] = info_df[column].str.split(",")

    # Convert to correct dtypes
    for column in info_df.columns:
        data_type = TYPE_MAP[type_series[column]]

        if arity_series[column] not in {".", "1", "0"}:
            # info_df[column] = info_df[column].apply(
            #     lambda x: x if x is pd.NA else [y for y in x]
            # )
            pass
        else:
            info_df[column] = info_df[column].astype(data_type)

    return info_df

Here is an example of these functions parsing a VCF's INFO column. A reproduction of the first few data lines of the VCF is below:

```
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	DRR259112
NC_045512.2	241	.	C	T	4345.64	PASS	AC=1;AF=0.5;AN=2;BaseQRankSum=-0.372;DP=679;ExcessHet=0;FS=0.546;MLEAC=1;MLEAF=0.5;MQ=60;MQRankSum=0;QD=6.43;ReadPosRankSum=-0.585;SOR=0.653;EFF=INTERGENIC(MODIFIER||||||||||T)	GT:AD:DP:GQ:PL	0/1:445,231:676:99:4353,0,10881
NC_045512.2	316	.	TTT	TT	3583.6	PASS	AC=1;AF=0.5;AN=2;BaseQRankSum=1.07;DP=408;ExcessHet=0;FS=7.647;MLEAC=1;MLEAF=0.5;MQ=60;MQRankSum=0;QD=8.8;ReadPosRankSum=-2.848;SOR=1.3;EFF=FRAME_SHIFT(HIGH||ttg/|L18|179|leader|protein_coding|CODING|TRANSCRIPT_leader-gene|1|TT|WARNING_TRANSCRIPT_NO_STOP_CODON);LOF=(leader|leader-gene|1|1.00)	GT:AD:DP:GQ:PL	0/1:272,135:407:99:3591,0,8449
NC_045512.2	2662	.	C	T	1702.06	PASS	AC=2;AF=1;AN=2;DP=70;ExcessHet=0;FS=0;MLEAC=2;MLEAF=1;MQ=60;QD=24.32;SOR=0.75;EFF=SYNONYMOUS_CODING(LOW|SILENT|taC/taT|Y619|637|nsp2|protein_coding|CODING|TRANSCRIPT_nsp2-gene|1|T|WARNING_TRANSCRIPT_NO_START_CODON)	GT:AD:DP:GQ:PL	1/1:0,70:70:99:1716,209,0

```

In [28]:

vcf_path = "../DRR259112.ref.snpeff.vcf.gz"

df = read_vcf_pandas(vcf_path)
info_schema = read_info_schema(vcf_path)

info_df = flatten_info(df, info_schema)
info_df


[W::hts_idx_load3] The index file is older than the data file: ../DRR259112.ref.snpeff.vcf.gz.tbi


TypeError: 'float' object is not iterable

In [29]:

vcf_path = "../DRR.vcf.gz"

df = read_vcf_pandas(vcf_path)
info_schema = read_info_schema(vcf_path)

info_df = flatten_info(df, info_schema)
info_df


[W::hts_idx_load3] The index file is older than the data file: ../DRR.vcf.gz.tbi


Unnamed: 0,AC,AF,AN,AT,NS,LV
0,[12],[0.121212],[99],"[>8>10>11, >8<9>11]",[99],[0]
1,"[2, 5]","[0.020202, 0.0505051]",[99],"[>12>13>16, >12<15>16, >12<14>16]",[99],[0]
2,[6],[0.0606061],[99],"[>16>17>19, >16<18>19]",[99],[0]
3,[54],[0.545455],[99],"[>19>21>22, >19<20>22]",[99],[0]
4,"[2, 4]","[0.02, 0.04]",[100],"[>22>23>25>27, >22<24>25>27, >22>23<26>27]",[100],[0]
...,...,...,...,...,...,...
482,[2],[0.02],[100],"[>2561>2562>2564, >2561<2563>2564]",[100],[0]
483,[2],[0.02],[100],"[>2566>2567>2569, >2566<2568>2569]",[100],[0]
484,[20],[0.2],[100],"[>2569>2570>2572, >2569<2571>2572]",[100],[0]
485,[10],[0.1],[100],"[>2574>2575>2577, >2574<2576>2577]",[100],[0]
