In [8]:
import dask
import dask.dataframe as dd
import numpy as np
import oxbow as ox
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.parquet

from simplevcf import read_vcf_as_pandas, read_vcf_as_polars, read_info_schema, read_sample_schema

### Introspect

In [9]:
# Tabulate the INFO fields of the VCF: one row per field, giving its name,
# number (cardinality code), value type and description.
read_info_schema("data/DRR.vcf.gz")

Unnamed: 0,name,number,type,description
0,CONFLICT,.,String,Sample names for which there are multiple path...
1,AC,A,Integer,Total number of alternate alleles in called ge...
2,AF,A,Float,"Estimated allele frequency in the range (0,1]"
3,NS,1,Integer,Number of samples with data
4,AN,1,Integer,Total number of alleles in called genotypes
5,LV,1,Integer,Level in the snarl tree (0=top level)
6,PS,1,String,ID of variant corresponding to parent snarl
7,AT,R,String,Allele Traversal as path in graph


In [10]:
# Tabulate the per-sample fields of the same VCF, in the same
# name/number/type/description layout as the INFO schema.
read_sample_schema("data/DRR.vcf.gz")

Unnamed: 0,name,number,type,description
0,GT,1,String,Genotype


### Flatten and extract

In [11]:
# Load the records into a pandas DataFrame. With include_unspecified=False the
# AD sample field (which the parser warns is not defined in the header) does
# not get a column in the result.
read_vcf_as_pandas("data/DRR.vcf.gz", include_unspecified=False)

[W::vcf_parse_format] FORMAT 'AD' at genome:18 is not defined in the header, assuming Type=String


Unnamed: 0,chrom,pos,id,ref,alts,qual,filters,CONFLICT,AC,AF,NS,AN,LV,PS,AT,DRR452334.phased,DRR452334.GT
0,genome,18,>8>11,G,[T],60.0,[],,[12],[0.12121199816465378],99,99,0,,"[>8>10>11, >8<9>11]",True,[0]
1,genome,26,>12>16,T,"[G, C]",60.0,[],,"[2, 5]","[0.020201999694108963, 0.050505101680755615]",99,99,0,,"[>12>13>16, >12<15>16, >12<14>16]",True,[0]
2,genome,28,>16>19,A,[G],60.0,[],,[6],[0.06060609966516495],99,99,0,,"[>16>17>19, >16<18>19]",True,[0]
3,genome,30,>19>22,G,[A],60.0,[],,[54],[0.5454549789428711],99,99,0,,"[>19>21>22, >19<20>22]",True,[0]
4,genome,35,>22>27,GG,"[AG, GT]",60.0,[],,"[2, 4]","[0.019999999552965164, 0.03999999910593033]",100,100,0,,"[>22>23>25>27, >22<24>25>27, >22>23<26>27]",True,[0]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482,genome,9544,>2561>2564,T,[C],60.0,[],,[2],[0.019999999552965164],100,100,0,,"[>2561>2562>2564, >2561<2563>2564]",True,[0]
483,genome,9592,>2566>2569,C,[T],60.0,[],,[2],[0.019999999552965164],100,100,0,,"[>2566>2567>2569, >2566<2568>2569]",True,[0]
484,genome,9598,>2569>2572,G,[A],60.0,[],,[20],[0.20000000298023224],100,100,0,,"[>2569>2570>2572, >2569<2571>2572]",True,[0]
485,genome,9634,>2574>2577,T,[C],60.0,[],,[10],[0.10000000149011612],100,100,0,,"[>2574>2575>2577, >2574<2576>2577]",True,[0]


In [12]:
# Same read with include_unspecified=True: the undeclared AD field now shows up
# as an extra DRR452334.AD column.
read_vcf_as_pandas("data/DRR.vcf.gz", include_unspecified=True)

[W::vcf_parse_format] FORMAT 'AD' at genome:18 is not defined in the header, assuming Type=String


Unnamed: 0,chrom,pos,id,ref,alts,qual,filters,CONFLICT,AC,AF,NS,AN,LV,PS,AT,DRR452334.phased,DRR452334.GT,DRR452334.AD
0,genome,18,>8>11,G,[T],60.0,[],,[12],[0.12121199816465378],99,99,0,,"[>8>10>11, >8<9>11]",True,[0],"[70, 0]"
1,genome,26,>12>16,T,"[G, C]",60.0,[],,"[2, 5]","[0.020201999694108963, 0.050505101680755615]",99,99,0,,"[>12>13>16, >12<15>16, >12<14>16]",True,[0],"[67, 0, 0]"
2,genome,28,>16>19,A,[G],60.0,[],,[6],[0.06060609966516495],99,99,0,,"[>16>17>19, >16<18>19]",True,[0],"[65, 1]"
3,genome,30,>19>22,G,[A],60.0,[],,[54],[0.5454549789428711],99,99,0,,"[>19>21>22, >19<20>22]",True,[0],"[64, 0]"
4,genome,35,>22>27,GG,"[AG, GT]",60.0,[],,"[2, 4]","[0.019999999552965164, 0.03999999910593033]",100,100,0,,"[>22>23>25>27, >22<24>25>27, >22>23<26>27]",True,[0],"[62, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482,genome,9544,>2561>2564,T,[C],60.0,[],,[2],[0.019999999552965164],100,100,0,,"[>2561>2562>2564, >2561<2563>2564]",True,[0],"[122, 0]"
483,genome,9592,>2566>2569,C,[T],60.0,[],,[2],[0.019999999552965164],100,100,0,,"[>2566>2567>2569, >2566<2568>2569]",True,[0],"[122, 0]"
484,genome,9598,>2569>2572,G,[A],60.0,[],,[20],[0.20000000298023224],100,100,0,,"[>2569>2570>2572, >2569<2571>2572]",True,[0],"[125, 0]"
485,genome,9634,>2574>2577,T,[C],60.0,[],,[10],[0.10000000149011612],100,100,0,,"[>2574>2575>2577, >2574<2576>2577]",True,[0],"[119, 0]"


In [13]:
# Same file read into a polars DataFrame (declared fields only), then displayed.
df = read_vcf_as_polars("data/DRR.vcf.gz", include_unspecified=False)
df

[W::vcf_parse_format] FORMAT 'AD' at genome:18 is not defined in the header, assuming Type=String


chrom,pos,id,ref,alts,qual,filters,CONFLICT,AC,AF,NS,AN,LV,PS,AT,DRR452334.phased,DRR452334.GT
str,i64,str,str,list[str],f64,list[null],null,list[i64],list[f64],i64,i64,i64,null,list[str],bool,list[i64]
"""genome""",18,""">8>11""","""G""","[""T""]",60.0,[],,[12],[0.121212],99,99,0,,"["">8>10>11"", "">8<9>11""]",true,[0]
"""genome""",26,""">12>16""","""T""","[""G"", ""C""]",60.0,[],,"[2, 5]","[0.020202, 0.050505]",99,99,0,,"["">12>13>16"", "">12<15>16"", "">12<14>16""]",true,[0]
"""genome""",28,""">16>19""","""A""","[""G""]",60.0,[],,[6],[0.060606],99,99,0,,"["">16>17>19"", "">16<18>19""]",true,[0]
"""genome""",30,""">19>22""","""G""","[""A""]",60.0,[],,[54],[0.545455],99,99,0,,"["">19>21>22"", "">19<20>22""]",true,[0]
"""genome""",35,""">22>27""","""GG""","[""AG"", ""GT""]",60.0,[],,"[2, 4]","[0.02, 0.04]",100,100,0,,"["">22>23>25>27"", "">22<24>25>27"", "">22>23<26>27""]",true,[0]
"""genome""",38,""">27>30""","""G""","[""T""]",60.0,[],,[2],[0.020202],99,99,0,,"["">27>28>30"", "">27<29>30""]",true,[0]
"""genome""",46,""">33>36""","""T""","[""C""]",60.0,[],,[2],[0.02],100,100,0,,"["">33>34>36"", "">33<35>36""]",true,[0]
"""genome""",51,""">36>39""","""A""","[""C""]",60.0,[],,[2],[0.020202],99,99,0,,"["">36>37>39"", "">36<38>39""]",true,[0]
"""genome""",81,""">41>44""","""G""","[""A""]",60.0,[],,[11],[0.11],100,100,0,,"["">41>43>44"", "">41<42>44""]",true,[0]
"""genome""",124,""">48>51""","""G""","[""T""]",60.0,[],,[4],[0.04],100,100,0,,"["">48>49>51"", "">48<50>51""]",true,[0]


### Explode

In [14]:
# Look at the INFO schema again before exploding: in VCF, number "A" means one
# value per alternate allele, so those fields line up element-wise with `alts`.
schema = read_info_schema("data/DRR.vcf.gz")
schema

Unnamed: 0,name,number,type,description
0,CONFLICT,.,String,Sample names for which there are multiple path...
1,AC,A,Integer,Total number of alternate alleles in called ge...
2,AF,A,Float,"Estimated allele frequency in the range (0,1]"
3,NS,1,Integer,Number of samples with data
4,AN,1,Integer,Total number of alleles in called genotypes
5,LV,1,Integer,Level in the snarl tree (0=top level)
6,PS,1,String,ID of variant corresponding to parent snarl
7,AT,R,String,Allele Traversal as path in graph


In [17]:
# Reload as polars with undeclared fields included. This rebinds `df`, which is
# the frame the explode call below operates on.
df = read_vcf_as_polars("data/DRR.vcf.gz", include_unspecified=True)
df

[W::vcf_parse_format] FORMAT 'AD' at genome:18 is not defined in the header, assuming Type=String


chrom,pos,id,ref,alts,qual,filters,CONFLICT,AC,AF,NS,AN,LV,PS,AT,DRR452334.phased,DRR452334.GT,DRR452334.AD
str,i64,str,str,list[str],f64,list[null],null,list[i64],list[f64],i64,i64,i64,null,list[str],bool,list[i64],list[str]
"""genome""",18,""">8>11""","""G""","[""T""]",60.0,[],,[12],[0.121212],99,99,0,,"["">8>10>11"", "">8<9>11""]",true,[0],"[""70"", ""0""]"
"""genome""",26,""">12>16""","""T""","[""G"", ""C""]",60.0,[],,"[2, 5]","[0.020202, 0.050505]",99,99,0,,"["">12>13>16"", "">12<15>16"", "">12<14>16""]",true,[0],"[""67"", ""0"", ""0""]"
"""genome""",28,""">16>19""","""A""","[""G""]",60.0,[],,[6],[0.060606],99,99,0,,"["">16>17>19"", "">16<18>19""]",true,[0],"[""65"", ""1""]"
"""genome""",30,""">19>22""","""G""","[""A""]",60.0,[],,[54],[0.545455],99,99,0,,"["">19>21>22"", "">19<20>22""]",true,[0],"[""64"", ""0""]"
"""genome""",35,""">22>27""","""GG""","[""AG"", ""GT""]",60.0,[],,"[2, 4]","[0.02, 0.04]",100,100,0,,"["">22>23>25>27"", "">22<24>25>27"", "">22>23<26>27""]",true,[0],"[""62"", ""0"", ""0""]"
"""genome""",38,""">27>30""","""G""","[""T""]",60.0,[],,[2],[0.020202],99,99,0,,"["">27>28>30"", "">27<29>30""]",true,[0],"[""61"", ""0""]"
"""genome""",46,""">33>36""","""T""","[""C""]",60.0,[],,[2],[0.02],100,100,0,,"["">33>34>36"", "">33<35>36""]",true,[0],"[""63"", ""0""]"
"""genome""",51,""">36>39""","""A""","[""C""]",60.0,[],,[2],[0.020202],99,99,0,,"["">36>37>39"", "">36<38>39""]",true,[0],"[""68"", ""0""]"
"""genome""",81,""">41>44""","""G""","[""A""]",60.0,[],,[11],[0.11],100,100,0,,"["">41>43>44"", "">41<42>44""]",true,[0],"[""79"", ""0""]"
"""genome""",124,""">48>51""","""G""","[""T""]",60.0,[],,[4],[0.04],100,100,0,,"["">48>49>51"", "">48<50>51""]",true,[0],"[""71"", ""1""]"


In [18]:
# alts, AC and AF are list columns of equal length per record (AC and AF are
# number "A" in the schema), so exploding them together yields one row per
# alternate allele; all other columns are repeated on each resulting row.
df.explode(["alts", "AC", "AF"])

chrom,pos,id,ref,alts,qual,filters,CONFLICT,AC,AF,NS,AN,LV,PS,AT,DRR452334.phased,DRR452334.GT,DRR452334.AD
str,i64,str,str,str,f64,list[null],null,i64,f64,i64,i64,i64,null,list[str],bool,list[i64],list[str]
"""genome""",18,""">8>11""","""G""","""T""",60.0,[],,12,0.121212,99,99,0,,"["">8>10>11"", "">8<9>11""]",true,[0],"[""70"", ""0""]"
"""genome""",26,""">12>16""","""T""","""G""",60.0,[],,2,0.020202,99,99,0,,"["">12>13>16"", "">12<15>16"", "">12<14>16""]",true,[0],"[""67"", ""0"", ""0""]"
"""genome""",26,""">12>16""","""T""","""C""",60.0,[],,5,0.050505,99,99,0,,"["">12>13>16"", "">12<15>16"", "">12<14>16""]",true,[0],"[""67"", ""0"", ""0""]"
"""genome""",28,""">16>19""","""A""","""G""",60.0,[],,6,0.060606,99,99,0,,"["">16>17>19"", "">16<18>19""]",true,[0],"[""65"", ""1""]"
"""genome""",30,""">19>22""","""G""","""A""",60.0,[],,54,0.545455,99,99,0,,"["">19>21>22"", "">19<20>22""]",true,[0],"[""64"", ""0""]"
"""genome""",35,""">22>27""","""GG""","""AG""",60.0,[],,2,0.02,100,100,0,,"["">22>23>25>27"", "">22<24>25>27"", "">22>23<26>27""]",true,[0],"[""62"", ""0"", ""0""]"
"""genome""",35,""">22>27""","""GG""","""GT""",60.0,[],,4,0.04,100,100,0,,"["">22>23>25>27"", "">22<24>25>27"", "">22>23<26>27""]",true,[0],"[""62"", ""0"", ""0""]"
"""genome""",38,""">27>30""","""G""","""T""",60.0,[],,2,0.020202,99,99,0,,"["">27>28>30"", "">27<29>30""]",true,[0],"[""61"", ""0""]"
"""genome""",46,""">33>36""","""T""","""C""",60.0,[],,2,0.02,100,100,0,,"["">33>34>36"", "">33<35>36""]",true,[0],"[""63"", ""0""]"
"""genome""",51,""">36>39""","""A""","""C""",60.0,[],,2,0.020202,99,99,0,,"["">36>37>39"", "">36<38>39""]",true,[0],"[""68"", ""0""]"


### Merge

In [19]:
# An empty list is passed for the INFO fields; only the GT sample field is requested.
info_fields = []

# `alts` holds a list per record. Lists are unhashable and cannot serve as a
# merge key, so each one is converted to a tuple as part of loading.
# NOTE(review): the first path ends in .vcf and the second in .vcf.gz — confirm
# that both files exist under these names.
df1 = read_vcf_as_pandas(
    "data/DRR259112.ref.snpeff.vcf",
    info_fields=info_fields,
    sample_fields=["GT"],
).assign(alts=lambda frame: frame["alts"].apply(tuple))
df2 = read_vcf_as_pandas(
    "data/DRR259113.ref.snpeff.vcf.gz",
    info_fields=info_fields,
    sample_fields=["GT"],
).assign(alts=lambda frame: frame["alts"].apply(tuple))

In [21]:
# Inner-join the two runs on variant identity (chrom, pos, ref, alts). The
# per-file qual/id/filters columns are dropped from both sides first, leaving
# only the key columns and each run's sample columns in the result.
df1.drop(columns=["qual", "id", "filters"]).merge(
    df2.drop(columns=["qual", "id", "filters"]),
    how="inner",
    on=["chrom", "pos", "ref", "alts"],
)

Unnamed: 0,chrom,pos,ref,alts,DRR259112.phased,DRR259112.GT,DRR259113.phased,DRR259113.GT
0,NC_045512.2,2662,C,"(T,)",False,"[1, 1]",False,"[1, 1]"
1,NC_045512.2,8782,C,"(T,)",False,"[0, 1]",False,"[1, 1]"
2,NC_045512.2,28144,T,"(C,)",False,"[0, 1]",False,"[1, 1]"
3,NC_045512.2,29095,C,"(T,)",False,"[0, 1]",False,"[1, 1]"


### Consolidate

In [None]:
# One annotated VCF per SRA run; every URI follows the same
# <bucket>/vcf/<accession>/<accession>.ref.snpeff.vcf layout, so the list is
# generated from the run accessions.
vcf_files = [
    f"s3://sra-pub-sars-cov2/vcf/{accession}/{accession}.ref.snpeff.vcf"
    for accession in (
        "DRR259112",
        "DRR259113",
        "DRR259114",
        "DRR272391",
        "DRR272392",
        "DRR272393",
        "DRR272394",
        "DRR272395",
        "DRR272396",
        "DRR272397",
    )
]


def load_chunk(uri: str) -> pd.DataFrame:
    """Load one VCF and normalise its columns so frames from different runs line up.

    Parameters
    ----------
    uri : str
        Location of the VCF. The second-to-last ``/``-separated component is
        used as the run accession, e.g.
        ``.../DRR259112/DRR259112.ref.snpeff.vcf`` -> ``DRR259112``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``read_vcf_as_pandas`` (INFO fields ``DP`` and
        ``ExcessHet``, sample field ``GT``) with a leading ``run`` column
        holding the accession, the ``id`` column removed, and the
        ``<accession>.GT`` / ``<accession>.phased`` columns renamed to
        ``GT`` / ``phased``.
    """
    df = read_vcf_as_pandas(uri, info_fields=["DP", "ExcessHet"], sample_fields=["GT"])

    # Add a column for the run accession as the first column
    sra_accession = uri.split("/")[-2]
    df.insert(0, "run", sra_accession)

    # IDs are not annotated in these files so we drop them. The accession
    # prefix is stripped from the sample columns so that every run ends up
    # with the same column names.
    return df.drop(columns=["id"]).rename(
        columns={
            f"{sra_accession}.GT": "GT",
            f"{sra_accession}.phased": "phased",
        }
    )


# Wrap the loader as a delayed function once and apply it to every URI; each
# resulting delayed object becomes one partition of the dask DataFrame.
chunks = list(map(dask.delayed(load_chunk), vcf_files))

# No `meta` is supplied, so dask has to compute the first chunk here to infer
# the column names and dtypes; the remaining chunks stay lazy.
df = dd.from_delayed(chunks)

In [None]:
# Row count of each partition. Every delayed load_chunk call is one partition,
# so this gives the number of records loaded from each VCF file.
df.map_partitions(len).compute()

In [None]:
# Materialise only the first partition as a pandas DataFrame to inspect the
# normalised columns.
df.partitions[0].compute()

In [None]:
# Coalesce the per-file partitions into two and write them out as a Parquet
# dataset, without the index. The partition count is passed by keyword because
# the first positional parameter of `repartition` is `divisions`.
df.repartition(npartitions=2).to_parquet("testdir.parquet", write_index=False)

In [None]:
# Read the whole Parquet dataset back eagerly with pandas.
pd.read_parquet("testdir.parquet")

In [None]:
# Open the same dataset lazily with dask and materialise only its first partition.
dd.read_parquet("testdir.parquet").partitions[0].compute()