Skip to content

Commit

Permalink
Start adding VCF support
Browse files Browse the repository at this point in the history
  • Loading branch information
jrm5100 committed Oct 14, 2020
1 parent 6b6c4dd commit 262b9e5
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 4 deletions.
6 changes: 3 additions & 3 deletions pandas_genomics/io/plink.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def from_plink(bed_file: str):
Returns
-------
DataFrame
Index columns include sample information.
Columns correspond to variants and rows correspond to samples.
Columns correspond to variants (named as {variant_number}_{variant ID}).
Rows correspond to samples and index columns include sample information.
Examples
--------
Expand Down Expand Up @@ -86,7 +86,7 @@ def from_plink(bed_file: str):
genotypes.extend([variant.make_genotype_from_plink_bits(bs) for bs in bitstrings])
# Remove nonexistent samples at the end
genotypes = genotypes[:num_samples]
df[variant_id] = GenotypeArray(values=genotypes, dtype=GenotypeDtype(variant))
df[f"{v_idx}_{variant_id}"] = GenotypeArray(values=genotypes, dtype=GenotypeDtype(variant))
print(f"\tLoaded genotypes from '{bed_file.name}'")

# Set sample info as the index
Expand Down
52 changes: 52 additions & 0 deletions pandas_genomics/io/vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from pathlib import Path

from cyvcf2 import VCF
import pandas as pd

from ..arrays import GenotypeDtype, GenotypeArray
from ..scalars import Variant


def from_vcf(filename: str, min_qual: float = 0, drop_filtered: bool = True):
    """
    Load genetic data from a VCF or BCF file into a DataFrame

    Parameters
    ----------
    filename: str or Path
        vcf, vcf.gz, or bcf file.
    min_qual: float (default = 0)
        Skip loading variants with less than this quality.
        Variants that have no QUAL value are not skipped by this check.
    drop_filtered: boolean (default = True)
        Skip loading variants with a FILTER value other than "PASS"

    Returns
    -------
    DataFrame
        Columns correspond to variants (named as {variant_number}_{variant ID}).
        Rows correspond to samples and index columns include sample information.

    Notes
    -----
    Work in progress: the loop below applies the FILTER/QUAL checks and builds
    a Variant for each remaining record, but the genotypes are not yet
    collected into GenotypeArrays and no DataFrame is built or returned.

    Examples
    --------
    """
    # The docstring allows a Path; hand the reader a plain string in that case.
    if isinstance(filename, Path):
        filename = str(filename)

    genotype_array_list = []
    for vcf_variant in VCF(filename):  # or VCF('some.bcf')
        # TODO: Should FILTER or QUAL be stored in the GenotypeArray?

        # Skip filtered variants unless drop_filtered is False.
        # cyvcf2 reports a passing FILTER as None.
        if drop_filtered and vcf_variant.FILTER is not None:
            continue

        # Skip variants below the minimum quality.  QUAL is None when the
        # record has no quality value; comparing None to a float would raise
        # TypeError, so such records are kept rather than crashing the load.
        qual = vcf_variant.QUAL
        if qual is not None and qual < min_qual:
            continue

        # NOTE(review): cyvcf2's `start` is 0-based whereas the VCF POS column
        # is 1-based - confirm which convention Variant expects.
        variant = Variant(
            chromosome=vcf_variant.CHROM,
            position=vcf_variant.start,
            id=vcf_variant.ID,
            ref=vcf_variant.REF,
            alt=vcf_variant.ALT,
        )

        alleles = vcf_variant.gt_bases
        # TODO: convert `alleles` into a GenotypeArray for `variant`, append it
        # to `genotype_array_list`, and assemble/return the DataFrame.

77 changes: 76 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 262b9e5

Please sign in to comment.