Skip to content

Commit

Permalink
Finish VCF loading and add a test.
Browse files Browse the repository at this point in the history
  • Loading branch information
jrm5100 committed Oct 16, 2020
1 parent a5ba52f commit 061725e
Show file tree
Hide file tree
Showing 8 changed files with 334 additions and 258 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ exclude =
plunk/_version.py,
docs/conf.py
tests/conftest.py
venv
max-line-length = 160
33 changes: 23 additions & 10 deletions pandas_genomics/io/vcf.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from pathlib import Path
from typing import Union

from cyvcf2 import VCF
import pandas as pd

from ..arrays import GenotypeDtype, GenotypeArray
from ..arrays import GenotypeArray
from ..scalars import Variant


def from_vcf(filename: str, min_qual: float = 0, drop_filtered: bool = True):
def from_vcf(filename: Union[str, Path], min_qual: float = 0, drop_filtered: bool = True):
"""
Load genetic data from a VCF or BCF file into a DataFrame
Expand All @@ -29,8 +29,10 @@ def from_vcf(filename: str, min_qual: float = 0, drop_filtered: bool = True):
Examples
--------
"""
genotype_array_list = []
for vcf_variant in VCF(filename): # or VCF('some.bcf')
from cyvcf2 import VCF # Import here since installing htslib on Windows is tricky

genotype_array_dict = dict()
for var_num, vcf_variant in enumerate(VCF(filename)): # or VCF('some.bcf')
# TODO: Should FILTER or QUAL be stored in the GenotypeArray?

# Skip filtered variants unless drop_filtered is True
Expand All @@ -41,12 +43,23 @@ def from_vcf(filename: str, min_qual: float = 0, drop_filtered: bool = True):
if vcf_variant.QUAL < min_qual:
continue

variant = Variant(chromosome = vcf_variant.CHROM,
position = vcf_variant.start,
# Make variant
variant = Variant(chromosome=vcf_variant.CHROM,
position=vcf_variant.start,
id=vcf_variant.ID,
ref=vcf_variant.REF,
alt=vcf_variant.ALT)
# Make the GenotypeArray
gt_array = GenotypeArray(values=[variant.make_genotype_from_vcf_record(vcf_record)
for vcf_record in vcf_variant.genotypes])
# Make the variant name
if gt_array.variant.id is None:
var_name = f"Variant_{var_num}"
else:
var_name = gt_array.variant.id

# Save to the dict
genotype_array_dict[var_name] = gt_array

alleles = vcf_variant.gt_bases
print()

df = pd.DataFrame.from_dict(genotype_array_dict)
return df
27 changes: 26 additions & 1 deletion pandas_genomics/scalars.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
Genotype
"""

from typing import Optional, List
from typing import Optional, List, Tuple

MISSING_IDX = 255 # Integer indicating a missing allele. Each variant must have 254 alleles max.

Expand Down Expand Up @@ -289,6 +289,31 @@ def make_genotype_from_plink_bits(self, plink_bits: str) -> 'Genotype':

return Genotype(self, a1, a2)

def make_genotype_from_vcf_record(self, vcf_record: Tuple[int, int, bool]) -> 'Genotype':
"""
Create a genotype from VCF records loaded as cyvcf2.VCF().genotypes
Parameters
----------
vcf_record: List[int, int, bool]
The list is [allele1_idx, allele2_idx, is_phased] where "-1" is missing.
Returns
-------
Genotype
A Genotype based on this variant with the specified alleles
"""
a1, a2, is_phased = vcf_record
if a1 == -1:
a1 = MISSING_IDX
if a2 == -1:
a2 = MISSING_IDX

assert self.is_valid_allele_idx(a1)
assert self.is_valid_allele_idx(a1)

return Genotype(self, a1, a2)


class Genotype:
"""
Expand Down

0 comments on commit 061725e

Please sign in to comment.