Finish VCF loading and add a test.

HallLab · Oct 16, 2020 · 061725e · 061725e
1 parent a5ba52f
commit 061725e
Show file tree

Hide file tree

Showing 8 changed files with 334 additions and 258 deletions.
diff --git a/.flake8 b/.flake8
@@ -8,4 +8,5 @@ exclude =
     plunk/_version.py,
     docs/conf.py
     tests/conftest.py
+    venv
 max-line-length = 160
diff --git a/pandas_genomics/io/vcf.py b/pandas_genomics/io/vcf.py
@@ -1,13 +1,13 @@
 from pathlib import Path
+from typing import Union
 
-from cyvcf2 import VCF
 import pandas as pd
 
-from ..arrays import GenotypeDtype, GenotypeArray
+from ..arrays import GenotypeArray
 from ..scalars import Variant
 
 
-def from_vcf(filename: str, min_qual: float = 0, drop_filtered: bool = True):
+def from_vcf(filename: Union[str, Path], min_qual: float = 0, drop_filtered: bool = True):
     """
     Load genetic data from a VCF or BCF file into a DataFrame
 
@@ -29,8 +29,10 @@ def from_vcf(filename: str, min_qual: float = 0, drop_filtered: bool = True):
     Examples
     --------
     """
-    genotype_array_list = []
-    for vcf_variant in VCF(filename):  # or VCF('some.bcf')
+    from cyvcf2 import VCF  # Import here since installing htslib on Windows is tricky
+
+    genotype_array_dict = dict()
+    for var_num, vcf_variant in enumerate(VCF(filename)):  # or VCF('some.bcf')
         # TODO: Should FILTER or QUAL be stored in the GenotypeArray?
 
         # Skip filtered variants unless drop_filtered is True
@@ -41,12 +43,23 @@ def from_vcf(filename: str, min_qual: float = 0, drop_filtered: bool = True):
         if vcf_variant.QUAL < min_qual:
             continue
 
-        variant = Variant(chromosome = vcf_variant.CHROM,
-                          position = vcf_variant.start,
+        # Make variant
+        variant = Variant(chromosome=vcf_variant.CHROM,
+                          position=vcf_variant.start,
                           id=vcf_variant.ID,
                           ref=vcf_variant.REF,
                           alt=vcf_variant.ALT)
+        # Make the GenotypeArray
+        gt_array = GenotypeArray(values=[variant.make_genotype_from_vcf_record(vcf_record)
+                                         for vcf_record in vcf_variant.genotypes])
+        # Make the variant name
+        if gt_array.variant.id is None:
+            var_name = f"Variant_{var_num}"
+        else:
+            var_name = gt_array.variant.id
+
+        # Save to the dict
+        genotype_array_dict[var_name] = gt_array
 
-        alleles = vcf_variant.gt_bases
-        print()
-
+    df = pd.DataFrame.from_dict(genotype_array_dict)
+    return df
diff --git a/pandas_genomics/scalars.py b/pandas_genomics/scalars.py
@@ -10,7 +10,7 @@
      Genotype
 """
 
-from typing import Optional, List
+from typing import Optional, List, Tuple
 
 MISSING_IDX = 255  # Integer indicating a missing allele.  Each variant must have 254 alleles max.
 
@@ -289,6 +289,31 @@ def make_genotype_from_plink_bits(self, plink_bits: str) -> 'Genotype':
 
         return Genotype(self, a1, a2)
 
+    def make_genotype_from_vcf_record(self, vcf_record: Tuple[int, int, bool]) -> 'Genotype':
+        """
+        Create a genotype from VCF records loaded as cyvcf2.VCF().genotypes
+
+        Parameters
+        ----------
+        vcf_record: List[int, int, bool]
+            The list is [allele1_idx, allele2_idx, is_phased] where "-1" is missing.
+
+        Returns
+        -------
+        Genotype
+            A Genotype based on this variant with the specified alleles
+        """
+        a1, a2, is_phased = vcf_record
+        if a1 == -1:
+            a1 = MISSING_IDX
+        if a2 == -1:
+            a2 = MISSING_IDX
+
+        assert self.is_valid_allele_idx(a1)
+        assert self.is_valid_allele_idx(a1)
+
+        return Genotype(self, a1, a2)
+
 
 class Genotype:
     """