Update tests to separate base tests from custom method tests. Add a method to change the reference allele so that plink files can be loaded with either allele as the reference. Add a parameter to limit the number of loaded variants.
HallLab · Oct 15, 2020 · 11be084 · 11be084
1 parent 3b7ea99
commit 11be084
Show file tree

Hide file tree

Showing 7 changed files with 281 additions and 197 deletions.
diff --git a/pandas_genomics/arrays/genotype_array.py b/pandas_genomics/arrays/genotype_array.py
@@ -645,6 +645,47 @@ def __ge__(self, other):
         a2_eq = self._data['allele2'] == allele2
         return a1_gt | (a1_eq & a2_gt) | (a1_eq & a2_eq)
 
+    #####################
+    # Utility Functions #
+    #####################
+    def set_reference(self, allele: Union[str, int]) -> None:
+        """
+        Change the reference allele (in-place) by specifying an allele index value or an allele string.
+
+        The chosen allele trades places with the current reference in the variant's allele list,
+        and the stored allele indices are remapped so that every genotype keeps referring to the
+        same alleles it did before the change.
+
+        Parameters
+        ----------
+        allele: int or str
+            The allele that will be set as the reference allele.
+            Either the allele string, or the index into the variant allele list
+
+        Returns
+        -------
+        None
+
+        Raises
+        ------
+        ValueError
+            If `allele` is neither a str nor an int, or is an int that the variant reports as an
+            invalid allele index. Handling of an unknown allele string is left to
+            `Variant.get_allele_idx`.
+        """
+        # Resolve the request to both an index and a string.
+        # bool is an int subclass, so it is excluded explicitly to keep rejecting True/False.
+        if isinstance(allele, str):
+            allele_idx = self.variant.get_allele_idx(allele, add=False)
+            allele_str = allele
+        elif isinstance(allele, int) and not isinstance(allele, bool):
+            if not self.variant.is_valid_allele_idx(allele):
+                raise ValueError(f"{allele} is not a valid allele index,"
+                                 f" the variant has {len(self.variant.alleles)} alleles.")
+            allele_idx = allele
+            allele_str = self.variant.alleles[allele]
+        else:
+            raise ValueError(f"The `allele` must be a str or int, not an instance of '{type(allele)}'")
+
+        if allele_idx == 0:
+            # Nothing to do, this is already the reference
+            return
+
+        # Swap the old and new reference in a fresh list, then rebind it, instead of mutating
+        # the existing list object before replacing it
+        alleles = list(self.variant.alleles)
+        alleles[0], alleles[allele_idx] = allele_str, alleles[0]
+        self.variant.alleles = alleles
+
+        # Remap the stored indices to match the swapped list: without this, an index of 0 would
+        # now silently point at a different allele. Both masks are computed before either write
+        # so the two assignments cannot interfere; any other value (e.g. the missing marker) is
+        # left untouched.
+        # NOTE(review): assumes the `_data` fields support boolean-mask assignment (e.g. a numpy
+        # structured array) - confirm. If genotypes must keep allele1 <= allele2, re-sort here.
+        for field in ('allele1', 'allele2'):
+            idxs = self._data[field]
+            was_ref, was_new_ref = idxs == 0, idxs == allele_idx
+            idxs[was_ref] = allele_idx
+            idxs[was_new_ref] = 0
+
     ######################
     # Encoding Functions #
     ######################
@@ -666,5 +707,6 @@ def encode_additive(self) -> pd.arrays.IntegerArray:
 
         allele_sum = self._data['allele1'] + self._data['allele2']
         # Mask those > 2 which would result from a missing allele (255)
-        result = pd.arrays.IntegerArray(values=allele_sum, mask=(allele_sum > 2))
+        result = pd.arrays.IntegerArray(name=f"{self.variant.id}_{self.variant.alleles[1]}",
+                                        values=allele_sum, mask=(allele_sum > 2))
         return result
diff --git a/pandas_genomics/io/plink.py b/pandas_genomics/io/plink.py
@@ -1,17 +1,26 @@
 from pathlib import Path
+from typing import Optional
+
 import pandas as pd
 from ..arrays import GenotypeDtype, GenotypeArray
 from ..scalars import Variant
 
 
-def from_plink(bed_file: str):
+def from_plink(bed_file: str,
+               swap_alleles: bool = False,
+               max_variants: Optional[int] = None):
     """
-    Load genetic data from plink files (.bed, .bim, and .fam) into a DataFrame
+    Load genetic data from plink files (.bed, .bim, and .fam) into a DataFrame.
 
     Parameters
     ----------
     bed_file: str or Path
         PLINK .bed file.  .bim and .fam files with the same name and location must also exist.
+    swap_alleles: bool
+        False by default, in which case "allele1" in the bim file is considered the "reference" allele.
+        If True, "allele2" is considered the "reference" allele.
+    max_variants: Optional[int]
+        If provided, only load this number of variants
 
     Returns
     -------
@@ -55,6 +64,14 @@ def from_plink(bed_file: str):
     # chromosome is a category
     variant_info['chromosome'] = variant_info['chromosome'].astype('category')
     num_variants = len(variant_info)
+
+    # Limit num_variants
+    if max_variants is not None:
+        if max_variants < 1:
+            raise ValueError(f"'max_variants' set to an invalid value: {max_variants}")
+        else:
+            num_variants = max_variants
+
     print(f"\tLoaded information for {num_variants} variants from '{bim_file.name}'")
 
     # Load bed file (PLINK binary biallelic genotype table) and add info to the df
@@ -86,7 +103,11 @@ def from_plink(bed_file: str):
                 genotypes.extend([variant.make_genotype_from_plink_bits(bs) for bs in bitstrings])
             # Remove nonexistent samples at the end
             genotypes = genotypes[:num_samples]
-            df[f"{v_idx}_{variant_id}"] = GenotypeArray(values=genotypes, dtype=GenotypeDtype(variant))
+            gt_array = GenotypeArray(values=genotypes, dtype=GenotypeDtype(variant))
+            # Set allele2 as the reference if 'swap_alleles'
+            if swap_alleles:
+                gt_array.set_reference(a2)
+            df[f"{v_idx}_{variant_id}"] = gt_array
     print(f"\tLoaded genotypes from '{bed_file.name}'")
 
     # Set sample info as the index

diff --git a/tests/genotype_array/conftest.py b/tests/genotype_array/conftest.py
@@ -1,5 +1,12 @@
+import random
+
 import pytest
 
+from pandas_genomics.arrays import GenotypeDtype, GenotypeArray
+from pandas_genomics.scalars import Variant
+
+# Seed the global `random` generator with a fixed value when this conftest is imported
+random.seed(1855)
+
 
 # Below fixtures are copied from pandas.conftest
 # They could be imported, but that would require having hypothesis as a dependency
@@ -57,3 +64,95 @@ def all_boolean_reductions(request):
     Fixture for boolean reduction names.
     """
     return request.param
+
+# Implement the required fixtures
+@pytest.fixture
+def dtype():
+    """GenotypeDtype built on the three-allele variant rs12345 (ref 'A', alt 'T' and 'G')."""
+    variant_kwargs = dict(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
+    return GenotypeDtype(variant=Variant(**variant_kwargs))
+
+
+@pytest.fixture
+def data():
+    """Length-100 array for this type.
+
+    * data[0] and data[1] are both non missing
+    * data[0] and data[1] are not equal
+
+    The 98 remaining genotypes are drawn from a generator local to this fixture, so every
+    test receives the same array regardless of which tests ran (and consumed random numbers)
+    before it.
+    """
+    rng = random.Random(1855)
+    alleles = ['A', 'T', 'G']
+    variant = Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
+    # Fixed first two elements satisfy the non-missing / not-equal requirements above
+    genotypes = [variant.make_genotype('A', 'T'), variant.make_genotype('T', 'T')]
+    genotypes.extend(variant.make_genotype(rng.choice(alleles), rng.choice(alleles)) for _ in range(98))
+    return GenotypeArray(values=genotypes)
+
+
+@pytest.fixture
+def data_for_twos():
+    """Placeholder for the length-100 array of twos, which is not applicable to this type."""
+    # Deliberately unimplemented: requesting this fixture raises
+    raise NotImplementedError()
+
+
+@pytest.fixture
+def data_missing():
+    """Two-element array: an NA genotype followed by a valid 'T'/'T' genotype."""
+    rs12345 = Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
+    na = rs12345.make_genotype()
+    valid = rs12345.make_genotype('T', 'T')
+    return GenotypeArray(values=[na, valid])
+
+
+@pytest.fixture
+def data_for_sorting():
+    """Three genotypes given in the order [B, C, A], where A < B < C.
+
+    A is 'A'/'A', B is 'A'/'T' and C is 'T'/'T'.
+    """
+    rs12345 = Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
+    by_label = {
+        'A': rs12345.make_genotype('A', 'A'),
+        'B': rs12345.make_genotype('A', 'T'),
+        'C': rs12345.make_genotype('T', 'T'),
+    }
+    return GenotypeArray(values=[by_label[label] for label in ('B', 'C', 'A')])
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    """Three genotypes given in the order [B, NA, A], where A < B and NA is missing.
+
+    A is 'A'/'A' and B is 'A'/'T'.
+    """
+    rs12345 = Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
+    by_label = {
+        'A': rs12345.make_genotype('A', 'A'),
+        'B': rs12345.make_genotype('A', 'T'),
+        'NA': rs12345.make_genotype(),
+    }
+    return GenotypeArray(values=[by_label[label] for label in ('B', 'NA', 'A')])
+
+
+@pytest.fixture
+def na_cmp():
+    """Comparison used to decide whether two scalars are both NA.
+
+    Returns a two-argument callable that is true only when each genotype reports
+    itself as missing.
+    """
+    def both_missing(gt1, gt2):
+        return gt1.is_missing() and gt2.is_missing()
+
+    return both_missing
+
+
+@pytest.fixture
+def na_value():
+    """Scalar missing value for this type: a genotype of rs12345 made with no alleles given."""
+    rs12345 = Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
+    missing_genotype = rs12345.make_genotype()
+    return missing_genotype
+
+
+@pytest.fixture
+def data_for_grouping():
+    """Array for factorization, grouping, and unique tests.
+
+    Laid out as [B, B, NA, NA, A, A, B, C], where A < B < C and NA is missing.
+    A is 'A'/'A', B is 'A'/'T' and C is 'T'/'T'.
+    """
+    rs12345 = Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
+    # Each distinct genotype is created once and reused wherever its label repeats
+    by_label = {
+        'A': rs12345.make_genotype('A', 'A'),
+        'B': rs12345.make_genotype('A', 'T'),
+        'C': rs12345.make_genotype('T', 'T'),
+        'NA': rs12345.make_genotype(),
+    }
+    layout = ('B', 'B', 'NA', 'NA', 'A', 'A', 'B', 'C')
+    return GenotypeArray([by_label[label] for label in layout])
diff --git a/tests/genotype_array/test_ExtensionArray.py b/tests/genotype_array/test_ExtensionArray.py
@@ -0,0 +1,96 @@
+"""
+Run ExtensionArray tests from Pandas on the GenotypeArray class
+"""
+
+from pandas.tests.extension import base
+
+
+# Run the predefined tests
+class TestCasting(base.BaseCastingTests):
+    """Inherit the pandas BaseCastingTests suite without overrides."""
+
+
+class TestConstructors(base.BaseConstructorsTests):
+    """Inherit the pandas BaseConstructorsTests suite without overrides."""
+
+
+class TestDtype(base.BaseDtypeTests):
+    """Inherit the pandas BaseDtypeTests suite without overrides."""
+
+
+class TestGetItem(base.BaseGetitemTests):
+    """Inherit the pandas BaseGetitemTests suite without overrides."""
+
+
+class TestGroupBy(base.BaseGroupbyTests):
+    """Inherit the pandas BaseGroupbyTests suite without overrides."""
+
+
+class TestInterface(base.BaseInterfaceTests):
+    """Inherit the pandas BaseInterfaceTests suite without overrides."""
+
+
+class TestParsing(base.BaseParsingTests):
+    """Inherit the pandas BaseParsingTests suite without overrides."""
+
+
+class TestMethods(base.BaseMethodsTests):
+    """Inherit the pandas BaseMethodsTests suite, skipping the tests that cannot apply.
+
+    The overrides skip explicitly rather than using an empty body, so the test report
+    lists them as skipped instead of counting them as passing.
+    """
+
+    def test_combine_add(self, data_repeated):
+        """Addition of Genotypes isn't valid"""
+        import pytest  # local import keeps this override self-contained
+        pytest.skip("Addition of Genotypes isn't valid")
+
+    def test_searchsorted(self, data_for_sorting, as_series):
+        # TODO: Can't pass until it's possible to define dtype as scalar (See Pandas GH #33825)
+        import pytest  # local import keeps this override self-contained
+        pytest.skip("Can't pass until it's possible to define dtype as scalar (See Pandas GH #33825)")
+
+    def test_where_series(self, data, na_value, as_frame):
+        # TODO: Can't pass until it's possible to define dtype as scalar (See Pandas GH #33825)
+        import pytest  # local import keeps this override self-contained
+        pytest.skip("Can't pass until it's possible to define dtype as scalar (See Pandas GH #33825)")
+
+
+class TestMissing(base.BaseMissingTests):
+    """Inherit the pandas BaseMissingTests suite without overrides."""
+
+
+# Skip ArithmeticOps since they aren't valid
+# class TestArithmeticOps(base.BaseArithmeticOpsTests):
+#     pass
+
+
+class TestComparisonOps(base.BaseComparisonOpsTests):
+    """Inherit the pandas BaseComparisonOpsTests suite without overrides."""
+
+
+class TestOpsUtil(base.BaseOpsUtil):
+    """Inherit pandas' BaseOpsUtil without overrides."""
+
+# No way to invert a genotype
+# class TestUnaryOps(base.BaseUnaryOpsTests):
+#     pass
+
+
+class TestPrinting(base.BasePrintingTests):
+    """Inherit the pandas BasePrintingTests suite without overrides."""
+
+
+# No boolean equivalent for genotypes
+# class TestBooleanReduce(base.BaseBooleanReduceTests):
+#     pass
+
+
+class TestNoReduce(base.BaseNoReduceTests):
+    """Inherit the pandas BaseNoReduceTests suite without overrides."""
+
+
+# No numeric equivalent for genotypes
+# class TestNumericReduce(base.BaseNumericReduceTests):
+#     pass
+
+
+class TestReshaping(base.BaseReshapingTests):
+    """Inherit the pandas BaseReshapingTests suite without overrides."""
+
+
+class TestSetitems(base.BaseSetitemTests):
+    """Inherit the pandas BaseSetitemTests suite without overrides."""