22 failing tests

HallLab · Oct 7, 2020 · 6a4f4ca · 6a4f4ca
1 parent 6e6e4f7
commit 6a4f4ca
Show file tree

Hide file tree

Showing 12 changed files with 182 additions and 60 deletions.
diff --git a/.flake8 b/.flake8
@@ -6,5 +6,6 @@ exclude =
     dist,
     versioneer.py,
     plunk/_version.py,
-    docs/source/conf.py
+    docs/conf.py
+    tests/conftest.py
 max-line-length = 160
diff --git a/docs/conf.py b/docs/conf.py
@@ -102,4 +102,4 @@
         'relations.html',  # needs 'show_related': True theme option to display
         'searchbox.html',
     ]
-}
+}
diff --git a/pandas_genomics/__init__.py b/pandas_genomics/__init__.py
@@ -1,5 +1,5 @@
-from .dtypes.scalars import Genotype, Variant
-from .dtypes.genotype import GenotypeDtype, GenotypeArray
+from .scalars import Genotype, Variant
+from .arrays.genotype_array import GenotypeDtype, GenotypeArray
 
 __all__ = [
     'Genotype',
@@ -9,4 +9,4 @@
 ]
 
 # Simple version tracking for now until Poetry has a solution
-__version__ = "v0.1.0"
+__version__ = "v0.1.0"
diff --git a/pandas_genomics/dtypes/__init__.py → pandas_genomics/arrays/__init__.py b/pandas_genomics/dtypes/__init__.py → pandas_genomics/arrays/__init__.py
diff --git a/pandas_genomics/dtypes/genotype.py → pandas_genomics/arrays/genotype_array.py b/pandas_genomics/dtypes/genotype.py → pandas_genomics/arrays/genotype_array.py
@@ -4,11 +4,11 @@
 
 import numpy as np
 import pandas as pd
-from pandas.core.arrays import ExtensionArray, BooleanArray
+from pandas.core.arrays import ExtensionArray, BooleanArray, IntegerArray
 from pandas.core.dtypes.dtypes import register_extension_dtype, PandasExtensionDtype
 from pandas.core.dtypes.inference import is_list_like
 
-from .scalars import Variant, Genotype
+from pandas_genomics.scalars import Variant, Genotype
 
 
 @register_extension_dtype
@@ -64,7 +64,7 @@ def name(self) -> str:
     # ----
     def __init__(self, variant: Optional[Variant] = None):
         if variant is None:
-            variant = Variant.get_anonymous()
+            variant = Variant()
         self.variant = variant
 
     # ExtensionDtype Methods
@@ -295,7 +295,7 @@ def _from_sequence(cls, scalars, dtype: Optional[GenotypeDtype] = None, copy: bo
             Optional GenotypeDtype that must be compatible with the Genotypes
         copy : boolean, default False
             If True, copy the underlying data.
-        
+
         Returns
         -------
         GenotypeArray
@@ -328,7 +328,7 @@ def _from_sequence(cls, scalars, dtype: Optional[GenotypeDtype] = None, copy: bo
     @classmethod
     def _from_sequence_of_strings(cls, strings, dtype, copy: bool = False):
         """Construct a new ExtensionArray from a sequence of strings.
-        .. versionadded:: 0.24.0
+
         Parameters
         ----------
         strings : Sequence
@@ -337,6 +337,7 @@ def _from_sequence_of_strings(cls, strings, dtype, copy: bool = False):
             GenotypeDtype with variant information used to process the strings
         copy : boolean, default False
             If True, copy the underlying data.
+
         Returns
         -------
         GenotypeArray
@@ -414,9 +415,27 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Union[Genotype, 'Genot
         if isinstance(value, list):
             # Convert list to genotype array, throwing an error if it doesn't work
             value = self._from_sequence(value)
-        # Handle pandas BooleanArray
+
+        # Validate the key
+        if isinstance(key, List):
+            key = pd.Series(key)
+            if key.isna().sum() > 0:
+                raise ValueError("Cannot index with an integer indexer containing NA values")
         if isinstance(key, BooleanArray):
+            # Convert to a normal boolean array after making NaN rows False
             key = key.fillna(False).astype('bool')
+        # Handle pandas IntegerArray
+        if isinstance(key, IntegerArray):
+            if key.isna().sum() > 0:
+                # Raise an error if there are any NA values
+                raise ValueError("Cannot index with an integer indexer containing NA values")
+            else:
+                # Convert to a regular numpy array of ints
+                key = key.astype('int')
+        # Ensure a mask doesn't have an incorrect length
+        if isinstance(key, np.ndarray) and key.dtype == 'bool':
+            if len(key) != len(self):
+                raise IndexError("wrong length")
         # Update allele values directly
         if isinstance(value, Genotype):
             self._data[key] = (value.allele1, value.allele2)
@@ -489,19 +508,19 @@ def unique(self) -> 'GenotypeArray':
         return GenotypeArray(values=unique_data, dtype=self.dtype)
 
     def value_counts(self, dropna=True):
-        raise NotImplementedError()
-
-    #def _values_for_factorize(self):
-    #    return self.astype(object), self.variant.make_genotype()
+        """Return a Series of unique counts with a GenotypeArray index"""
+        unique_data, unique_counts = np.unique(self._data, return_counts=True)
+        result = pd.Series(unique_counts, index=GenotypeArray(values=unique_data, dtype=self.dtype))
+        if dropna:
+            result = result.loc[result.index != self.dtype.na_value]
+        return result
 
     def astype(self, dtype, copy=True):
-        if isinstance(dtype, GenotypeDtype) or isinstance(dtype, object):
+        if isinstance(dtype, GenotypeDtype):
             if copy:
-                return self.copy()
-            else:
-                return self
-        else:
-            raise ValueError(f"Can't coerce GenotypeArray to 'dtype'")
+                self = self.copy()
+            return self
+        return super(GenotypeArray, self).astype(dtype)
 
     def isna(self):
         """
@@ -513,9 +532,11 @@ def isna(self):
     def _concat_same_type(cls, to_concat):
         """
         Concatenate multiple array
+
         Parameters
         ----------
         to_concat : sequence of this type
+
         Returns
         -------
         ExtensionArray

diff --git a/pandas_genomics/io.py b/pandas_genomics/io.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 import pandas as pd
-from .dtypes.genotype import GenotypeDtype, GenotypeArray, Variant
+from .arrays import GenotypeDtype, GenotypeArray
+from .scalars import Variant
 
 
 def from_plink(bed_file: str):

diff --git a/pandas_genomics/dtypes/scalars.py → pandas_genomics/scalars.py b/pandas_genomics/dtypes/scalars.py → pandas_genomics/scalars.py
@@ -1,43 +1,64 @@
-from typing import List, Optional
-from dataclasses import dataclass, field
+from typing import Optional, List
 
 
-@dataclass(order=True)
 class Variant:
     """
     Information about a variant.
 
     Parameters
     ----------
-    chromosome: str
-    coordinate: int
-        (1-based, 0 for none/unknown)
-    variant_id: str
-    alleles: List[str]
-        List of possible alleles
+    chromosome: str, optional
+        None by default, though this usually is not desired.
+    coordinate: int, optional
+        (1-based, the default is 0 for none/unknown)
+    variant_id: str, optional
+        None by default
+    alleles: List[str], optional
+        List of possible alleles, empty by default
 
     Examples
     --------
     >>> variant = Variant('12', 112161652, 'rs12462', alleles=['C', 'T'])
     >>> print(variant)
     rs12462[chr=12;pos=112161652;2 alleles]
     """
-    # Order by chromosome then coordinate for sorting reasons
-    chromosome: str
-    coordinate: int
-    variant_id: str
-    alleles: List[str] = field(default_factory=list)
+    def __init__(self,
+                 chromosome: Optional[str] = None,
+                 coordinate: int = 0,
+                 variant_id: Optional[str] = None,
+                 alleles: Optional[List[str]] = None):
+        self.chromosome = chromosome
+        self.coordinate = coordinate
+        self.variant_id = variant_id
+        # Start with standard alleles - any additional (ins/del) are added to the end for sorting purposes
+        self.alleles = ['A', 'C', 'G', 'T']
+        if alleles is not None:
+            self.alleles += [a for a in alleles if a not in self.alleles]
 
-    def __post_init__(self):
         # Validate the passed parameters
-        if ';' in self.variant_id or ',' in self.variant_id:
-            raise ValueError(f"The variant_id cannot contain ';' or ',': '{self.variant_id}'")
-        if ';' in self.chromosome or ',' in self.chromosome:
+        if self.chromosome is not None and (';' in self.chromosome or ',' in self.chromosome):
             raise ValueError(f"The chromosome cannot contain ';' or ',': '{self.chromosome}'")
-        if len(self.alleles) > 255:
-            raise ValueError(f"{len(self.alleles):,} alleles were provided, the maximum supported number is 255.")
         if self.coordinate > ((2 ** 31) - 2):
             raise ValueError(f"The coordinate value may not exceed 2^31-2, {self.coordinate:,} was specified")
+        if self.variant_id is not None and (';' in self.variant_id or ',' in self.variant_id):
+            raise ValueError(f"The variant_id cannot contain ';' or ',': '{self.variant_id}'")
+        if len(self.alleles) > 255:
+            raise ValueError(f"{len(self.alleles):,} alleles were provided, the maximum supported number is 255.")
+
+    def __str__(self):
+        return f"{self.variant_id}[chr={self.chromosome};pos={self.coordinate};{len(self.alleles)} alleles]"
+
+    def __repr__(self):
+        return f"Variant(chromosome={self.chromosome}, coordinate={self.coordinate}," \
+               f"variant_id={self.variant_id}, alleles={self.alleles})"
+
+    def __eq__(self, other):
+        if other.__class__ is not self.__class__:
+            return NotImplemented
+        return (self.chromosome == other.chromosome) &\
+               (self.coordinate == other.coordinate) &\
+               (self.variant_id == other.variant_id) & \
+               (self.alleles == other.alleles)
 
     def add_allele(self, allele):
         """
@@ -58,9 +79,6 @@ def add_allele(self, allele):
             raise ValueError(f"Couldn't add new allele to {self}, 255 alleles max.")
         print(self.alleles)
 
-    def __str__(self):
-        return f"{self.variant_id}[chr={self.chromosome};pos={self.coordinate};{len(self.alleles)} alleles]"
-
     def get_allele_idx(self, allele: Optional[str], add: bool = False) -> int:
         """
         Get the integer value for an allele based on this variant's list of alleles,
@@ -227,12 +245,7 @@ def make_genotype_from_plink_bits(self, plink_bits: str) -> 'Genotype':
 
         return Genotype(self, a1, a2)
 
-    @classmethod
-    def get_anonymous(cls):
-        return cls(chromosome="N/A", coordinate=0, variant_id="<ANONYMOUS>", alleles=[])
-
 
-@dataclass(order=True)
 class Genotype:
     """
     Genotype information associated with a specific variant.
@@ -241,7 +254,7 @@ class Genotype:
 
     Parameters
     ----------
-    variant: Variant
+    variant: pandas_genomics.scalars.Variant
     allele1: int
         The first allele encoded as an index into the variant allele list
     allele2: int
@@ -258,11 +271,14 @@ class Genotype:
     >>> print(missing_genotype)
     <Missing>
     """
-    variant: Variant
-    allele1: int = 255
-    allele2: int = 255
+    def __init__(self,
+                 variant: Variant,
+                 allele1: int = 255,
+                 allele2: int = 255):
+        self.variant = variant
+        self.allele1 = allele1
+        self.allele2 = allele2
 
-    def __post_init__(self):
         # Sort allele1 and allele2
         if self.allele1 > self.allele2:
             a1, a2 = self.allele2, self.allele1
@@ -282,9 +298,45 @@ def __str__(self):
         elif self.allele1 != 255 and self.allele2 != 255:
             return f"{self.variant.alleles[self.allele1]}/{self.variant.alleles[self.allele2]}"
 
+    def __repr__(self):
+        return f"Genotype(variant={self.variant}, allele1={self.allele1}, allele2={self.allele2})"
+
     def __hash__(self):
         return hash(repr(self))
 
+    def __eq__(self, other):
+        if other.__class__ is not self.__class__:
+            return NotImplemented
+        return (self.variant == other.variant) & (self.allele1 == other.allele1) & (self.allele2 == other.allele2)
+
+    def __lt__(self, other):
+        if self.variant != other.variant:
+            raise NotImplementedError("Can't compare different variants")
+        else:
+            # Compare allele index values for sorting
+            # allele1 is always <= allele2 within a genotype
+            a1_lt = self.allele1 < other.allele1
+            a1_eq = self.allele1 == other.allele1
+            a2_lt = self.allele2 < other.allele2
+            return a1_lt | (a1_eq & a2_lt)
+
+    def __gt__(self, other):
+        if self.variant != other.variant:
+            raise NotImplementedError("Can't compare different variants")
+        else:
+            # Compare allele index values for sorting
+            # allele1 is always <= allele2 within a genotype
+            a1_gt = self.allele1 > other.allele1
+            a1_eq = self.allele1 == other.allele1
+            a2_gt = self.allele2 > other.allele2
+            return a1_gt | (a1_eq & a2_gt)
+
+    def __le__(self, other):
+        return (self < other) | (self == other)
+
+    def __ge__(self, other):
+        return (self > other) | (self == other)
+
     def is_missing(self) -> bool:
         """
         Returns

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,37 @@
+from pandas.tests.extension.conftest import *
+
+
+# The fixtures below are copied from pandas.conftest
+# They could be imported, but that would require having hypothesis as a dependency
+@pytest.fixture(params=[None, lambda x: x])
+def sort_by_key(request):
+    """
+    Simple fixture for testing keys in sorting methods.
+    Tests None (no key) and the identity key.
+    """
+    return request.param
+
+_all_arithmetic_operators = [
+    "__add__",
+    "__radd__",
+    "__sub__",
+    "__rsub__",
+    "__mul__",
+    "__rmul__",
+    "__floordiv__",
+    "__rfloordiv__",
+    "__truediv__",
+    "__rtruediv__",
+    "__pow__",
+    "__rpow__",
+    "__mod__",
+    "__rmod__",
+]
+
+
+@pytest.fixture(params=_all_arithmetic_operators)
+def all_arithmetic_operators(request):
+    """
+    Fixture for dunder names for common arithmetic operations.
+    """
+    return request.param
diff --git a/tests/genotype/conftest.py → tests/genotype_array/conftest.py b/tests/genotype/conftest.py → tests/genotype_array/conftest.py