Skip to content

Commit

Permalink
22 failing tests
Browse files Browse the repository at this point in the history
  • Loading branch information
jrm5100 committed Oct 7, 2020
1 parent 6e6e4f7 commit 6a4f4ca
Show file tree
Hide file tree
Showing 12 changed files with 182 additions and 60 deletions.
3 changes: 2 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ exclude =
dist,
versioneer.py,
plunk/_version.py,
docs/source/conf.py
docs/conf.py
tests/conftest.py
max-line-length = 160
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,4 @@
'relations.html', # needs 'show_related': True theme option to display
'searchbox.html',
]
}
}
6 changes: 3 additions & 3 deletions pandas_genomics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .dtypes.scalars import Genotype, Variant
from .dtypes.genotype import GenotypeDtype, GenotypeArray
from .scalars import Genotype, Variant
from .arrays.genotype_array import GenotypeDtype, GenotypeArray

__all__ = [
'Genotype',
Expand All @@ -9,4 +9,4 @@
]

# Simple version tracking for now until Poetry has a solution
__version__ = "v0.1.0"
__version__ = "v0.1.0"
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@

import numpy as np
import pandas as pd
from pandas.core.arrays import ExtensionArray, BooleanArray
from pandas.core.arrays import ExtensionArray, BooleanArray, IntegerArray
from pandas.core.dtypes.dtypes import register_extension_dtype, PandasExtensionDtype
from pandas.core.dtypes.inference import is_list_like

from .scalars import Variant, Genotype
from pandas_genomics.scalars import Variant, Genotype


@register_extension_dtype
Expand Down Expand Up @@ -64,7 +64,7 @@ def name(self) -> str:
# ----
def __init__(self, variant: Optional[Variant] = None):
if variant is None:
variant = Variant.get_anonymous()
variant = Variant()
self.variant = variant

# ExtensionDtype Methods
Expand Down Expand Up @@ -295,7 +295,7 @@ def _from_sequence(cls, scalars, dtype: Optional[GenotypeDtype] = None, copy: bo
Optional GenotypeDtype that must be compatible with the Genotypes
copy : boolean, default False
If True, copy the underlying data.
Returns
-------
GenotypeArray
Expand Down Expand Up @@ -328,7 +328,7 @@ def _from_sequence(cls, scalars, dtype: Optional[GenotypeDtype] = None, copy: bo
@classmethod
def _from_sequence_of_strings(cls, strings, dtype, copy: bool = False):
"""Construct a new ExtensionArray from a sequence of strings.
.. versionadded:: 0.24.0
Parameters
----------
strings : Sequence
Expand All @@ -337,6 +337,7 @@ def _from_sequence_of_strings(cls, strings, dtype, copy: bool = False):
GenotypeDtype with variant information used to process the strings
copy : boolean, default False
If True, copy the underlying data.
Returns
-------
GenotypeArray
Expand Down Expand Up @@ -414,9 +415,27 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Union[Genotype, 'Genot
if isinstance(value, list):
# Convert list to genotype array, throwing an error if it doesn't work
value = self._from_sequence(value)
# Handle pandas BooleanArray

# Validate the key
if isinstance(key, List):
key = pd.Series(key)
if key.isna().sum() > 0:
raise ValueError("Cannot index with an integer indexer containing NA values")
if isinstance(key, BooleanArray):
# Convert to a normal boolean array after making NaN rows False
key = key.fillna(False).astype('bool')
# Handle pandas IntegerArray
if isinstance(key, IntegerArray):
if key.isna().sum() > 0:
# Raise an error if there are any NA values
raise ValueError("Cannot index with an integer indexer containing NA values")
else:
# Convert to a regular numpy array of ints
key = key.astype('int')
# Ensure a mask doesn't have an incorrect length
if isinstance(key, np.ndarray) and key.dtype == 'bool':
if len(key) != len(self):
raise IndexError("wrong length")
# Update allele values directly
if isinstance(value, Genotype):
self._data[key] = (value.allele1, value.allele2)
Expand Down Expand Up @@ -489,19 +508,19 @@ def unique(self) -> 'GenotypeArray':
return GenotypeArray(values=unique_data, dtype=self.dtype)

def value_counts(self, dropna=True):
raise NotImplementedError()

#def _values_for_factorize(self):
# return self.astype(object), self.variant.make_genotype()
"""Return a Series of unique counts with a GenotypeArray index"""
unique_data, unique_counts = np.unique(self._data, return_counts=True)
result = pd.Series(unique_counts, index=GenotypeArray(values=unique_data, dtype=self.dtype))
if dropna:
result = result.loc[result.index != self.dtype.na_value]
return result

def astype(self, dtype, copy=True):
if isinstance(dtype, GenotypeDtype) or isinstance(dtype, object):
if isinstance(dtype, GenotypeDtype):
if copy:
return self.copy()
else:
return self
else:
raise ValueError(f"Can't coerce GenotypeArray to 'dtype'")
self = self.copy()
return self
return super(GenotypeArray, self).astype(dtype)

def isna(self):
"""
Expand All @@ -513,9 +532,11 @@ def isna(self):
def _concat_same_type(cls, to_concat):
"""
Concatenate multiple arrays
Parameters
----------
to_concat : sequence of this type
Returns
-------
ExtensionArray
Expand Down
3 changes: 2 additions & 1 deletion pandas_genomics/io.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path
import pandas as pd
from .dtypes.genotype import GenotypeDtype, GenotypeArray, Variant
from .arrays import GenotypeDtype, GenotypeArray
from .scalars import Variant


def from_plink(bed_file: str):
Expand Down
118 changes: 85 additions & 33 deletions pandas_genomics/dtypes/scalars.py → pandas_genomics/scalars.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,64 @@
from typing import List, Optional
from dataclasses import dataclass, field
from typing import Optional, List


@dataclass(order=True)
class Variant:
"""
Information about a variant.
Parameters
----------
chromosome: str
coordinate: int
(1-based, 0 for none/unknown)
variant_id: str
alleles: List[str]
List of possible alleles
chromosome: str, optional
None by default, though this is usually not desired.
coordinate: int, optional
(1-based, the default is 0 for none/unknown)
variant_id: str, optional
None by default
alleles: List[str], optional
List of possible alleles, empty by default
Examples
--------
>>> variant = Variant('12', 112161652, 'rs12462', alleles=['C', 'T'])
>>> print(variant)
rs12462[chr=12;pos=112161652;2 alleles]
"""
# Order by chromosome then coordinate for sorting reasons
chromosome: str
coordinate: int
variant_id: str
alleles: List[str] = field(default_factory=list)
def __init__(self,
chromosome: Optional[str] = None,
coordinate: int = 0,
variant_id: Optional[str] = None,
alleles: Optional[List[str]] = None):
self.chromosome = chromosome
self.coordinate = coordinate
self.variant_id = variant_id
# Start with standard alleles - any additional (ins/del) are added to the end for sorting purposes
self.alleles = ['A', 'C', 'G', 'T']
if alleles is not None:
self.alleles += [a for a in alleles if a not in self.alleles]

def __post_init__(self):
# Validate the passed parameters
if ';' in self.variant_id or ',' in self.variant_id:
raise ValueError(f"The variant_id cannot contain ';' or ',': '{self.variant_id}'")
if ';' in self.chromosome or ',' in self.chromosome:
if self.chromosome is not None and (';' in self.chromosome or ',' in self.chromosome):
raise ValueError(f"The chromosome cannot contain ';' or ',': '{self.chromosome}'")
if len(self.alleles) > 255:
raise ValueError(f"{len(self.alleles):,} alleles were provided, the maximum supported number is 255.")
if self.coordinate > ((2 ** 31) - 2):
raise ValueError(f"The coordinate value may not exceed 2^31-2, {self.coordinate:,} was specified")
if self.variant_id is not None and (';' in self.variant_id or ',' in self.variant_id):
raise ValueError(f"The variant_id cannot contain ';' or ',': '{self.variant_id}'")
if len(self.alleles) > 255:
raise ValueError(f"{len(self.alleles):,} alleles were provided, the maximum supported number is 255.")

def __str__(self):
    """Return a compact summary of the form ``id[chr=...;pos=...;N alleles]``."""
    allele_count = len(self.alleles)
    location = f"chr={self.chromosome};pos={self.coordinate}"
    return f"{self.variant_id}[{location};{allele_count} alleles]"

def __repr__(self):
    """Return an unambiguous, constructor-style representation.

    Every field is rendered with ``!r`` so that string values are quoted
    (``chromosome='12'`` rather than ``chromosome=12``) and ``None`` can be
    told apart from the string ``'None'``. Fields are separated by ", "
    consistently.
    """
    return (
        f"Variant(chromosome={self.chromosome!r}, coordinate={self.coordinate!r}, "
        f"variant_id={self.variant_id!r}, alleles={self.alleles!r})"
    )

def __eq__(self, other):
    """Return True when every identifying field matches ``other``.

    Objects of a different class are not compared; ``NotImplemented`` is
    returned so Python can fall back to its default handling.
    """
    if other.__class__ is not self.__class__:
        return NotImplemented
    compared_fields = ("chromosome", "coordinate", "variant_id", "alleles")
    # Build the full list first so every field is compared, then reduce.
    matches = [getattr(self, name) == getattr(other, name) for name in compared_fields]
    return all(matches)

def add_allele(self, allele):
"""
Expand All @@ -58,9 +79,6 @@ def add_allele(self, allele):
raise ValueError(f"Couldn't add new allele to {self}, 255 alleles max.")
print(self.alleles)

def __str__(self):
return f"{self.variant_id}[chr={self.chromosome};pos={self.coordinate};{len(self.alleles)} alleles]"

def get_allele_idx(self, allele: Optional[str], add: bool = False) -> int:
"""
Get the integer value for an allele based on this variant's list of alleles,
Expand Down Expand Up @@ -227,12 +245,7 @@ def make_genotype_from_plink_bits(self, plink_bits: str) -> 'Genotype':

return Genotype(self, a1, a2)

@classmethod
def get_anonymous(cls):
return cls(chromosome="N/A", coordinate=0, variant_id="<ANONYMOUS>", alleles=[])


@dataclass(order=True)
class Genotype:
"""
Genotype information associated with a specific variant.
Expand All @@ -241,7 +254,7 @@ class Genotype:
Parameters
----------
variant: Variant
variant: pandas_genomics.scalars.Variant
allele1: int
The first allele encoded as an index into the variant allele list
allele2: int
Expand All @@ -258,11 +271,14 @@ class Genotype:
>>> print(missing_genotype)
<Missing>
"""
variant: Variant
allele1: int = 255
allele2: int = 255
def __init__(self,
variant: Variant,
allele1: int = 255,
allele2: int = 255):
self.variant = variant
self.allele1 = allele1
self.allele2 = allele2

def __post_init__(self):
# Sort allele1 and allele2
if self.allele1 > self.allele2:
a1, a2 = self.allele2, self.allele1
Expand All @@ -282,9 +298,45 @@ def __str__(self):
elif self.allele1 != 255 and self.allele2 != 255:
return f"{self.variant.alleles[self.allele1]}/{self.variant.alleles[self.allele2]}"

def __repr__(self):
    """Return ``Genotype(variant=..., allele1=..., allele2=...)``.

    The variant is rendered with its ``str`` form; the alleles are shown
    as their stored values.
    """
    parts = (
        f"variant={self.variant}",
        f"allele1={self.allele1}",
        f"allele2={self.allele2}",
    )
    return "Genotype(" + ", ".join(parts) + ")"

def __hash__(self):
    """Return a hash derived from the ``repr`` string of this object."""
    text = repr(self)
    return hash(text)

def __eq__(self, other):
    """Return True when the variant and both allele values match ``other``.

    Objects of a different class are not compared; ``NotImplemented`` is
    returned so Python can fall back to its default handling.
    """
    if other.__class__ is not self.__class__:
        return NotImplemented
    # Build the full list first so every field is compared, then reduce.
    matches = [
        self.variant == other.variant,
        self.allele1 == other.allele1,
        self.allele2 == other.allele2,
    ]
    return all(matches)

def __lt__(self, other):
    """Return True when this genotype sorts strictly before ``other``.

    Genotypes are ordered by ``allele1`` first and ``allele2`` second.

    Returns
    -------
    bool
        Result of the comparison, or ``NotImplemented`` when ``other`` is
        not of the same class (so Python can try the reflected operation
        and raise ``TypeError`` itself instead of this method failing with
        an ``AttributeError``).

    Raises
    ------
    NotImplementedError
        If the two genotypes belong to different variants.
    """
    if other.__class__ is not self.__class__:
        return NotImplemented
    if self.variant != other.variant:
        raise NotImplementedError("Can't compare different variants")
    # Tuple comparison is lexicographic: allele1 decides, allele2 breaks ties.
    # allele_1 is always <= allele_2 within a genotype.
    return (self.allele1, self.allele2) < (other.allele1, other.allele2)

def __gt__(self, other):
    """Return True when this genotype sorts strictly after ``other``.

    Genotypes are ordered by ``allele1`` first and ``allele2`` second.

    Returns
    -------
    bool
        Result of the comparison, or ``NotImplemented`` when ``other`` is
        not of the same class (so Python can try the reflected operation
        and raise ``TypeError`` itself instead of this method failing with
        an ``AttributeError``).

    Raises
    ------
    NotImplementedError
        If the two genotypes belong to different variants.
    """
    if other.__class__ is not self.__class__:
        return NotImplemented
    if self.variant != other.variant:
        raise NotImplementedError("Can't compare different variants")
    # Tuple comparison is lexicographic: allele1 decides, allele2 breaks ties.
    # allele_1 is always <= allele_2 within a genotype.
    return (self.allele1, self.allele2) > (other.allele1, other.allele2)

def __le__(self, other):
    """Return True when ``self`` is less than or equal to ``other``.

    Built from the ``<`` and ``==`` operators; both are always evaluated.
    """
    is_less = self < other
    is_equal = self == other
    return is_less | is_equal

def __ge__(self, other):
    """Return True when ``self`` is greater than or equal to ``other``.

    Built from the ``>`` and ``==`` operators; both are always evaluated.
    """
    is_greater = self > other
    is_equal = self == other
    return is_greater | is_equal

def is_missing(self) -> bool:
"""
Returns
Expand Down
37 changes: 37 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from pandas.tests.extension.conftest import *


# Below fixtures are copied from pandas.conftest
# They could be imported, but that would require having hypothesis as a dependency
@pytest.fixture(params=[None, lambda x: x])
def sort_by_key(request):
    """
    Parametrized fixture supplying a ``key`` argument for sorting tests.

    Yields ``None`` (no key) and an identity function in turn.
    """
    key_func = request.param
    return key_func

# Each operator is followed by its reflected ("r"-prefixed) counterpart,
# e.g. "__add__", "__radd__", "__sub__", "__rsub__", ...
_all_arithmetic_operators = [
    f"__{prefix}{op}__"
    for op in ("add", "sub", "mul", "floordiv", "truediv", "pow", "mod")
    for prefix in ("", "r")
]


@pytest.fixture(params=_all_arithmetic_operators)
def all_arithmetic_operators(request):
    """
    Parametrized fixture yielding the dunder name of each common
    arithmetic operation, one per test invocation.
    """
    operator_name = request.param
    return operator_name
File renamed without changes.

0 comments on commit 6a4f4ca

Please sign in to comment.