Skip to content

Commit

Permalink
Update tests to separate base tests from custom method tests. Add a m…
Browse files Browse the repository at this point in the history
…ethod to change the reference allele so that plink files can be loaded with either allele as the reference. Add a parameter to limit the number of loaded variants.
  • Loading branch information
jrm5100 committed Oct 15, 2020
1 parent 3b7ea99 commit 11be084
Show file tree
Hide file tree
Showing 7 changed files with 281 additions and 197 deletions.
44 changes: 43 additions & 1 deletion pandas_genomics/arrays/genotype_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,47 @@ def __ge__(self, other):
a2_eq = self._data['allele2'] == allele2
return a1_gt | (a1_eq & a2_gt) | (a1_eq & a2_eq)

#####################
# Utility Functions #
#####################
def set_reference(self, allele: Union[str, int]) -> None:
    """
    Make the given allele the reference allele of this array's variant (in-place).

    The chosen allele is moved to position 0 of the variant's allele list and the
    previous reference allele takes over the position the chosen allele had.

    Parameters
    ----------
    allele: int or str
        The allele to promote to reference.
        Either the allele string, or the index into the variant allele list

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `allele` is an int that the variant reports as an invalid allele index,
        or if `allele` is neither exactly a str nor exactly an int.
    """
    allele_type = type(allele)

    # Exact type matches only (subclasses such as bool are rejected)
    if allele_type is not str and allele_type is not int:
        raise ValueError(f"The `allele` must be a str or int, not an instance of '{type(allele)}'")

    # Resolve both representations of the requested allele: its index and its string
    if allele_type is str:
        new_ref_idx = self.variant.get_allele_idx(allele, add=False)
        new_ref = allele
    else:
        if not self.variant.is_valid_allele_idx(allele):
            raise ValueError(f"{allele} is not a valid allele index,"
                             f" the variant has {len(self.variant.alleles)} alleles.")
        new_ref_idx = allele
        new_ref = self.variant.alleles[allele]

    # Index 0 is the reference position, so there is nothing to change
    if new_ref_idx == 0:
        return

    # NOTE(review): only the variant's allele list is reordered here; this method does not
    # touch the allele values stored in self._data - confirm that is what callers expect.
    previous_ref = self.variant.alleles[0]
    # The old reference moves into the slot the new reference occupied
    self.variant.alleles[new_ref_idx] = previous_ref
    # Rebuild the list with the new reference first, dropping the old first entry
    self.variant.alleles = [new_ref] + self.variant.alleles[1:]

######################
# Encoding Functions #
######################
Expand All @@ -666,5 +707,6 @@ def encode_additive(self) -> pd.arrays.IntegerArray:

allele_sum = self._data['allele1'] + self._data['allele2']
# Mask those > 2 which would result from a missing allele (255)
result = pd.arrays.IntegerArray(values=allele_sum, mask=(allele_sum > 2))
result = pd.arrays.IntegerArray(name=f"{self.variant.id}_{self.variant.alleles[1]}",
values=allele_sum, mask=(allele_sum > 2))
return result
27 changes: 24 additions & 3 deletions pandas_genomics/io/plink.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
from pathlib import Path
from typing import Optional

import pandas as pd
from ..arrays import GenotypeDtype, GenotypeArray
from ..scalars import Variant


def from_plink(bed_file: str):
def from_plink(bed_file: str,
swap_alleles: bool = False,
max_variants: Optional[int] = None):
"""
Load genetic data from plink files (.bed, .bim, and .fam) into a DataFrame
Load genetic data from plink files (.bed, .bim, and .fam) into a DataFrame.
Parameters
----------
bed_file: str or Path
PLINK .bed file. .bim and .fam files with the same name and location must also exist.
swap_alleles: bool
False by default, in which case "allele1" in the bim file is considered the "reference" allele.
If True, "allele2" is considered the "reference" allele.
max_variants: Optional[int]
If provided, only load this number of variants
Returns
-------
Expand Down Expand Up @@ -55,6 +64,14 @@ def from_plink(bed_file: str):
# chromosome is a category
variant_info['chromosome'] = variant_info['chromosome'].astype('category')
num_variants = len(variant_info)

# Limit num_variants
if max_variants is not None:
if max_variants < 1:
raise ValueError(f"'max_variants' set to an invalid value: {max_variants}")
else:
num_variants = max_variants

print(f"\tLoaded information for {num_variants} variants from '{bim_file.name}'")

# Load bed file (PLINK binary biallelic genotype table) and add info to the df
Expand Down Expand Up @@ -86,7 +103,11 @@ def from_plink(bed_file: str):
genotypes.extend([variant.make_genotype_from_plink_bits(bs) for bs in bitstrings])
# Remove nonexistent samples at the end
genotypes = genotypes[:num_samples]
df[f"{v_idx}_{variant_id}"] = GenotypeArray(values=genotypes, dtype=GenotypeDtype(variant))
gt_array = GenotypeArray(values=genotypes, dtype=GenotypeDtype(variant))
# Set allele2 as the reference if 'swap_alleles'
if swap_alleles:
gt_array.set_reference(a2)
df[f"{v_idx}_{variant_id}"] = gt_array
print(f"\tLoaded genotypes from '{bed_file.name}'")

# Set sample info as the index
Expand Down
99 changes: 99 additions & 0 deletions tests/genotype_array/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import random

import pytest

from pandas_genomics.arrays import GenotypeDtype, GenotypeArray
from pandas_genomics.scalars import Variant

random.seed(1855)


# Below fixtures are copied from pandas.conftest
# They could be imported, but that would require having hypothesis as a dependency
Expand Down Expand Up @@ -57,3 +64,95 @@ def all_boolean_reductions(request):
Fixture for boolean reduction names.
"""
return request.param

# Implement the required fixtures
@pytest.fixture
def dtype():
    """A GenotypeDtype built on a variant with ref 'A' and alts 'T' and 'G'."""
    return GenotypeDtype(
        variant=Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
    )


@pytest.fixture
def data():
    """Length-100 array for this type.

    * data[0] and data[1] should both be non missing
    * data[0] and data[1] should not be equal
    """
    variant = Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
    allele_choices = ['A', 'T', 'G']

    # The first two genotypes are fixed (A/T and T/T) to meet the requirements above
    genotypes = [variant.make_genotype('A', 'T'), variant.make_genotype('T', 'T')]
    # The remaining 98 are drawn with the module-level random generator
    genotypes.extend(
        variant.make_genotype(random.choice(allele_choices), random.choice(allele_choices))
        for _ in range(98)
    )
    return GenotypeArray(values=genotypes)


@pytest.fixture
def data_for_twos():
    """Length-100 array in which all the elements are two.

    Not applicable for this type, so requesting the fixture always raises NotImplementedError.
    """
    raise NotImplementedError()


@pytest.fixture
def data_missing():
    """Length-2 array with [NA, Valid]"""
    variant = Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
    # make_genotype() without alleles is used as the NA entry
    missing = variant.make_genotype()
    valid = variant.make_genotype('T', 'T')
    return GenotypeArray(values=[missing, valid])


@pytest.fixture
def data_for_sorting():
    """Length-3 array with a known sort order.

    This should be three items [B, C, A] with
    A < B < C
    """
    variant = Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
    # Built in the order A (A/A), B (A/T), C (T/T)
    smallest, middle, largest = [
        variant.make_genotype(first, second)
        for first, second in (('A', 'A'), ('A', 'T'), ('T', 'T'))
    ]
    return GenotypeArray(values=[middle, largest, smallest])


@pytest.fixture
def data_missing_for_sorting():
    """Length-3 array with a known sort order.

    This should be three items [B, NA, A] with
    A < B and NA missing.
    """
    variant = Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
    smaller = variant.make_genotype('A', 'A')
    larger = variant.make_genotype('A', 'T')
    # make_genotype() without alleles is used as the NA entry
    missing = variant.make_genotype()
    return GenotypeArray(values=[larger, missing, smaller])


@pytest.fixture
def na_cmp():
    """Binary operator for comparing NA values.

    Returns a function of two arguments whose result is truthy when both
    arguments report themselves as missing via ``is_missing()``.
    """
    def both_missing(gt1, gt2):
        return gt1.is_missing() and gt2.is_missing()

    return both_missing


@pytest.fixture
def na_value():
    """The scalar missing value for this type: a genotype made without any alleles."""
    return Variant(
        chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G']
    ).make_genotype()


@pytest.fixture
def data_for_grouping():
    """Data for factorization, grouping, and unique tests.

    Expected to be like [B, B, NA, NA, A, A, B, C]
    Where A < B < C and NA is missing
    """
    variant = Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T', 'G'])
    # Distinct values, built in the order A (A/A), B (A/T), C (T/T), NA
    by_label = {
        'A': variant.make_genotype('A', 'A'),
        'B': variant.make_genotype('A', 'T'),
        'C': variant.make_genotype('T', 'T'),
        'NA': variant.make_genotype(),
    }
    layout = ['B', 'B', 'NA', 'NA', 'A', 'A', 'B', 'C']
    return GenotypeArray([by_label[label] for label in layout])
96 changes: 96 additions & 0 deletions tests/genotype_array/test_ExtensionArray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""
Run ExtensionArray tests from Pandas on the GenotypeArray class
"""

from pandas.tests.extension import base


# Run the predefined tests
class TestCasting(base.BaseCastingTests):
    """Run the pandas BaseCastingTests suite without modification."""


class TestConstructors(base.BaseConstructorsTests):
    """Run the pandas BaseConstructorsTests suite without modification."""


class TestDtype(base.BaseDtypeTests):
    """Run the pandas BaseDtypeTests suite without modification."""


class TestGetItem(base.BaseGetitemTests):
    """Run the pandas BaseGetitemTests suite without modification."""


class TestGroupBy(base.BaseGroupbyTests):
    """Run the pandas BaseGroupbyTests suite without modification."""


class TestInterface(base.BaseInterfaceTests):
    """Run the pandas BaseInterfaceTests suite without modification."""


class TestParsing(base.BaseParsingTests):
    """Run the pandas BaseParsingTests suite without modification."""


class TestMethods(base.BaseMethodsTests):
    """Run the pandas BaseMethodsTests suite, replacing three inherited tests with no-ops."""

    def test_combine_add(self, data_repeated):
        """Addition of Genotypes isn't valid"""

    def test_searchsorted(self, data_for_sorting, as_series):
        """No-op override of the inherited test."""
        # TODO: Can't pass until it's possible to define dtype as scalar (See Pandas GH #33825)

    def test_where_series(self, data, na_value, as_frame):
        """No-op override of the inherited test."""
        # TODO: Can't pass until it's possible to define dtype as scalar (See Pandas GH #33825)


class TestMissing(base.BaseMissingTests):
    """Run the pandas BaseMissingTests suite without modification."""


# Skip ArithmeticOps since they aren't valid
# class TestArithmeticOps(base.BaseArithmeticOpsTests):
# pass


class TestComparisonOps(base.BaseComparisonOpsTests):
    """Run the pandas BaseComparisonOpsTests suite without modification."""


class TestOpsUtil(base.BaseOpsUtil):
    """Subclass pandas' BaseOpsUtil without modification."""

# No way to invert a genotype
# class TestUnaryOps(base.BaseUnaryOpsTests):
# pass


class TestPrinting(base.BasePrintingTests):
    """Run the pandas BasePrintingTests suite without modification."""


# No boolean equivalent for genotypes
# class TestBooleanReduce(base.BaseBooleanReduceTests):
# pass


class TestNoReduce(base.BaseNoReduceTests):
    """Run the pandas BaseNoReduceTests suite without modification."""


# No numeric equivalent for genotypes
# class TestNumericReduce(base.BaseNumericReduceTests):
# pass


class TestReshaping(base.BaseReshapingTests):
    """Run the pandas BaseReshapingTests suite without modification."""


class TestSetitems(base.BaseSetitemTests):
    """Run the pandas BaseSetitemTests suite without modification."""

0 comments on commit 11be084

Please sign in to comment.