Skip to content

Commit

Permalink
Additive and Dominant encoding are implemented and tested
Browse files Browse the repository at this point in the history
  • Loading branch information
jrm5100 committed Oct 15, 2020
1 parent edf7670 commit 7eda28a
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 5 deletions.
7 changes: 6 additions & 1 deletion pandas_genomics/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,9 @@ def set_reference(self, allele):
def encode_additive(self):
return pd.Series(data=self._array.encode_additive(),
index=self._index,
name=f"{self.array.variant.id}_{self.array.variant.alleles[1]}")
name=f"{self._array.variant.id}_{self._array.variant.alleles[1]}")

def encode_dominant(self):
    """Return the dominant encoding of the genotypes as a ``pd.Series``.

    The values come from the underlying array's ``encode_dominant()``.
    The Series reuses the index stored on this accessor and is named
    ``"<variant id>_<alleles[1]>"``.
    """
    encoded = self._array.encode_dominant()
    variant = self._array.variant
    label = f"{variant.id}_{variant.alleles[1]}"
    return pd.Series(data=encoded, index=self._index, name=label)
27 changes: 25 additions & 2 deletions pandas_genomics/arrays/genotype_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,7 @@ def encode_additive(self) -> pd.arrays.IntegerArray:
pd.arrays.IntegerArray
0 for Homozygous Reference
1 for Heterozygous
2 for Homozygous
2 for Homozygous Alt
pd.NA for missing
Raises an error if there is more than 1 alternate allele
"""
Expand All @@ -719,5 +719,28 @@ def encode_additive(self) -> pd.arrays.IntegerArray:

allele_sum = self._data['allele1'] + self._data['allele2']
# Mask those > 2 which would result from a missing allele (255)
result = pd.arrays.IntegerArray(values=allele_sum, mask=(allele_sum > 2))
result = pd.array(data=[n if n <= 2 else None for n in allele_sum],
dtype='UInt8')
return result

def encode_dominant(self) -> pd.arrays.IntegerArray:
    """
    Encode each genotype by whether it carries the alternate allele.

    Returns
    -------
    pd.arrays.IntegerArray
        0 for Homozygous Reference
        1 for Heterozygous
        1 for Homozygous Alt
        pd.NA for missing

    Raises
    ------
    ValueError
        If there is more than 1 alternate allele
    """
    # TODO: Return multiple arrays for multiple alternate alleles?
    if len(self.variant.alleles) > 2:
        raise ValueError("Dominant encoding can only be used with one allele")

    # Widen before adding.  A missing allele is stored as 255; if the allele
    # fields are 8-bit unsigned integers, summing them directly lets a
    # half-missing genotype such as 255 + 1 wrap around to 0, which would be
    # reported as homozygous reference instead of missing.
    allele_sum = (self._data['allele1'].astype('int64')
                  + self._data['allele2'].astype('int64'))
    # 0 stays 0 (homozygous ref); 1 (heterozygous) and 2 (homozygous alt)
    # both become 1; any other sum involves a missing allele and becomes NA.
    encoded = [min(n, 1) if 0 <= n <= 2 else None for n in allele_sum.tolist()]
    return pd.array(data=encoded, dtype='UInt8')
14 changes: 14 additions & 0 deletions tests/genotype_array/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,17 @@ def data_for_grouping():
c = variant.make_genotype('T', 'T')
na = variant.make_genotype()
return GenotypeArray([b, b, na, na, a, a, b, c])


@pytest.fixture
def data_for_encoding():
    """Data for encoding tests.

    Built from a single variant with one alt allele.
    The genotypes are, in order: Homozygous Ref, Heterozygous,
    Homozygous Alt, and Missing.
    """
    variant = Variant(chromosome='chr1', position=123456, id='rs12345', ref='A', alt=['T'])
    # An empty pair calls make_genotype() with no alleles (the missing genotype)
    allele_pairs = [('A', 'A'), ('A', 'T'), ('T', 'T'), ()]
    return GenotypeArray([variant.make_genotype(*pair) for pair in allele_pairs])
24 changes: 22 additions & 2 deletions tests/genotype_array/test_GenotypeArray.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,30 @@
"""
Test GenotypeArray methods
"""
import pandas as pd
import pytest


# Custom Tests
@pytest.mark.xfail(raises=ValueError)
def test_encoding_additive(data):
def test_encoding_extra_alt(data):
"""Fail encoding when there are multiple alt alleles"""
data.encode_additive()


def test_encoding_additive(data_for_encoding):
    """Additive encoding yields 0, 1, 2 and pd.NA for the encoding fixture."""
    expected = pd.array([0, 1, 2, None], dtype='UInt8')
    result = data_for_encoding.encode_additive()
    # ``(result == expected).all()`` skips positions where the comparison is
    # pd.NA, so the missing-genotype slot was never actually verified.
    # assert_series_equal treats NA as a value and also checks the dtype.
    pd.testing.assert_series_equal(pd.Series(result), pd.Series(expected))
    # Test using series accessor
    result_series = pd.Series(data_for_encoding).genotype.encode_additive()
    # The accessor names its Series after the variant, so names are not compared
    pd.testing.assert_series_equal(result_series, pd.Series(result), check_names=False)


def test_encoding_dominant(data_for_encoding):
    """Dominant encoding yields 0, 1, 1 and pd.NA for the encoding fixture."""
    expected = pd.array([0, 1, 1, None], dtype='UInt8')
    result = data_for_encoding.encode_dominant()
    # ``(result == expected).all()`` skips positions where the comparison is
    # pd.NA, so the missing-genotype slot was never actually verified.
    # assert_series_equal treats NA as a value and also checks the dtype.
    pd.testing.assert_series_equal(pd.Series(result), pd.Series(expected))
    # Test using series accessor
    result_series = pd.Series(data_for_encoding).genotype.encode_dominant()
    # The accessor names its Series after the variant, so names are not compared
    pd.testing.assert_series_equal(result_series, pd.Series(result), check_names=False)

0 comments on commit 7eda28a

Please sign in to comment.