Skip to content

Commit

Permalink
Merge pull request #8 from jrm5100/5-df-accessor
Browse files Browse the repository at this point in the history
5 df accessor
  • Loading branch information
jrm5100 committed Mar 31, 2021
2 parents 20911b5 + 2291e8b commit 269159c
Show file tree
Hide file tree
Showing 15 changed files with 446 additions and 118 deletions.
4 changes: 2 additions & 2 deletions pandas_genomics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import importlib_metadata

from . import arrays, io, scalars
from .accessors import GenotypeAccessor
from .accessors import GenotypeSeriesAccessor, GenotypeDataframeAccessor

__version__ = importlib_metadata.version(__name__)

__all__ = [__version__, GenotypeAccessor, arrays, io, scalars]
# __all__ must contain the public names as *strings*; listing the objects
# themselves makes ``from pandas_genomics import *`` fail with a TypeError.
# ``arrays`` is imported by this module, so it is exported here as well.
__all__ = [
    "__version__",
    "GenotypeSeriesAccessor",
    "GenotypeDataframeAccessor",
    "arrays",
    "io",
    "scalars",
]
2 changes: 2 additions & 0 deletions pandas_genomics/accessors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .series_accessor import GenotypeSeriesAccessor
from .dataframe_accessor import GenotypeDataframeAccessor
123 changes: 123 additions & 0 deletions pandas_genomics/accessors/dataframe_accessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from typing import Optional

import pandas as pd

from pandas_genomics.arrays import GenotypeDtype


@pd.api.extensions.register_dataframe_accessor("genomics")
class GenotypeDataframeAccessor:
    """
    DataFrame accessor for GenotypeArray methods

    Registered as ``DataFrame.genomics``.  Every column of the DataFrame must
    have a GenotypeDtype; the properties and methods below delegate
    column-by-column to the Series-level ``genomics`` accessor and collect the
    per-column results.
    """

    def __init__(self, pandas_obj):
        """Validate that every column is a GenotypeDtype and store the DataFrame.

        Raises
        ------
        AttributeError
            If any column has a dtype other than GenotypeDtype.
        """
        # Validate from the dtypes Series so that no column has to be extracted
        # by label and the dtype reported in the message is the one checked.
        for colname, dtype in pandas_obj.dtypes.items():
            if not GenotypeDtype.is_dtype(dtype):
                raise AttributeError(
                    f"Incompatible datatype: column {colname} is '{dtype}',"
                    f" but must be a GenotypeDtype"
                )
        self._obj = pandas_obj

    ######################
    # Variant Properties #
    ######################
    @property
    def variant_info(self) -> pd.DataFrame:
        """Return a DataFrame with variant info indexed by the column name"""
        return pd.DataFrame.from_dict(
            {
                colname: self._obj[colname].genomics.variant_info
                for colname in self._obj.columns
            },
            orient="index",
        )

    #########################
    # Calculated Properties #
    #########################
    @property
    def maf(self) -> pd.Series:
        """Return the minor allele frequency of each column

        See :py:attr:`GenotypeArray.maf`

        Returns
        -------
        pd.Series
            One value per column, indexed by the column name
        """
        return pd.Series(
            {col: self._obj[col].genomics.maf for col in self._obj.columns}
        )

    ############
    # Encoding #
    ############
    def _encode_columns(self, method_name: str) -> pd.DataFrame:
        """Call the named Series-accessor encoding on every column and combine the results."""
        encoded = [
            getattr(self._obj[col].genomics, method_name)()
            for col in self._obj.columns
        ]
        if not encoded:
            # pd.concat raises ValueError on an empty list; a frame without
            # columns (e.g. after filtering every variant out) encodes to an
            # empty frame on the same index instead.
            return pd.DataFrame(index=self._obj.index)
        return pd.concat(encoded, axis=1)

    def encode_additive(self) -> pd.DataFrame:
        """Additive encoding of genotypes.

        See :meth:`GenotypeArray.encode_additive`

        Returns
        -------
        pd.DataFrame
        """
        return self._encode_columns("encode_additive")

    def encode_dominant(self) -> pd.DataFrame:
        """Dominant encoding of genotypes.

        See :meth:`GenotypeArray.encode_dominant`

        Returns
        -------
        pd.DataFrame
        """
        return self._encode_columns("encode_dominant")

    def encode_recessive(self) -> pd.DataFrame:
        """Recessive encoding of genotypes.

        See :meth:`GenotypeArray.encode_recessive`

        Returns
        -------
        pd.DataFrame
        """
        return self._encode_columns("encode_recessive")

    def encode_codominant(self) -> pd.DataFrame:
        """Codominant encoding of genotypes.

        See :meth:`GenotypeArray.encode_codominant`

        Returns
        -------
        pd.DataFrame
        """
        return self._encode_columns("encode_codominant")

    ###########
    # Filters #
    ###########
    def filter_maf(self, keep_min_freq: Optional[float] = None) -> pd.DataFrame:
        """
        Drop variants with a MAF less than the specified value (0.01 by default)

        Parameters
        ----------
        keep_min_freq: Optional[float]
            Columns whose MAF is strictly below this value are dropped.
            ``None`` means 0.01.

        Returns
        -------
        pd.DataFrame
            The result of ``DataFrame.drop``; the wrapped DataFrame is not modified.
        """
        if keep_min_freq is None:
            keep_min_freq = 0.01
        return self._obj.drop(
            columns=[
                c
                for c in self._obj.columns
                if self._obj[c].genomics.maf < keep_min_freq
            ]
        )
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from pandas_genomics.arrays import GenotypeDtype


@pd.api.extensions.register_series_accessor("genotype")
class GenotypeAccessor:
@pd.api.extensions.register_series_accessor("genomics")
class GenotypeSeriesAccessor:
"""
Series accessor for GenotypeArray methods
"""
Expand All @@ -26,14 +26,23 @@ def _wrap_method(self, method, *args, **kwargs):
####################
@property
def variant(self):
"""Retrieve the variant
"""Retrieve the variant object
Returns
-------
variant: Variant
"""
return self._array.variant

@property
def variant_info(self):
    """Return the variant's fields as a pandas Series

    The Series is built from the variant's ``as_dict()`` output and is named
    after this accessor's ``_name``.

    Returns
    -------
    variant: pd.Series
    """
    info = self._array.variant.as_dict()
    return pd.Series(info, name=self._name)

#######################
# Genotype Properties #
#######################
Expand All @@ -44,6 +53,16 @@ def gt_scores(self):
"""
return self._array.gt_scores

#########################
# Calculated Properties #
#########################
@property
def maf(self):
    """Minor allele frequency of the underlying genotype array

    See :py:attr:`GenotypeArray.maf`
    """
    genotypes = self._array
    return genotypes.maf

####################
# In-place methods #
####################
Expand Down Expand Up @@ -125,3 +144,7 @@ def encode_codominant(self) -> pd.Series:
index=self._index,
name=f"{self._array.variant.id}_{self._array.variant.alleles[1]}",
)

##############
# QC Methods #
##############
97 changes: 97 additions & 0 deletions pandas_genomics/arrays/encoding_mixin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import numpy as np
import pandas as pd

from pandas_genomics.scalars import MISSING_IDX


class EncodingMixin:
    """
    Genotype Mixin containing functions for performing encoding

    The host class must provide ``variant`` (with an ``alleles`` sequence),
    ``dtype.variant.ploidy`` and ``allele_idxs``, an array that is reduced
    along ``axis=1`` to obtain one value per genotype.
    """

    def _require_single_alt_allele(self, encoding_name: str) -> None:
        """Raise ValueError when the variant has more than two alleles in total."""
        # TODO: Return multiple arrays for multiple alternate alleles?
        if len(self.variant.alleles) > 2:
            raise ValueError(
                f"{encoding_name} encoding can only be used with one allele"
            )

    def _to_masked_uint8(self, values: np.ndarray) -> pd.arrays.IntegerArray:
        """Convert per-genotype values to UInt8, using pd.NA where any allele is missing."""
        # Go through float so missing genotypes can be marked with NaN, which
        # pd.array turns into pd.NA for the nullable integer dtype.
        as_float = values.astype("float")
        as_float[(self.allele_idxs == MISSING_IDX).any(axis=1)] = np.nan
        return pd.array(data=as_float, dtype="UInt8")

    def encode_additive(self) -> pd.arrays.IntegerArray:
        """
        Returns
        -------
        pd.arrays.IntegerArray
            Number of copies of the minor allele
            pd.NA when any alleles are missing

        Raises
        ------
        ValueError
            If there is more than 1 alternate allele
        """
        self._require_single_alt_allele("Additive")
        return self._to_masked_uint8(self.allele_idxs.sum(axis=1))

    def encode_dominant(self) -> pd.arrays.IntegerArray:
        """
        Returns
        -------
        pd.arrays.IntegerArray
            0 for no copies of the minor allele
            1 for any copies of the minor allele
            pd.NA when any alleles are missing

        Raises
        ------
        ValueError
            If there is more than 1 alternate allele
        """
        self._require_single_alt_allele("Dominant")
        return self._to_masked_uint8((self.allele_idxs == 1).any(axis=1))

    def encode_recessive(self) -> pd.arrays.IntegerArray:
        """
        Returns
        -------
        pd.arrays.IntegerArray
            1 for Homozygous Alt
            0 for anything else
            pd.NA when any alleles are missing

        Raises
        ------
        ValueError
            If there is more than 1 alternate allele
        """
        self._require_single_alt_allele("Recessive")
        return self._to_masked_uint8((self.allele_idxs == 1).all(axis=1))

    def encode_codominant(self) -> pd.arrays.Categorical:
        """
        This encodes the genotype into three categories.  When utilized in regression, this results in two variables
        due to dummy encoding- "Het" as 0 or 1 and "Hom" as 0 or 1.  0 in both indicates "Ref".

        Returns
        -------
        pd.arrays.Categorical
            'Ref' for Homozygous Reference
            'Het' for Heterozygous
            'Hom' for Homozygous Alt
            pd.NA for missing

        Raises
        ------
        ValueError
            If there is more than 1 alternate allele or ploidy is not 2
        """
        self._require_single_alt_allele("Codominant")
        if self.dtype.variant.ploidy != 2:
            raise ValueError(
                "Codominant encoding can only be used with diploid genotypes"
            )

        categories = ["Ref", "Het", "Hom"]
        allele_sum = self.allele_idxs.sum(axis=1)
        # The allele sum (0, 1 or 2) is directly the category code; any other
        # sum becomes the "missing" code -1.
        # NOTE(review): missing genotypes are recognised only by their sum
        # falling outside 0..2 -- presumably MISSING_IDX is larger than 2; confirm.
        codes = np.full(allele_sum.shape, -1, dtype="int8")
        valid = (allele_sum >= 0) & (allele_sum <= 2)
        codes[valid] = allele_sum[valid]
        return pd.Categorical.from_codes(codes, categories=categories, ordered=True)

0 comments on commit 269159c

Please sign in to comment.