Merge pull request #8 from jrm5100/5-df-accessor

5 df accessor
HallLab · Mar 31, 2021 · 269159c · 269159c
2 parents 20911b5 + 2291e8b
commit 269159c
Show file tree

Hide file tree

Showing 15 changed files with 446 additions and 118 deletions.
diff --git a/pandas_genomics/__init__.py b/pandas_genomics/__init__.py
@@ -4,8 +4,8 @@
     import importlib_metadata
 
 from . import arrays, io, scalars
-from .accessors import GenotypeAccessor
+from .accessors import GenotypeSeriesAccessor, GenotypeDataframeAccessor
 
 __version__ = importlib_metadata.version(__name__)
 
-__all__ = [__version__, GenotypeAccessor, arrays, io, scalars]
+# __all__ must list attribute *names* as strings: `from pandas_genomics import *`
+# looks each entry up by name and raises TypeError for non-string items.
+# `arrays` is still imported above, so it stays part of the exported API.
+__all__ = [
+    "__version__",
+    "GenotypeSeriesAccessor",
+    "GenotypeDataframeAccessor",
+    "arrays",
+    "io",
+    "scalars",
+]
diff --git a/pandas_genomics/accessors/__init__.py b/pandas_genomics/accessors/__init__.py
@@ -0,0 +1,2 @@
+from .series_accessor import GenotypeSeriesAccessor
+from .dataframe_accessor import GenotypeDataframeAccessor
diff --git a/pandas_genomics/accessors/dataframe_accessor.py b/pandas_genomics/accessors/dataframe_accessor.py
@@ -0,0 +1,123 @@
+from typing import Optional
+
+import pandas as pd
+
+from pandas_genomics.arrays import GenotypeDtype
+
+
+@pd.api.extensions.register_dataframe_accessor("genomics")
+class GenotypeDataframeAccessor:
+    """
+    DataFrame accessor for GenotypeArray methods
+
+    Registered as ``DataFrame.genomics``.  Every column must have a
+    GenotypeDtype; the properties and methods below delegate to the
+    ``genomics`` accessor of each column and combine the per-column results.
+    """
+
+    def __init__(self, pandas_obj):
+        """Validate that every column is a GenotypeDtype and store the DataFrame.
+
+        Raises
+        ------
+        AttributeError
+            If any column has a dtype other than GenotypeDtype.
+        """
+        # Check the frame's dtypes directly rather than selecting each column:
+        # selecting by label returns a DataFrame (not a Series) when column
+        # labels are duplicated, which would break the dtype lookup.
+        for colname, dtype in pandas_obj.dtypes.items():
+            if not GenotypeDtype.is_dtype(dtype):
+                raise AttributeError(
+                    f"Incompatible datatype: column {colname} is '{dtype}',"
+                    f" but must be a GenotypeDtype"
+                )
+        self._obj = pandas_obj
+
+    def _encode_columns(self, method_name: str) -> pd.DataFrame:
+        """Call the named encoding method on every column and join the results column-wise."""
+        encoded = [
+            getattr(self._obj[col].genomics, method_name)()
+            for col in self._obj.columns
+        ]
+        if not encoded:
+            # pd.concat raises ValueError when given nothing to concatenate;
+            # a frame without columns simply encodes to a frame without columns.
+            return pd.DataFrame(index=self._obj.index)
+        return pd.concat(encoded, axis=1)
+
+    ######################
+    # Variant Properties #
+    ######################
+    @property
+    def variant_info(self) -> pd.DataFrame:
+        """Return a DataFrame with variant info indexed by the column name
+
+        Each row is the ``variant_info`` Series of the corresponding column.
+        """
+        return pd.DataFrame.from_dict(
+            {
+                colname: self._obj[colname].genomics.variant_info
+                for colname in self._obj.columns
+            },
+            orient="index",
+        )
+
+    #########################
+    # Calculated Properties #
+    #########################
+    @property
+    def maf(self):
+        """Return the minor allele frequency
+
+        See :py:attr:`GenotypeArray.maf`
+
+        Returns
+        -------
+        pd.Series
+            One value per column, indexed by column name.
+        """
+        return pd.Series(
+            {col: self._obj[col].genomics.maf for col in self._obj.columns}
+        )
+
+    ############
+    # Encoding #
+    ############
+    def encode_additive(self) -> pd.DataFrame:
+        """Additive encoding of genotypes.
+
+        See :meth:`GenotypeArray.encode_additive`
+
+        Returns
+        -------
+        pd.DataFrame
+            The per-column encodings concatenated along the column axis
+            (no columns when the DataFrame itself has none).
+        """
+        return self._encode_columns("encode_additive")
+
+    def encode_dominant(self) -> pd.DataFrame:
+        """Dominant encoding of genotypes.
+
+        See :meth:`GenotypeArray.encode_dominant`
+
+        Returns
+        -------
+        pd.DataFrame
+            The per-column encodings concatenated along the column axis
+            (no columns when the DataFrame itself has none).
+        """
+        return self._encode_columns("encode_dominant")
+
+    def encode_recessive(self) -> pd.DataFrame:
+        """Recessive encoding of genotypes.
+
+        See :meth:`GenotypeArray.encode_recessive`
+
+        Returns
+        -------
+        pd.DataFrame
+            The per-column encodings concatenated along the column axis
+            (no columns when the DataFrame itself has none).
+        """
+        return self._encode_columns("encode_recessive")
+
+    def encode_codominant(self) -> pd.DataFrame:
+        """Codominant encoding of genotypes.
+
+        See :meth:`GenotypeArray.encode_codominant`
+
+        Returns
+        -------
+        pd.DataFrame
+            The per-column encodings concatenated along the column axis
+            (no columns when the DataFrame itself has none).
+        """
+        return self._encode_columns("encode_codominant")
+
+    ###########
+    # Filters #
+    ###########
+    def filter_maf(self, keep_min_freq: Optional[float] = None) -> pd.DataFrame:
+        """
+        Drop variants with a MAF less than the specified value (0.01 by default)
+
+        Parameters
+        ----------
+        keep_min_freq: Optional[float]
+            Columns whose MAF is strictly below this value are dropped.
+            None selects the default of 0.01.
+
+        Returns
+        -------
+        pd.DataFrame
+            The result of ``DataFrame.drop`` on the low-MAF columns.
+        """
+        if keep_min_freq is None:
+            keep_min_freq = 0.01
+        return self._obj.drop(
+            columns=[
+                c
+                for c in self._obj.columns
+                if self._obj[c].genomics.maf < keep_min_freq
+            ]
+        )
diff --git a/pandas_genomics/accessors.py → pandas_genomics/accessors/series_accessor.py b/pandas_genomics/accessors.py → pandas_genomics/accessors/series_accessor.py
@@ -3,8 +3,8 @@
 from pandas_genomics.arrays import GenotypeDtype
 
 
-@pd.api.extensions.register_series_accessor("genotype")
-class GenotypeAccessor:
+@pd.api.extensions.register_series_accessor("genomics")
+class GenotypeSeriesAccessor:
     """
     Series accessor for GenotypeArray methods
     """
@@ -26,14 +26,23 @@ def _wrap_method(self, method, *args, **kwargs):
     ####################
     @property
     def variant(self):
-        """Retrieve the variant
+        """Retrieve the variant object held by the underlying array
 
         Returns
         -------
         variant: Variant
         """
         return self._array.variant
 
+    @property
+    def variant_info(self):
+        """Variant details packaged as a pandas Series
+
+        Returns
+        -------
+        pd.Series
+            Built from ``variant.as_dict()`` and named with ``self._name``.
+        """
+        info = self._array.variant.as_dict()
+        return pd.Series(info, name=self._name)
+
     #######################
     # Genotype Properties #
     #######################
@@ -44,6 +53,16 @@ def gt_scores(self):
         """
         return self._array.gt_scores
 
+    #########################
+    # Calculated Properties #
+    #########################
+    @property
+    def maf(self):
+        """Minor allele frequency, read from the underlying array
+
+        See :py:attr:`GenotypeArray.maf`
+        """
+        frequency = self._array.maf
+        return frequency
+
     ####################
     # In-place methods #
     ####################
@@ -125,3 +144,7 @@ def encode_codominant(self) -> pd.Series:
             index=self._index,
             name=f"{self._array.variant.id}_{self._array.variant.alleles[1]}",
         )
+
+    ##############
+    # QC Methods #
+    ##############
diff --git a/pandas_genomics/arrays/encoding_mixin.py b/pandas_genomics/arrays/encoding_mixin.py
@@ -0,0 +1,97 @@
+import numpy as np
+import pandas as pd
+
+from pandas_genomics.scalars import MISSING_IDX
+
+
+class EncodingMixin:
+    """
+    Genotype Mixin containing functions for performing encoding
+
+    The methods read ``self.variant.alleles``, ``self.allele_idxs`` and
+    ``self.dtype.variant.ploidy``, which the host class must provide.
+    NOTE(review): ``allele_idxs`` is assumed to be a 2-D NumPy array with one
+    row per genotype and one column per allele copy - confirm against the
+    host GenotypeArray.
+    """
+
+    def _require_single_alt_allele(self, encoding_name: str) -> None:
+        """Raise ValueError when the variant has more than two alleles."""
+        # TODO: Return multiple arrays for multiple alternate alleles?
+        if len(self.variant.alleles) > 2:
+            raise ValueError(
+                f"{encoding_name} encoding can only be used with one allele"
+            )
+
+    def _missing_rows(self):
+        """Return a boolean mask of rows where any allele index is MISSING_IDX."""
+        return (self.allele_idxs == MISSING_IDX).any(axis=1)
+
+    def _as_nullable_uint8(self, values) -> pd.arrays.IntegerArray:
+        """Convert per-row values to UInt8, using pd.NA for rows with a missing allele."""
+        # Go through float so the missing rows can be marked with NaN, which
+        # pd.array turns into pd.NA for the nullable integer dtype.
+        as_float = values.astype("float")
+        as_float[self._missing_rows()] = np.nan
+        return pd.array(data=as_float, dtype="UInt8")
+
+    def encode_additive(self) -> pd.arrays.IntegerArray:
+        """
+        Returns
+        -------
+        pd.arrays.IntegerArray
+            Number of copies of the minor allele
+            pd.NA when any alleles are missing
+            Raises ValueError if there is more than 1 alternate allele
+        """
+        self._require_single_alt_allele("Additive")
+        # With a single alternate allele the indexes are 0/1, so the row sum
+        # is the number of alternate-allele copies.
+        return self._as_nullable_uint8(self.allele_idxs.sum(axis=1))
+
+    def encode_dominant(self) -> pd.arrays.IntegerArray:
+        """
+        Returns
+        -------
+        pd.arrays.IntegerArray
+            0 for no copies of the minor allele
+            1 for any copies of the minor allele
+            pd.NA when any alleles are missing
+            Raises an error if there is more than 1 alternate allele
+        """
+        self._require_single_alt_allele("Dominant")
+        return self._as_nullable_uint8((self.allele_idxs == 1).any(axis=1))
+
+    def encode_recessive(self) -> pd.arrays.IntegerArray:
+        """
+        Returns
+        -------
+        pd.arrays.IntegerArray
+            1 for Homozygous Alt
+            0 for anything else
+            pd.NA when any alleles are missing
+            Raises an error if there is more than 1 alternate allele
+        """
+        self._require_single_alt_allele("Recessive")
+        return self._as_nullable_uint8((self.allele_idxs == 1).all(axis=1))
+
+    def encode_codominant(self) -> pd.arrays.Categorical:
+        """
+        This encodes the genotype into three categories.  When utilized in regression, this results in two variables
+        due to dummy encoding- "Het" as 0 or 1 and "Hom" as 0 or 1.  0 in both indicates "Ref".
+
+        Returns
+        -------
+        pd.arrays.Categorical
+            'Ref' for Homozygous Reference
+            'Het' for Heterozygous
+            'Hom' for Homozygous Alt
+            pd.NA for missing
+            Raises an error if there is more than 1 alternate allele or ploidy is not 2
+        """
+        self._require_single_alt_allele("Codominant")
+        if self.dtype.variant.ploidy != 2:
+            raise ValueError(
+                "Codominant encoding can only be used with diploid genotypes"
+            )
+
+        categories = ["Ref", "Het", "Hom"]
+        allele_sum = self.allele_idxs.sum(axis=1)
+        # A row gets a category only when its sum is 0, 1 or 2 AND no allele is
+        # flagged missing; the explicit missing check mirrors the other
+        # encoders instead of relying on MISSING_IDX pushing the sum out of range.
+        valid = np.isin(allele_sum, [0, 1, 2]) & ~self._missing_rows()
+        # Code -1 is the Categorical sentinel for a missing value.
+        codes = np.full(len(allele_sum), -1, dtype="int8")
+        codes[valid] = allele_sum[valid]
+        return pd.Categorical.from_codes(codes, categories=categories, ordered=True)