-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #8 from jrm5100/5-df-accessor
5 df accessor
- Loading branch information
Showing 15 changed files with 446 additions and 118 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .series_accessor import GenotypeSeriesAccessor | ||
from .dataframe_accessor import GenotypeDataframeAccessor |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
from typing import Optional | ||
|
||
import pandas as pd | ||
|
||
from pandas_genomics.arrays import GenotypeDtype | ||
|
||
|
||
@pd.api.extensions.register_dataframe_accessor("genomics")
class GenotypeDataframeAccessor:
    """
    DataFrame accessor for GenotypeArray methods.

    Registered on ``pd.DataFrame`` under the ``genomics`` namespace.  Every
    column of the DataFrame must have a ``GenotypeDtype``; otherwise creating
    the accessor raises ``AttributeError``.  Each method delegates to the
    ``genomics`` accessor of the individual columns and combines the
    per-column results.
    """

    def __init__(self, pandas_obj):
        """
        Validate the DataFrame and store it for later use.

        Raises
        ------
        AttributeError
            If any column is not a ``GenotypeDtype``.
        """
        # Inspect the dtypes directly instead of extracting each column's
        # values: one pass, no per-column lookup, same error message.
        for colname, dtype in pandas_obj.dtypes.items():
            if not GenotypeDtype.is_dtype(dtype):
                raise AttributeError(
                    f"Incompatible datatype: column {colname} is '{dtype}',"
                    f" but must be a GenotypeDtype"
                )
        self._obj = pandas_obj

    ######################
    # Variant Properties #
    ######################
    @property
    def variant_info(self) -> pd.DataFrame:
        """Return a DataFrame with variant info indexed by the column name"""
        return pd.DataFrame.from_dict(
            {
                colname: self._obj[colname].genomics.variant_info
                for colname in self._obj.columns
            },
            orient="index",
        )

    #########################
    # Calculated Properties #
    #########################
    @property
    def maf(self):
        """Return the minor allele frequency of each column as a Series
        indexed by column name.

        See :py:attr:`GenotypeArray.maf`"""
        return pd.Series(
            {col: self._obj[col].genomics.maf for col in self._obj.columns}
        )

    ############
    # Encoding #
    ############
    def _encode_columns(self, method_name: str) -> pd.DataFrame:
        """Call the named per-column encoding method on every column and
        join the results side by side."""
        encoded = [
            getattr(self._obj[col].genomics, method_name)()
            for col in self._obj.columns
        ]
        if not encoded:
            # pd.concat raises ValueError on an empty list; a frame without
            # columns simply encodes to a frame without columns.
            return pd.DataFrame(index=self._obj.index)
        return pd.concat(encoded, axis=1)

    def encode_additive(self) -> pd.DataFrame:
        """Additive encoding of genotypes.

        See :meth:`GenotypeArray.encode_additive`

        Returns
        -------
        pd.DataFrame
            One encoded column per genotype column.
        """
        return self._encode_columns("encode_additive")

    def encode_dominant(self) -> pd.DataFrame:
        """Dominant encoding of genotypes.

        See :meth:`GenotypeArray.encode_dominant`

        Returns
        -------
        pd.DataFrame
            One encoded column per genotype column.
        """
        return self._encode_columns("encode_dominant")

    def encode_recessive(self) -> pd.DataFrame:
        """Recessive encoding of genotypes.

        See :meth:`GenotypeArray.encode_recessive`

        Returns
        -------
        pd.DataFrame
            One encoded column per genotype column.
        """
        return self._encode_columns("encode_recessive")

    def encode_codominant(self) -> pd.DataFrame:
        """Codominant encoding of genotypes.

        See :meth:`GenotypeArray.encode_codominant`

        Returns
        -------
        pd.DataFrame
            One encoded column per genotype column.
        """
        return self._encode_columns("encode_codominant")

    ###########
    # Filters #
    ###########
    def filter_maf(self, keep_min_freq: Optional[float] = None) -> pd.DataFrame:
        """
        Drop variants with a MAF less than the specified value (0.01 by default)

        Parameters
        ----------
        keep_min_freq : float, optional
            Columns whose minor allele frequency is strictly below this
            threshold are dropped.  ``None`` means 0.01.

        Returns
        -------
        pd.DataFrame
            The DataFrame without the low-frequency columns.
        """
        if keep_min_freq is None:
            keep_min_freq = 0.01
        low_frequency = [
            col
            for col in self._obj.columns
            if self._obj[col].genomics.maf < keep_min_freq
        ]
        return self._obj.drop(columns=low_frequency)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from pandas_genomics.scalars import MISSING_IDX | ||
|
||
|
||
class EncodingMixin:
    """
    Genotype Mixin containing functions for performing encoding.

    The methods read ``self.variant.alleles``, ``self.dtype.variant.ploidy``
    and ``self.allele_idxs`` (indexed as a 2-D array, reduced along axis 1)
    from the class this mixin is combined with.
    """

    def _require_single_alt(self, encoding_name: str) -> None:
        """Raise ValueError when the variant lists more than two alleles."""
        # TODO: Return multiple arrays for multiple alternate alleles?
        if len(self.variant.alleles) > 2:
            raise ValueError(
                f"{encoding_name} encoding can only be used with one allele"
            )

    def _to_nullable_uint8(self, values) -> pd.arrays.IntegerArray:
        """Convert per-sample values to UInt8, with pd.NA wherever any allele
        of the sample is missing."""
        # Go through float so the missing samples can be marked as NaN, which
        # pd.array turns into pd.NA for the nullable integer dtype.
        as_float = values.astype("float")
        as_float[(self.allele_idxs == MISSING_IDX).any(axis=1)] = np.nan
        return pd.array(data=as_float, dtype="UInt8")

    def encode_additive(self) -> pd.arrays.IntegerArray:
        """
        Additive encoding: sum of the allele indexes of each sample.

        Returns
        -------
        pd.arrays.IntegerArray
            Number of copies of the alternate allele (allele index 1)
            pd.NA when any alleles are missing

        Raises
        ------
        ValueError
            If there is more than 1 alternate allele
        """
        self._require_single_alt("Additive")
        return self._to_nullable_uint8(self.allele_idxs.sum(axis=1))

    def encode_dominant(self) -> pd.arrays.IntegerArray:
        """
        Dominant encoding: does the sample carry the alternate allele at all.

        Returns
        -------
        pd.arrays.IntegerArray
            0 for no copies of the alternate allele (allele index 1)
            1 for any copies of the alternate allele
            pd.NA when any alleles are missing

        Raises
        ------
        ValueError
            If there is more than 1 alternate allele
        """
        self._require_single_alt("Dominant")
        return self._to_nullable_uint8((self.allele_idxs == 1).any(axis=1))

    def encode_recessive(self) -> pd.arrays.IntegerArray:
        """
        Recessive encoding: is every allele of the sample the alternate allele.

        Returns
        -------
        pd.arrays.IntegerArray
            1 for Homozygous Alt
            0 for anything else
            pd.NA when any alleles are missing

        Raises
        ------
        ValueError
            If there is more than 1 alternate allele
        """
        self._require_single_alt("Recessive")
        return self._to_nullable_uint8((self.allele_idxs == 1).all(axis=1))

    def encode_codominant(self) -> pd.arrays.Categorical:
        """
        This encodes the genotype into three categories.  When utilized in regression, this results in two variables
        due to dummy encoding- "Het" as 0 or 1 and "Hom" as 0 or 1.  0 in both indicates "Ref".

        Returns
        -------
        pd.arrays.Categorical
            'Ref' for Homozygous Reference
            'Het' for Heterozygous
            'Hom' for Homozygous Alt
            pd.NA for missing

        Raises
        ------
        ValueError
            If there is more than 1 alternate allele or ploidy is not 2
        """
        self._require_single_alt("Codominant")
        if self.dtype.variant.ploidy != 2:
            raise ValueError(
                "Codominant encoding can only be used with diploid genotypes"
            )

        # The allele-index sum doubles as the category code (0=Ref, 1=Het,
        # 2=Hom).  Any other sum becomes code -1, i.e. a missing value.
        # NOTE(review): this assumes a missing allele index always pushes the
        # sum outside 0..2 - confirm against the value of MISSING_IDX.
        allele_sum = self.allele_idxs.sum(axis=1).astype("int64")
        is_category = (allele_sum >= 0) & (allele_sum <= 2)
        codes = np.where(is_category, allele_sum, -1)
        return pd.Categorical.from_codes(
            codes, categories=["Ref", "Het", "Hom"], ordered=True
        )
Oops, something went wrong.