diff --git a/clarite/modules/analyze/association_study.py b/clarite/modules/analyze/association_study.py index 7fca1d8..00fdef2 100644 --- a/clarite/modules/analyze/association_study.py +++ b/clarite/modules/analyze/association_study.py @@ -2,7 +2,6 @@ import click import pandas as pd -from pandas_genomics import GenotypeDtype from clarite.modules.analyze import regression from clarite.modules.analyze.regression import ( @@ -43,10 +42,6 @@ def association_study( This can be 'glm', 'weighted_glm', or 'r_survey' for built-in Regression types, or a custom subclass of Regression. If None, it is set to 'glm' if a survey design is not specified and 'weighted_glm' if it is. - encoding: str, default "additive" - Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'weighted'} - edge_encoding_info: Optional pd.DataFrame, default None - If edge encoding is used, this must be provided. See Pandas-Genomics documentation on edge encodings. kwargs: Keyword arguments specific to the Regression being used Returns @@ -55,34 +50,6 @@ def association_study( Association Study results DataFrame with at least these columns: ['N', 'pvalue', 'error', 'warnings']. Indexed by the outcome variable and the variable being assessed in each regression """ - # Copy data to avoid modifying the original, in case it is changed - data = data.copy(deep=True) - - # Encode any genotype data - has_genotypes = False - for dt in data.dtypes: - if GenotypeDtype.is_dtype(dt): - has_genotypes = True - break - if has_genotypes: - if encoding == "additive": - data = data.genomics.encode_additive() - elif encoding == "dominant": - data = data.genomics.encode_dominant() - elif encoding == "recessive": - data = data.genomics.encode_recessive() - elif encoding == "codominant": - data = data.genomics.encode_codominant() - elif encoding == "edge": - if edge_encoding_info is None: - raise ValueError( - "'edge_encoding_info' must be provided when using edge encoding" - ) - else: - data = data.genomics.encode_edge(edge_encoding_info) - else: - raise ValueError(f"Genotypes provided with unknown 'encoding': {encoding}") - # Ensure outcome, covariates, and regression variables are lists if isinstance(outcomes, str): outcomes = [ diff --git a/clarite/modules/analyze/interaction_study.py b/clarite/modules/analyze/interaction_study.py index 74350ea..caf6eed 100644 --- a/clarite/modules/analyze/interaction_study.py +++ b/clarite/modules/analyze/interaction_study.py @@ -16,7 +16,7 @@ def interaction_study( edge_encoding_info: Optional[pd.DataFrame] = None, report_betas: bool = False, min_n: int = 200, - process_num: Optional[int] = None + process_num: Optional[int] = None, ): """Perform LRT tests comparing a model with interaction terms to one without. @@ -110,7 +110,7 @@ def interaction_study( min_n=min_n, interactions=interactions, report_betas=report_betas, - process_num=process_num + process_num=process_num, ) print(regression) diff --git a/clarite/modules/analyze/regression/base.py b/clarite/modules/analyze/regression/base.py index 3b1b7d4..e0f71be 100644 --- a/clarite/modules/analyze/regression/base.py +++ b/clarite/modules/analyze/regression/base.py @@ -39,6 +39,8 @@ def __init__( regression_variables: List[str], covariates: Optional[List[str]] = None, ): + # Copy the data to avoid changing the original. The copy will be modified in-place. + data = data.copy() # Print a warning if there are any empty categories and remove them # This is done to distinguish from those that become missing during analysis (and could be an issue) empty_categories = _remove_empty_categories(data) @@ -105,7 +107,7 @@ def _validate_regression_params(self, regression_variables): types = _get_dtypes(self.data) rv_types = {v: t for v, t in types.iteritems() if v in regression_variables} rv_count = 0 - for dtype in ["binary", "categorical", "continuous"]: + for dtype in ["binary", "categorical", "continuous", "genotypes"]: self.regression_variables[dtype] = [ v for v, t in rv_types.items() if t == dtype ] diff --git a/clarite/modules/analyze/regression/glm_regression.py b/clarite/modules/analyze/regression/glm_regression.py index f3f3976..c681d74 100644 --- a/clarite/modules/analyze/regression/glm_regression.py +++ b/clarite/modules/analyze/regression/glm_regression.py @@ -9,9 +9,10 @@ import patsy import scipy import statsmodels.api as sm +from pandas_genomics import GenotypeDtype from scipy.stats import stats -from clarite.internal.utilities import _remove_empty_categories +from clarite.internal.utilities import _remove_empty_categories, _get_dtypes from .base import Regression from ..utils import fix_names, statsmodels_var_regex @@ -59,10 +60,16 @@ class GLMRegression(Regression): False by default. If True, numeric data will be standardized using z-scores before regression. This will affect the beta values and standard error, but not the pvalues. + encoding: str, default "additive" + Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'weighted'} + edge_encoding_info: Optional pd.DataFrame, default None + If edge encoding is used, this must be provided. See Pandas-Genomics documentation on edge encodings. process_num: Optional[int] Number of processes to use when running the analysis, default is None (use the number of cores) """ + KNOWN_ENCODINGS = {"additive", "dominant", "recessive", "codominant", "edge"} + def __init__( self, data: pd.DataFrame, @@ -72,6 +79,8 @@ def __init__( min_n: int = 200, report_categorical_betas: bool = False, standardize_data: bool = False, + encoding: str = "additive", + edge_encoding_info: Optional[pd.DataFrame] = None, process_num: Optional[int] = None, ): # base class init @@ -91,6 +100,15 @@ def __init__( if process_num is None: process_num = multiprocessing.cpu_count() self.process_num = process_num + if encoding not in self.KNOWN_ENCODINGS: + raise ValueError(f"Genotypes provided with unknown 'encoding': {encoding}") + elif encoding == "edge" and edge_encoding_info is None: + raise ValueError( + "'edge_encoding_info' must be provided when using edge encoding" + ) + else: + self.encoding = encoding + self.edge_encoding_info = edge_encoding_info # Ensure the data output type is compatible # Set 'self.family' and 'self.use_t' which are dependent on the outcome dtype @@ -316,6 +334,28 @@ def _run_categorical( "Diff_AIC": est.aic - est_restricted.aic, } + def _get_rv_specific_data(self, rv: str): + """Select the data relevant to performing a regression on a given variable, encoding genotypes if needed""" + data = self.data[[rv, self.outcome_variable] + self.covariates].copy() + # Encode any genotype data + has_genotypes = False + for dt in data.dtypes: + if GenotypeDtype.is_dtype(dt): + has_genotypes = True + break + if has_genotypes: + if self.encoding == "additive": + data = data.genomics.encode_additive() + elif self.encoding == "dominant": + data = data.genomics.encode_dominant() + elif self.encoding == "recessive": + data = data.genomics.encode_recessive() + elif self.encoding == "codominant": + data = data.genomics.encode_codominant() + elif self.encoding == "edge": + data = data.genomics.encode_edge(self.edge_encoding_info) + return data + def run(self): """Run a regression object, returning the results and logging any warnings/errors""" for rv_type, rv_list in self.regression_variables.items(): @@ -330,24 +370,37 @@ def run(self): ) ) - with multiprocessing.Pool(processes=self.process_num) as pool: - run_result = pool.starmap( - self._run_rv, - zip( - rv_list, - repeat(rv_type), - [ - self.data[[rv, self.outcome_variable] + self.covariates] - for rv in rv_list - ], - repeat(self.outcome_variable), - repeat(self.covariates), - repeat(self.min_n), - repeat(self.family), - repeat(self.use_t), - repeat(self.report_categorical_betas), - ), - ) + if self.process_num == 1: + run_result = [ + self._run_rv( + rv, + rv_type, + self._get_rv_specific_data(rv), + self.outcome_variable, + self.covariates, + self.min_n, + self.family, + self.use_t, + self.report_categorical_betas, + ) + for rv in rv_list + ] + else: + with multiprocessing.Pool(processes=self.process_num) as pool: + run_result = pool.starmap( + self._run_rv, + zip( + rv_list, + repeat(rv_type), + [self._get_rv_specific_data(rv) for rv in rv_list], + repeat(self.outcome_variable), + repeat(self.covariates), + repeat(self.min_n), + repeat(self.family), + repeat(self.use_t), + repeat(self.report_categorical_betas), + ), + ) for rv, rv_result in zip(rv_list, run_result): results, warnings, error = rv_result @@ -424,6 +477,11 @@ def _run_rv( # Apply the complete_case_mask to the data to ensure categorical models use the same data in the LRT data = data.loc[complete_case_mask] + # Update rv_type to the encoded type if it is a genotype + if rv_type == "genotypes": + """Need to update with encoded type""" + rv_type = _get_dtypes(data[rv])[rv] + # Run Regression if rv_type == "continuous": result = cls.get_default_result_dict(rv) diff --git a/clarite/modules/analyze/regression/interaction_regression.py b/clarite/modules/analyze/regression/interaction_regression.py index f4c7421..529ce6f 100644 --- a/clarite/modules/analyze/regression/interaction_regression.py +++ b/clarite/modules/analyze/regression/interaction_regression.py @@ -8,6 +8,7 @@ import patsy import scipy import statsmodels.api as sm +from pandas_genomics import GenotypeDtype from clarite.internal.utilities import _remove_empty_categories from . import GLMRegression @@ -45,6 +46,10 @@ class InteractionRegression(GLMRegression): False by default. If True, the results will contain one row for each interaction term and will include the beta value for that term. The number of terms increases with the number of categories in each interacting term. + encoding: str, default "additive" + Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'weighted'} + edge_encoding_info: Optional pd.DataFrame, default None + If edge encoding is used, this must be provided. See Pandas-Genomics documentation on edge encodings. process_num: Optional[int] Number of processes to use when running the analysis, default is None (use the number of cores) @@ -58,6 +63,8 @@ def __init__( min_n=200, interactions=None, report_betas=False, + encoding: str = "additive", + edge_encoding_info: Optional[pd.DataFrame] = None, process_num: Optional[int] = None, ): # base class init @@ -71,6 +78,8 @@ def __init__( outcome_variable=outcome_variable, covariates=covariates, regression_variables=regression_variables, + encoding=encoding, + edge_encoding_info=edge_encoding_info, min_n=min_n, ) @@ -86,11 +95,10 @@ def __init__( def _process_interactions(self, interactions): """Validate the interactions parameter and save it as a list of string tuples""" - regression_var_list = ( - self.regression_variables["binary"] - + self.regression_variables["categorical"] - + self.regression_variables["continuous"] - ) + regression_var_list = [] + for var_list in self.regression_variables.values(): + regression_var_list.extend(var_list) + if len(regression_var_list) < 2: raise ValueError( f"Not enough valid variables for running interactions: {len(regression_var_list)} variables" @@ -212,6 +220,35 @@ def _run_interaction_regression( # Did not converge - nothing to update yield dict() + def _get_interaction_specific_data(self, interaction: Tuple[str, str]): + """Select the data relevant to performing a regression on a given interaction, encoding genotypes if needed""" + data = self.data[ + list(interaction) + + [ + self.outcome_variable, + ] + + self.covariates + ].copy() + + # Encode any genotype data + has_genotypes = False + for dt in data.dtypes: + if GenotypeDtype.is_dtype(dt): + has_genotypes = True + break + if has_genotypes: + if self.encoding == "additive": + data = data.genomics.encode_additive() + elif self.encoding == "dominant": + data = data.genomics.encode_dominant() + elif self.encoding == "recessive": + data = data.genomics.encode_recessive() + elif self.encoding == "codominant": + data = data.genomics.encode_codominant() + elif self.encoding == "edge": + data = data.genomics.encode_edge(self.edge_encoding_info) + return data + def run(self): """Run a regression object, returning the results and logging any warnings/errors""" # Log how many interactions are being run using how many processes @@ -221,29 +258,40 @@ def run(self): fg="green", ) ) - with multiprocessing.Pool(processes=self.process_num) as pool: - run_result = pool.starmap( - self._run_interaction, - zip( - self.interactions, - [ - self.data[ - list(interaction) - + [ - self.outcome_variable, - ] - + self.covariates - ] - for interaction in self.interactions - ], - repeat(self.outcome_variable), - repeat(self.covariates), - repeat(self.min_n), - repeat(self.family), - repeat(self.use_t), - repeat(self.report_betas), - ), - ) + + if self.process_num == 1: + run_result = [ + self._run_interaction( + interaction, + self._get_interaction_specific_data(interaction), + self.outcome_variable, + self.covariates, + self.min_n, + self.family, + self.use_t, + self.report_betas, + ) + for interaction in self.interactions + ] + + else: + with multiprocessing.Pool(processes=self.process_num) as pool: + run_result = pool.starmap( + self._run_interaction, + zip( + self.interactions, + [ + self._get_interaction_specific_data(interaction) + for interaction in self.interactions + ], + repeat(self.outcome_variable), + repeat(self.covariates), + repeat(self.min_n), + repeat(self.family), + repeat(self.use_t), + repeat(self.report_betas), + ), + ) for interaction, interaction_result in zip(self.interactions, run_result): interaction_str = ":".join(interaction) diff --git a/clarite/modules/analyze/regression/r_survey_regression.py b/clarite/modules/analyze/regression/r_survey_regression.py index 11374de..e5de266 100644 --- a/clarite/modules/analyze/regression/r_survey_regression.py +++ b/clarite/modules/analyze/regression/r_survey_regression.py @@ -61,6 +61,10 @@ def __init__( covariates=covariates, ) + # Raise an error if any genotypes are present since they are unsupported + if len(self.regression_variables.get("genotypes", [])) > 0: + raise ValueError("Genotypes are not supported in RSurveyRegression") + # Custom init involving kwargs passed to this regression self.min_n = min_n self.survey_design_spec = survey_design_spec diff --git a/clarite/modules/analyze/regression/weighted_glm_regression.py b/clarite/modules/analyze/regression/weighted_glm_regression.py index 978821e..183f6a1 100644 --- a/clarite/modules/analyze/regression/weighted_glm_regression.py +++ b/clarite/modules/analyze/regression/weighted_glm_regression.py @@ -13,7 +13,7 @@ from .glm_regression import GLMRegression from clarite.modules.survey import SurveyDesignSpec, SurveyModel from clarite.internal.calculations import regTermTest -from clarite.internal.utilities import _remove_empty_categories +from clarite.internal.utilities import _remove_empty_categories, _get_dtypes from ..utils import statsmodels_var_regex, fix_names @@ -67,6 +67,10 @@ class WeightedGLMRegression(GLMRegression): False by default. If True, numeric data will be standardized using z-scores before regression. This will affect the beta values and standard error, but not the pvalues. + encoding: str, default "additive" + Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'weighted'} + edge_encoding_info: Optional pd.DataFrame, default None + If edge encoding is used, this must be provided. See Pandas-Genomics documentation on edge encodings. process_num: Optional[int] Number of processes to use when running the analysis, default is None (use the number of cores) """ @@ -81,6 +85,8 @@ def __init__( min_n: int = 200, report_categorical_betas: bool = False, standardize_data: bool = False, + encoding: str = "additive", + edge_encoding_info: Optional[pd.DataFrame] = None, process_num: Optional[int] = None, ): # survey_design_spec should actually not be None, but is a keyword for convenience @@ -96,6 +102,8 @@ def __init__( min_n=min_n, report_categorical_betas=report_categorical_betas, standardize_data=standardize_data, + encoding=encoding, + edge_encoding_info=edge_encoding_info, process_num=process_num, ) @@ -289,29 +297,40 @@ def run(self): ) ) - with multiprocessing.Pool(processes=self.process_num) as pool: - run_result = pool.starmap( - self._run_weighted_rv, - zip( - rv_list, - repeat(rv_type), - [ - self.data[[rv, ov] + cvs] - for rv, ov, cvs in zip( - rv_list, - repeat(self.outcome_variable), - repeat(self.covariates), - ) - ], - repeat(self.outcome_variable), - repeat(self.covariates), - repeat(self.survey_design_spec), - repeat(self.min_n), - repeat(self.family), - repeat(self.use_t), - repeat(self.report_categorical_betas), - ), - ) + if self.process_num == 1: + run_result = [ + self._run_weighted_rv( + rv, + rv_type, + self._get_rv_specific_data(rv), + self.outcome_variable, + self.covariates, + self.survey_design_spec, + self.min_n, + self.family, + self.use_t, + self.report_categorical_betas, + ) + for rv in rv_list + ] + + else: + with multiprocessing.Pool(processes=self.process_num) as pool: + run_result = pool.starmap( + self._run_weighted_rv, + zip( + rv_list, + repeat(rv_type), + [self._get_rv_specific_data(rv) for rv in rv_list], + repeat(self.outcome_variable), + repeat(self.covariates), + repeat(self.survey_design_spec), + repeat(self.min_n), + repeat(self.family), + repeat(self.use_t), + repeat(self.report_categorical_betas), + ), + ) for rv, rv_result in zip(rv_list, run_result): results, warnings, error = rv_result @@ -411,6 +430,11 @@ def _run_weighted_rv( # Full Formula, adding the regression variable to the restricted formula formula = formula_restricted + f" + Q('{rv}" "')" + # Update rv_type to the encoded type if it is a genotype + if rv_type == "genotypes": + """Need to update with encoded type""" + rv_type = _get_dtypes(data[rv])[rv] + # Run Regression if rv_type == "continuous": result = cls.get_default_result_dict(rv) diff --git a/docs/source/release-history.rst b/docs/source/release-history.rst index 86bc164..a159b4f 100644 --- a/docs/source/release-history.rst +++ b/docs/source/release-history.rst @@ -2,6 +2,11 @@ Release History =============== +v2.2.0 (2021-07-19) +------------------- + +Change the analysis so that encoding is performed later (to save memory usage) + v2.1.1 (2021-07-16) ------------------- diff --git a/pyproject.toml b/pyproject.toml index af6fd5f..c34af76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "clarite" -version = "2.1.1" +version = "2.2.0" description = "CLeaning to Analysis: Reproducibility-based Interface for Traits and Exposures" authors = ["Hall Lab "] license = "BSD-3-Clause" @@ -33,7 +33,7 @@ sphinx-copybutton = {version = "^0.3.0", optional = true} ipython = {version = "^7.18.1", optional = true} sphinx-click = {version = "^2.6", optional = true} importlib-metadata = {version = "^2.0", python = "<3.8"} -pandas-genomics = ">=0.10" +pandas-genomics = ">=0.10.1" [tool.poetry.dev-dependencies] codecov = "^2.1.11" diff --git a/tests/analyze/conftest.py b/tests/analyze/conftest.py index 527d6f1..1708c0f 100644 --- a/tests/analyze/conftest.py +++ b/tests/analyze/conftest.py @@ -108,7 +108,7 @@ def large_gwas_data(): n=10000, random_seed=i, ) - for i in range(2, 1000) + for i in range(3, 1001) } ) genotypes = pd.concat([genotypes, random_genotypes], axis=1) diff --git a/tests/analyze/test_gwas.py b/tests/analyze/test_gwas.py index 44b0068..21f0b3c 100644 --- a/tests/analyze/test_gwas.py +++ b/tests/analyze/test_gwas.py @@ -1,6 +1,10 @@ import pytest +import pandas as pd +import numpy as np + import clarite +from clarite.modules.survey import SurveyDesignSpec def test_bams_main(genotype_case_control_add_add_main): @@ -28,8 +32,8 @@ def test_bams_interaction(genotype_case_control_rec_rec_onlyinteraction): @pytest.mark.slow @pytest.mark.parametrize("process_num", [None, 1]) -def test_large_gwas(large_gwas_data, process_num): - """10k samples with 1k SNPs""" +def test_largeish_gwas(large_gwas_data, process_num): + """10k samples with 1000 SNPs""" # Run CLARITE GWAS results = clarite.analyze.association_study( data=large_gwas_data, @@ -37,4 +41,25 @@ def test_large_gwas(large_gwas_data, process_num): encoding="additive", process_num=process_num, ) - print(results.head()) + # Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly + results_weighted = clarite.analyze.association_study( + data=large_gwas_data, + outcomes="Outcome", + encoding="additive", + process_num=process_num, + survey_design_spec=SurveyDesignSpec( + survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}), + weights="weights", + ), + ) + # TODO: Add useful asserts rather than just making sure it runs + + +@pytest.mark.xfail(strict=True) +def test_gwas_r(large_gwas_data): + clarite.analyze.association_study( + data=large_gwas_data, + outcomes="Outcome", + encoding="additive", + survey_kind="r_survey", + )