Release 2.3.1

HallLab · Feb 8, 2023 · 98f78ee · 98f78ee
1 parent 33cabbb
commit 98f78ee
Show file tree

Hide file tree

Showing 8 changed files with 39 additions and 26 deletions.
diff --git a/clarite/internal/utilities.py b/clarite/internal/utilities.py
@@ -1,6 +1,6 @@
 from functools import wraps
 from importlib.util import find_spec
-from typing import Optional, List, Union
+from typing import List, Optional, Union
 
 import click
 import pandas as pd
@@ -208,7 +208,10 @@ def _remove_empty_categories(
             existing_cats = data[var].cat.categories
             if data[var].cat.ordered:
                 print()
-            data[var] = data[var].cat.remove_unused_categories()
+            # GITHUB ISSUE #120: SettingWithCopyWarning on Regression runs
+            # data[var] = data[var].cat.remove_unused_categories()
+            data.loc[:, var] = data[var].cat.remove_unused_categories()
+
             removed_categories = set(existing_cats) - set(data[var].cat.categories)
             if len(removed_categories) > 0:
                 removed_cats[var] = removed_categories

diff --git a/clarite/modules/analyze/regression/glm_regression.py b/clarite/modules/analyze/regression/glm_regression.py
@@ -17,6 +17,9 @@
 from ..utils import fix_names, statsmodels_var_regex
 from .base import Regression
 
+# GITHUB ISSUE #119: Regressions raise an error with multiprocessing on Python > 3.8
+multiprocessing.get_start_method("fork")
+
 
 class GLMRegression(Regression):
     """
@@ -127,7 +130,7 @@ def __init__(
             counts = self.data[self.outcome_variable].value_counts().to_dict()
 
             categories = self.data[self.outcome_variable].cat.categories
-            # GITHUB/ISSUES 115: Keep control as 0 and case as 1
+            # GITHUB ISSUE #115: Keep control as 0 and case as 1
             if categories[0] == "Case" and categories[1] == "Control":
                 categories = sorted(categories, reverse=True)
 

diff --git a/clarite/modules/analyze/regression/interaction_regression.py b/clarite/modules/analyze/regression/interaction_regression.py
@@ -15,6 +15,9 @@
 from ..utils import fix_names
 from . import GLMRegression
 
+# GITHUB ISSUE #119: Regressions raise an error with multiprocessing on Python > 3.8
+multiprocessing.get_start_method("fork")
+
 
 class InteractionRegression(GLMRegression):
     """

diff --git a/clarite/modules/analyze/regression/weighted_glm_regression.py b/clarite/modules/analyze/regression/weighted_glm_regression.py
@@ -1,20 +1,24 @@
 import multiprocessing
 import re
 from itertools import repeat
-from typing import Optional, Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import click
 import numpy as np
+import pandas as pd
 import patsy
 import scipy
-import pandas as pd
 import statsmodels.api as sm
 
-from .glm_regression import GLMRegression
-from clarite.modules.survey import SurveyDesignSpec, SurveyModel
 from clarite.internal.calculations import regTermTest
-from clarite.internal.utilities import _remove_empty_categories, _get_dtypes
-from ..utils import statsmodels_var_regex, fix_names
+from clarite.internal.utilities import _get_dtypes, _remove_empty_categories
+from clarite.modules.survey import SurveyDesignSpec, SurveyModel
+
+from ..utils import fix_names, statsmodels_var_regex
+from .glm_regression import GLMRegression
+
+# GITHUB ISSUE #119: Regressions raise an error with multiprocessing on Python > 3.8
+multiprocessing.get_start_method("fork")
 
 
 class WeightedGLMRegression(GLMRegression):
@@ -385,7 +389,9 @@ def _run_weighted_rv(
                 ~data[[rv, outcome_variable] + covariates].isna().any(axis=1)
             )
             # If allowed (an error hasn't been raised) negate missing_weight_mask so True=keep to drop those
-            complete_case_mask = complete_case_mask & ~missing_weight_mask
+            # GITHUB ISSUE #117: Type error in weighted regression with clusters (missing_weight_mask can be None)
+            if missing_weight_mask is not None:
+                complete_case_mask = complete_case_mask & ~missing_weight_mask
 
             # Count restricted rows
             restricted_rows = survey_design_spec.subset_array & complete_case_mask

diff --git a/clarite/modules/survey/survey_design.py b/clarite/modules/survey/survey_design.py
@@ -1,4 +1,4 @@
-from typing import Optional, Union, Dict, Tuple
+from typing import Dict, Optional, Tuple, Union
 
 import click
 import numpy as np
@@ -605,6 +605,12 @@ def get_survey_design(self, regression_variable, complete_case_idx):
             self.cluster_values.loc[self.subset_array],
         )
         has_weights, weight_name, weight_values = self.get_weights(regression_variable)
+        # GITHUB ISSUE #118: Function self.get_weights(regression_variable) returns None
+        if not has_weights:
+            has_weights, weight_values = (
+                False,
+                self.weight_values.loc[self.subset_array],
+            )
 
         # Filter out any incomplete cases
         strata_values = strata_values.loc[complete_case_idx]

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "clarite"
-version = "2.3.0"
+version = "2.3.1"
 description = "CLeaning to Analysis: Reproducibility-based Interface for Traits and Exposures"
 authors = ["Andre Rico <alr6366@psu.edu>"]
 license = "BSD-3-Clause"

diff --git a/tests/analyze/test_association_study.py b/tests/analyze/test_association_study.py
@@ -996,22 +996,13 @@ def test_edge_encondig_logistic_regression():
         interaction=0,
         random_seed=2021,
     )
-    test = sim.BAMS.from_model(
-        eff1=sim.SNPEffectEncodings.ADDITIVE,
-        eff2=sim.SNPEffectEncodings.ADDITIVE,
-        penetrance_base=0.45,
-        main1=1,
-        main2=0,
-        interaction=0,
-    )
     train_add = train.generate_case_control(n_cases=5000, n_controls=5000)
-    test_add = test.generate_case_control(n_cases=5000, n_controls=5000)
     edge_weights = train_add.genomics.calculate_edge_encoding_values(
         data=train_add["Outcome"], outcome_variable="Outcome"
     )
 
     edge_results = clarite.analyze.association_study(
-        data=test_add,
+        data=train_add,
         outcomes="Outcome",
         encoding="edge",
         edge_encoding_info=edge_weights,

diff --git a/tests/analyze/test_gwas.py b/tests/analyze/test_gwas.py
@@ -1,7 +1,6 @@
-import pytest
-
-import pandas as pd
 import numpy as np
+import pandas as pd
+import pytest
 
 import clarite
 from clarite.modules.survey import SurveyDesignSpec
@@ -30,7 +29,7 @@ def test_bams_interaction(genotype_case_control_rec_rec_onlyinteraction):
     assert result_interaction.loc[("SNP1", "SNP2", "Outcome"), "LRT_pvalue"] <= 1e-5
 
 
-@pytest.mark.slow
+# @pytest.mark.slow
 @pytest.mark.parametrize("process_num", [None, 1])
 def test_largeish_gwas(large_gwas_data, process_num):
     """10k samples with 1000 SNPs"""
@@ -52,6 +51,8 @@ def test_largeish_gwas(large_gwas_data, process_num):
             weights="weights",
         ),
     )
+    assert results == results
+    assert results_weighted == results_weighted
     # TODO: Add useful asserts rather than just making sure it runs