Skip to content

Commit

Permalink
Version 2.3.5
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreRicoPSU committed Oct 10, 2023
1 parent 817ccad commit a493334
Show file tree
Hide file tree
Showing 28 changed files with 18,617 additions and 219 deletions.
4 changes: 1 addition & 3 deletions clarite/cli/commands/analyze.py
Expand Up @@ -319,8 +319,6 @@ def get_significant(ewas_result, output, use_fdr, pvalue):
else:
col = "pvalue_bonferroni"
_, data = ewas_result
data = data.loc[
data[col] <= pvalue,
]
data = data.loc[data[col] <= pvalue,]
# Save result
save_clarite_ewas(data, output)
4 changes: 1 addition & 3 deletions clarite/cli/commands/describe.py
Expand Up @@ -38,9 +38,7 @@ def freq_table(data, output):
# Save results
results.to_csv(output, sep="\t", index=False)
# Log
processed = results.loc[
results["value"] != "<Non-Categorical Values>",
]
processed = results.loc[results["value"] != "<Non-Categorical Values>",]
if len(processed) > 0:
num_values = processed[["Variable", "value"]].nunique()
num_variables = processed["Variable"].nunique()
Expand Down
20 changes: 5 additions & 15 deletions clarite/cli/commands/modify.py
Expand Up @@ -242,25 +242,15 @@ def rowfilter(data, output, column, vs, vi, vf, comparison):
value = values[0]
# Filter
if comparison == "lt":
data.df = data.df.loc[
data.df[column] < value,
]
data.df = data.df.loc[data.df[column] < value,]
elif comparison == "lte":
data.df = data.df.loc[
data.df[column] <= value,
]
data.df = data.df.loc[data.df[column] <= value,]
elif comparison == "eq":
data.df = data.df.loc[
data.df[column] == value,
]
data.df = data.df.loc[data.df[column] == value,]
elif comparison == "gt":
data.df = data.df.loc[
data.df[column] >= value,
]
data.df = data.df.loc[data.df[column] >= value,]
elif comparison == "gte":
data.df = data.df.loc[
data.df[column] > value,
]
data.df = data.df.loc[data.df[column] > value,]
# Save
save_clarite_data(data, output)

Expand Down
6 changes: 3 additions & 3 deletions clarite/cli/commands/plot.py
Expand Up @@ -116,7 +116,7 @@ def manhattan(ewas_result, output, categories, bonferroni, fdr, other, nlabeled,
# Load data
name, data = ewas_result
data_dict = {name: data}
for (name, data) in other:
for name, data in other:
data_dict[name] = data
# Load categories, if any
if categories is not None:
Expand Down Expand Up @@ -187,7 +187,7 @@ def manhattan_bonferroni(
# Load data
name, data = ewas_result
data_dict = {name: data}
for (name, data) in other:
for name, data in other:
data_dict[name] = data
# Load categories, if any
if categories is not None:
Expand Down Expand Up @@ -255,7 +255,7 @@ def manhattan_fdr(ewas_result, output, categories, cutoff, other, nlabeled, labe
# Load data
name, data = ewas_result
data_dict = {name: data}
for (name, data) in other:
for name, data in other:
data_dict[name] = data
# Load categories, if any
if categories is not None:
Expand Down
1 change: 1 addition & 0 deletions clarite/internal/utilities.py
Expand Up @@ -25,6 +25,7 @@ def wrapped(*args, **kwargs):

def requires(package_name):
"""Decorator factory to ensure optional packages are imported before running"""

# Define and return an appropriate decorator
def decorator(func):
# Check if package is importable
Expand Down
17 changes: 10 additions & 7 deletions clarite/modules/analyze/interaction_study.py
@@ -1,7 +1,7 @@
from typing import List, Optional, Tuple, Union

import pandas as pd
import click
import pandas as pd
from pandas_genomics import GenotypeDtype

from .regression import InteractionRegression
Expand Down Expand Up @@ -45,9 +45,9 @@ def interaction_study(
If edge encoding is used, this must be provided. See Pandas-Genomics documentation on edge encoding.
report_betas: boolean
False by default.
If True, the results will contain one row for each interaction term and will include the beta value,
standard error (SE), and beta pvalue for that specific interaction. The number of terms increases with
the number of categories in each interacting variable.
If True, the results will contain one row for each interaction term and will include the beta value,
standard error (SE), and beta pvalue for that specific interaction. The number of terms increases with
the number of categories in each interacting variable.
min_n: int or None
Minimum number of complete-case observations (no NA values for outcome, covariates, or variable)
Defaults to 200
Expand Down Expand Up @@ -119,7 +119,7 @@ def interaction_study(
result = regression.get_results()

# Process Results
click.echo(f"Completed Interaction Study for {outcome}\n", color="green")
click.echo(f"Completed Interaction Study for {outcome}\n", color=True)
results.append(result)

if len(outcomes) == 1:
Expand All @@ -128,7 +128,10 @@ def interaction_study(
result = pd.concat(results)

# Sort across multiple outcomes
result = result.sort_values(["LRT_pvalue", "Beta_pvalue"])
if report_betas:
result = result.sort_values(["LRT_pvalue", "Full_Var1_Var2_Pval"])
else:
result = result.sort_values(["LRT_pvalue"])

click.echo("Completed association study", color="green")
click.echo("Completed association study", color=True)
return result
14 changes: 10 additions & 4 deletions clarite/modules/analyze/regression/glm_regression.py
@@ -1,7 +1,7 @@
import multiprocessing
import re
from itertools import repeat
from typing import Dict, List, Optional, Tuple
from typing import Dict, Generator, List, Optional, Tuple

import click
import numpy as np
Expand Down Expand Up @@ -308,8 +308,14 @@ def _run_binary(data, regression_variable, formula, family, use_t) -> Dict:

@staticmethod
def _run_categorical(
data, formula, formula_restricted, family, use_t, report_categorical_betas
) -> Dict:
data,
formula,
formula_restricted,
family,
use_t,
report_categorical_betas
# ) -> Dict:
) -> Generator[dict, None, None]:
# Regress both models
y, X = patsy.dmatrices(formula, data, return_type="dataframe", NA_action="drop")
y = fix_names(y)
Expand Down Expand Up @@ -555,4 +561,4 @@ def _run_rv(
if result is None:
result_list = [cls.get_default_result_dict(rv)]

return result_list, warnings_list, error
return result_list, warnings_list, error # type: ignore
139 changes: 105 additions & 34 deletions clarite/modules/analyze/regression/interaction_regression.py
@@ -1,6 +1,6 @@
import multiprocessing
from itertools import combinations, repeat
from typing import Dict, List, Optional, Tuple
from typing import Dict, Generator, List, Optional, Tuple

import click
import numpy as np
Expand All @@ -16,7 +16,7 @@
from . import GLMRegression

# GITHUB ISSUE #119: Regressions with Error after Multiprocessing release python > 3.8
multiprocessing.get_start_method("fork")
# multiprocessing.get_start_method("fork")


class InteractionRegression(GLMRegression):
Expand Down Expand Up @@ -48,8 +48,8 @@ class InteractionRegression(GLMRegression):
List of tuples: Test specific interactions of valid variables
report_betas: boolean
False by default.
If True, the results will contain one row for each interaction term and will include the beta value
for that term. The number of terms increases with the number of categories in each interacting term.
If True, the results will contain one row for each interaction term and will include the beta value
for that term. The number of terms increases with the number of categories in each interacting term.
encoding: str, default "additive"
Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'weighted'}
edge_encoding_info: Optional pd.DataFrame, default None
Expand Down Expand Up @@ -109,7 +109,7 @@ def _process_interactions(self, interactions):
)
if interactions is None:
self.interactions = [c for c in combinations(regression_var_list, r=2)]
elif type(interactions) == str:
elif type(interactions) is str:
if interactions not in regression_var_list:
raise ValueError(
f"'{interactions}' was passed as the value for 'interactions' "
Expand Down Expand Up @@ -140,16 +140,30 @@ def _process_interactions(self, interactions):
self.description += f"\nProcessing {len(self.interactions):,} interactions"

@staticmethod
def _get_default_result_dict(i1, i2):
def _get_default_result_dict(i1, i2, outcome_variable):
return {
"Outcome": outcome_variable,
"Term1": i1,
"Term2": i2,
"Parameter": str(i1 + ":" + i2),
"Converged": False,
"N": np.nan,
"Beta": np.nan,
"SE": np.nan,
"Beta_pvalue": np.nan,
"LRT_pvalue": np.nan,
"Red_Var1_beta": np.nan,
"Red_Var1_SE": np.nan,
"Red_Var1_Pval": np.nan,
"Red_Var2_beta": np.nan,
"Red_Var2_SE": np.nan,
"Red_Var2_Pval": np.nan,
"Full_Var1_Var2_beta": np.nan,
"Full_Var1_Var2_SE": np.nan,
"Full_Var1_Var2_Pval": np.nan,
"Full_Var1_beta": np.nan,
"Full_Var1_SE": np.nan,
"Full_Var1_Pval": np.nan,
"Full_Var2_beta": np.nan,
"Full_Var2_SE": np.nan,
"Full_Var2_Pval": np.nan,
}

def get_results(self) -> pd.DataFrame:
Expand All @@ -169,17 +183,18 @@ def get_results(self) -> pd.DataFrame:
result["Outcome"] = self.outcome_variable
if self.report_betas:
return result.set_index(
["Term1", "Term2", "Outcome", "Parameter"]
).sort_values(["LRT_pvalue", "Beta_pvalue"])
# ["Term1", "Term2", "Outcome", "Parameter"]
["Term1", "Term2", "Outcome"]
).sort_values(["LRT_pvalue", "Full_Var1_Var2_Pval"])
else:
return result.set_index(["Term1", "Term2", "Outcome"]).sort_values(
["LRT_pvalue"]
)

@staticmethod
def _run_interaction_regression(
data, formula, formula_restricted, family, use_t, report_betas
) -> Dict:
data, formula, formula_restricted, family, use_t, report_betas, i1, i2
) -> Generator[Dict, None, None]:
# Regress Full Model
y, X = patsy.dmatrices(formula, data, return_type="dataframe", NA_action="drop")
y = fix_names(y)
Expand All @@ -201,25 +216,73 @@ def _run_interaction_regression(
lrdf = est_restricted.df_resid - est.df_resid
lrstat = -2 * (est_restricted.llf - est.llf)
lr_pvalue = scipy.stats.chi2.sf(lrstat, lrdf)
if report_betas:
# Get beta, SE, and pvalue from interaction terms
# Where interaction terms are those appearing in the full model and not in the reduced model
# Return all terms
param_names = set(est.bse.index) - set(est_restricted.bse.index)
# The restricted model shouldn't have extra terms, unless there is some case we have overlooked
assert len(set(est_restricted.bse.index) - set(est.bse.index)) == 0
for param_name in param_names:
yield {
"Converged": True,
"Parameter": param_name,
"Beta": est.params[param_name],
"SE": est.bse[param_name],
"Beta_pvalue": est.pvalues[param_name],
"LRT_pvalue": lr_pvalue,
}
# GITHUB/ISSUES 121: Handling LRT_Pvalue when lrstat and lrdf are
# both 0. When lrstat (the test statistic) and lrdf (degrees of
# freedom for the Likelihood Ratio Test) are both 0, it typically
# suggests that both models are equivalent in terms of fit. In
# other words, there is no significant difference between the two
# models.
#
# However when both lrstat and lrdf are 0, calc the survival
# function (sf) of a chi-squared distribution with 0 degrees of
# freedom results in NaN. This is because mathematically, it's
# undefined to perform this calculation under these circumstances.
#
# In such cases, it's important to handle this scenario separately
# in the result based on the specific requirements of the analysis
if lrdf == 0 and lrstat == 0:
# Both models are equal
yield {"Converged": False, "LRT_pvalue": lr_pvalue}
if np.isnan(lr_pvalue):
# There is an issue with the LRT calculation
yield {"Converged": False, "LRT_pvalue": lr_pvalue}
else:
# Only return the LRT result
yield {"Converged": True, "LRT_pvalue": lr_pvalue}
if report_betas:
# Get beta, SE, and pvalue from interaction terms
            # Where interaction terms are those appearing in the full
            # model and not in the reduced model; return all such terms
param_names = set(est.bse.index) - set(est_restricted.bse.index)
# The restricted model shouldn't have extra terms, unless
# there is some case we have overlooked.
assert len(set(est_restricted.bse.index) - set(est.bse.index)) == 0
# GITHUB/ISSUES 122: Open to show Terms Betas Values
for param_name in param_names:
                # Names defined to align with PLATO
# Split the input_string by ":"
term_1, term_2 = param_name.split(":")
yield {
"Term1": term_1,
"Term2": term_2,
"Converged": True,
"Parameter": param_name,
# Betas in Reduced Model
# Var1 --> Term 1
"Red_Var1_beta": est_restricted.params[term_1],
"Red_Var1_SE": est_restricted.bse[term_1],
"Red_Var1_Pval": est_restricted.pvalues[term_1],
# Var2 --> Term 2
"Red_Var2_beta": est_restricted.params[term_2],
"Red_Var2_SE": est_restricted.bse[term_2],
"Red_Var2_Pval": est_restricted.pvalues[term_2],
# Betas in Full Model
# Var1 --> Term 1
"Full_Var1_Var2_beta": est.params[param_name],
"Full_Var1_Var2_SE": est.bse[param_name],
"Full_Var1_Var2_Pval": est.pvalues[param_name],
# Var1 --> Term 1
"Full_Var1_beta": est.params[term_1],
"Full_Var1_SE": est.bse[term_1],
"Full_Var1_Pval": est.pvalues[term_1],
# Var2 --> Term 2
"Full_Var2_beta": est.params[term_2],
"Full_Var2_SE": est.bse[term_2],
"Full_Var2_Pval": est.pvalues[term_2],
"LRT_pvalue": lr_pvalue,
}
else:
# Only return the LRT result
yield {"Converged": True, "LRT_pvalue": lr_pvalue}

else:
# Did not converge - nothing to update
yield dict()
Expand Down Expand Up @@ -394,16 +457,24 @@ def _run_interaction(

# Run Regression LRT Test
for regression_result in cls._run_interaction_regression(
data, formula, formula_restricted, family, use_t, report_betas
data,
formula,
formula_restricted,
family,
use_t,
report_betas,
i1,
i2,
):
result = cls._get_default_result_dict(i1, i2)
result = cls._get_default_result_dict(i1, i2, outcome_variable)
result["N"] = N
# TODO:
result.update(regression_result)
result_list.append(result)

except Exception as e:
error = str(e)
if result is None:
result_list = [cls._get_default_result_dict(i1, i2)]
result_list = [cls._get_default_result_dict(i1, i2, outcome_variable)]

return result_list, warnings_list, error
Expand Up @@ -368,7 +368,6 @@ def _run_weighted_rv(
use_t: bool,
report_categorical_betas: bool,
) -> Tuple[List[dict], List[str], str]: # results, warnings, errors

# Initialize return values
result_list = []
warnings_list = []
Expand Down
4 changes: 1 addition & 3 deletions clarite/modules/describe.py
Expand Up @@ -66,9 +66,7 @@ def correlations(data: pd.DataFrame, threshold: float = 0.75):
.reset_index()
)
# Remove those with correlation below threshold
correlation = correlation.loc[
correlation["correlation"].abs() >= threshold,
]
correlation = correlation.loc[correlation["correlation"].abs() >= threshold,]
# Sort by absolute value
correlation = correlation.reindex(
correlation["correlation"].abs().sort_values(ascending=False).index
Expand Down

0 comments on commit a493334

Please sign in to comment.