Skip to content

Commit

Permalink
Version 2.3.5
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreRicoPSU committed Oct 10, 2023
1 parent 817ccad commit a493334
Show file tree
Hide file tree
Showing 28 changed files with 18,617 additions and 219 deletions.
4 changes: 1 addition & 3 deletions clarite/cli/commands/analyze.py
Expand Up @@ -319,8 +319,6 @@ def get_significant(ewas_result, output, use_fdr, pvalue):
else:
col = "pvalue_bonferroni"
_, data = ewas_result
data = data.loc[
data[col] <= pvalue,
]
data = data.loc[data[col] <= pvalue,]
# Save result
save_clarite_ewas(data, output)
4 changes: 1 addition & 3 deletions clarite/cli/commands/describe.py
Expand Up @@ -38,9 +38,7 @@ def freq_table(data, output):
# Save results
results.to_csv(output, sep="\t", index=False)
# Log
processed = results.loc[
results["value"] != "<Non-Categorical Values>",
]
processed = results.loc[results["value"] != "<Non-Categorical Values>",]
if len(processed) > 0:
num_values = processed[["Variable", "value"]].nunique()
num_variables = processed["Variable"].nunique()
Expand Down
20 changes: 5 additions & 15 deletions clarite/cli/commands/modify.py
Expand Up @@ -242,25 +242,15 @@ def rowfilter(data, output, column, vs, vi, vf, comparison):
value = values[0]
# Filter
if comparison == "lt":
data.df = data.df.loc[
data.df[column] < value,
]
data.df = data.df.loc[data.df[column] < value,]
elif comparison == "lte":
data.df = data.df.loc[
data.df[column] <= value,
]
data.df = data.df.loc[data.df[column] <= value,]
elif comparison == "eq":
data.df = data.df.loc[
data.df[column] == value,
]
data.df = data.df.loc[data.df[column] == value,]
elif comparison == "gt":
data.df = data.df.loc[
data.df[column] >= value,
]
data.df = data.df.loc[data.df[column] >= value,]
elif comparison == "gte":
data.df = data.df.loc[
data.df[column] > value,
]
data.df = data.df.loc[data.df[column] > value,]
# Save
save_clarite_data(data, output)

Expand Down
6 changes: 3 additions & 3 deletions clarite/cli/commands/plot.py
Expand Up @@ -116,7 +116,7 @@ def manhattan(ewas_result, output, categories, bonferroni, fdr, other, nlabeled,
# Load data
name, data = ewas_result
data_dict = {name: data}
for (name, data) in other:
for name, data in other:
data_dict[name] = data
# Load categories, if any
if categories is not None:
Expand Down Expand Up @@ -187,7 +187,7 @@ def manhattan_bonferroni(
# Load data
name, data = ewas_result
data_dict = {name: data}
for (name, data) in other:
for name, data in other:
data_dict[name] = data
# Load categories, if any
if categories is not None:
Expand Down Expand Up @@ -255,7 +255,7 @@ def manhattan_fdr(ewas_result, output, categories, cutoff, other, nlabeled, labe
# Load data
name, data = ewas_result
data_dict = {name: data}
for (name, data) in other:
for name, data in other:
data_dict[name] = data
# Load categories, if any
if categories is not None:
Expand Down
1 change: 1 addition & 0 deletions clarite/internal/utilities.py
Expand Up @@ -25,6 +25,7 @@ def wrapped(*args, **kwargs):

def requires(package_name):
"""Decorator factory to ensure optional packages are imported before running"""

# Define and return an appropriate decorator
def decorator(func):
# Check if package is importable
Expand Down
17 changes: 10 additions & 7 deletions clarite/modules/analyze/interaction_study.py
@@ -1,7 +1,7 @@
from typing import List, Optional, Tuple, Union

import pandas as pd
import click
import pandas as pd
from pandas_genomics import GenotypeDtype

from .regression import InteractionRegression
Expand Down Expand Up @@ -45,9 +45,9 @@ def interaction_study(
If edge encoding is used, this must be provided. See Pandas-Genomics documentation on edge encoding.
report_betas: boolean
False by default.
If True, the results will contain one row for each interaction term and will include the beta value,
standard error (SE), and beta pvalue for that specific interaction. The number of terms increases with
the number of categories in each interacting variable.
If True, the results will contain one row for each interaction term and will include the beta value,
standard error (SE), and beta pvalue for that specific interaction. The number of terms increases with
the number of categories in each interacting variable.
min_n: int or None
Minimum number of complete-case observations (no NA values for outcome, covariates, or variable)
Defaults to 200
Expand Down Expand Up @@ -119,7 +119,7 @@ def interaction_study(
result = regression.get_results()

# Process Results
click.echo(f"Completed Interaction Study for {outcome}\n", color="green")
click.echo(f"Completed Interaction Study for {outcome}\n", color=True)
results.append(result)

if len(outcomes) == 1:
Expand All @@ -128,7 +128,10 @@ def interaction_study(
result = pd.concat(results)

# Sort across multiple outcomes
result = result.sort_values(["LRT_pvalue", "Beta_pvalue"])
if report_betas:
result = result.sort_values(["LRT_pvalue", "Full_Var1_Var2_Pval"])
else:
result = result.sort_values(["LRT_pvalue"])

click.echo("Completed association study", color="green")
click.echo("Completed association study", color=True)
return result
14 changes: 10 additions & 4 deletions clarite/modules/analyze/regression/glm_regression.py
@@ -1,7 +1,7 @@
import multiprocessing
import re
from itertools import repeat
from typing import Dict, List, Optional, Tuple
from typing import Dict, Generator, List, Optional, Tuple

import click
import numpy as np
Expand Down Expand Up @@ -308,8 +308,14 @@ def _run_binary(data, regression_variable, formula, family, use_t) -> Dict:

@staticmethod
def _run_categorical(
data, formula, formula_restricted, family, use_t, report_categorical_betas
) -> Dict:
data,
formula,
formula_restricted,
family,
use_t,
report_categorical_betas
# ) -> Dict:
) -> Generator[dict, None, None]:
# Regress both models
y, X = patsy.dmatrices(formula, data, return_type="dataframe", NA_action="drop")
y = fix_names(y)
Expand Down Expand Up @@ -555,4 +561,4 @@ def _run_rv(
if result is None:
result_list = [cls.get_default_result_dict(rv)]

return result_list, warnings_list, error
return result_list, warnings_list, error # type: ignore
139 changes: 105 additions & 34 deletions clarite/modules/analyze/regression/interaction_regression.py
@@ -1,6 +1,6 @@
import multiprocessing
from itertools import combinations, repeat
from typing import Dict, List, Optional, Tuple
from typing import Dict, Generator, List, Optional, Tuple

import click
import numpy as np
Expand All @@ -16,7 +16,7 @@
from . import GLMRegression

# GITHUB ISSUE #119: Regressions with Error after Multiprocessing release python > 3.8
multiprocessing.get_start_method("fork")
# multiprocessing.get_start_method("fork")


class InteractionRegression(GLMRegression):
Expand Down Expand Up @@ -48,8 +48,8 @@ class InteractionRegression(GLMRegression):
List of tuples: Test specific interactions of valid variables
report_betas: boolean
False by default.
If True, the results will contain one row for each interaction term and will include the beta value
for that term. The number of terms increases with the number of categories in each interacting term.
If True, the results will contain one row for each interaction term and will include the beta value
for that term. The number of terms increases with the number of categories in each interacting term.
encoding: str, default "additive"
Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'weighted'}
edge_encoding_info: Optional pd.DataFrame, default None
Expand Down Expand Up @@ -109,7 +109,7 @@ def _process_interactions(self, interactions):
)
if interactions is None:
self.interactions = [c for c in combinations(regression_var_list, r=2)]
elif type(interactions) == str:
elif type(interactions) is str:
if interactions not in regression_var_list:
raise ValueError(
f"'{interactions}' was passed as the value for 'interactions' "
Expand Down Expand Up @@ -140,16 +140,30 @@ def _process_interactions(self, interactions):
self.description += f"\nProcessing {len(self.interactions):,} interactions"

@staticmethod
def _get_default_result_dict(i1, i2):
def _get_default_result_dict(i1, i2, outcome_variable):
return {
"Outcome": outcome_variable,
"Term1": i1,
"Term2": i2,
"Parameter": str(i1 + ":" + i2),
"Converged": False,
"N": np.nan,
"Beta": np.nan,
"SE": np.nan,
"Beta_pvalue": np.nan,
"LRT_pvalue": np.nan,
"Red_Var1_beta": np.nan,
"Red_Var1_SE": np.nan,
"Red_Var1_Pval": np.nan,
"Red_Var2_beta": np.nan,
"Red_Var2_SE": np.nan,
"Red_Var2_Pval": np.nan,
"Full_Var1_Var2_beta": np.nan,
"Full_Var1_Var2_SE": np.nan,
"Full_Var1_Var2_Pval": np.nan,
"Full_Var1_beta": np.nan,
"Full_Var1_SE": np.nan,
"Full_Var1_Pval": np.nan,
"Full_Var2_beta": np.nan,
"Full_Var2_SE": np.nan,
"Full_Var2_Pval": np.nan,
}

def get_results(self) -> pd.DataFrame:
Expand All @@ -169,17 +183,18 @@ def get_results(self) -> pd.DataFrame:
result["Outcome"] = self.outcome_variable
if self.report_betas:
return result.set_index(
["Term1", "Term2", "Outcome", "Parameter"]
).sort_values(["LRT_pvalue", "Beta_pvalue"])
# ["Term1", "Term2", "Outcome", "Parameter"]
["Term1", "Term2", "Outcome"]
).sort_values(["LRT_pvalue", "Full_Var1_Var2_Pval"])
else:
return result.set_index(["Term1", "Term2", "Outcome"]).sort_values(
["LRT_pvalue"]
)

@staticmethod
def _run_interaction_regression(
data, formula, formula_restricted, family, use_t, report_betas
) -> Dict:
data, formula, formula_restricted, family, use_t, report_betas, i1, i2
) -> Generator[Dict, None, None]:
# Regress Full Model
y, X = patsy.dmatrices(formula, data, return_type="dataframe", NA_action="drop")
y = fix_names(y)
Expand All @@ -201,25 +216,73 @@ def _run_interaction_regression(
lrdf = est_restricted.df_resid - est.df_resid
lrstat = -2 * (est_restricted.llf - est.llf)
lr_pvalue = scipy.stats.chi2.sf(lrstat, lrdf)
if report_betas:
# Get beta, SE, and pvalue from interaction terms
# Where interaction terms are those appearing in the full model and not in the reduced model
# Return all terms
param_names = set(est.bse.index) - set(est_restricted.bse.index)
# The restricted model shouldn't have extra terms, unless there is some case we have overlooked
assert len(set(est_restricted.bse.index) - set(est.bse.index)) == 0
for param_name in param_names:
yield {
"Converged": True,
"Parameter": param_name,
"Beta": est.params[param_name],
"SE": est.bse[param_name],
"Beta_pvalue": est.pvalues[param_name],
"LRT_pvalue": lr_pvalue,
}
# GITHUB/ISSUES 121: Handling LRT_Pvalue when lrstat and lrdf are
# both 0. When lrstat (the test statistic) and lrdf (degrees of
# freedom for the Likelihood Ratio Test) are both 0, it typically
# suggests that both models are equivalent in terms of fit. In
# other words, there is no significant difference between the two
# models.
#
# However when both lrstat and lrdf are 0, calc the survival
# function (sf) of a chi-squared distribution with 0 degrees of
# freedom results in NaN. This is because mathematically, it's
# undefined to perform this calculation under these circumstances.
#
# In such cases, it's important to handle this scenario separately
# in the result based on the specific requirements of the analysis
if lrdf == 0 and lrstat == 0:
# Both models are equal
yield {"Converged": False, "LRT_pvalue": lr_pvalue}
if np.isnan(lr_pvalue):
# There is an issue with the LRT calculation
yield {"Converged": False, "LRT_pvalue": lr_pvalue}
else:
# Only return the LRT result
yield {"Converged": True, "LRT_pvalue": lr_pvalue}
if report_betas:
# Get beta, SE, and pvalue from interaction terms
            # Where interaction terms are those appearing in the full
            # model and not in the reduced model; return all such terms
param_names = set(est.bse.index) - set(est_restricted.bse.index)
# The restricted model shouldn't have extra terms, unless
# there is some case we have overlooked.
assert len(set(est_restricted.bse.index) - set(est.bse.index)) == 0
# GITHUB/ISSUES 122: Open to show Terms Betas Values
for param_name in param_names:
                # Names defined to align with PLATO
# Split the input_string by ":"
term_1, term_2 = param_name.split(":")
yield {
"Term1": term_1,
"Term2": term_2,
"Converged": True,
"Parameter": param_name,
# Betas in Reduced Model
# Var1 --> Term 1
"Red_Var1_beta": est_restricted.params[term_1],
"Red_Var1_SE": est_restricted.bse[term_1],
"Red_Var1_Pval": est_restricted.pvalues[term_1],
# Var2 --> Term 2
"Red_Var2_beta": est_restricted.params[term_2],
"Red_Var2_SE": est_restricted.bse[term_2],
"Red_Var2_Pval": est_restricted.pvalues[term_2],
# Betas in Full Model
# Var1 --> Term 1
"Full_Var1_Var2_beta": est.params[param_name],
"Full_Var1_Var2_SE": est.bse[param_name],
"Full_Var1_Var2_Pval": est.pvalues[param_name],
# Var1 --> Term 1
"Full_Var1_beta": est.params[term_1],
"Full_Var1_SE": est.bse[term_1],
"Full_Var1_Pval": est.pvalues[term_1],
# Var2 --> Term 2
"Full_Var2_beta": est.params[term_2],
"Full_Var2_SE": est.bse[term_2],
"Full_Var2_Pval": est.pvalues[term_2],
"LRT_pvalue": lr_pvalue,
}
else:
# Only return the LRT result
yield {"Converged": True, "LRT_pvalue": lr_pvalue}

else:
# Did not converge - nothing to update
yield dict()
Expand Down Expand Up @@ -394,16 +457,24 @@ def _run_interaction(

# Run Regression LRT Test
for regression_result in cls._run_interaction_regression(
data, formula, formula_restricted, family, use_t, report_betas
data,
formula,
formula_restricted,
family,
use_t,
report_betas,
i1,
i2,
):
result = cls._get_default_result_dict(i1, i2)
result = cls._get_default_result_dict(i1, i2, outcome_variable)
result["N"] = N
# TODO:
result.update(regression_result)
result_list.append(result)

except Exception as e:
error = str(e)
if result is None:
result_list = [cls._get_default_result_dict(i1, i2)]
result_list = [cls._get_default_result_dict(i1, i2, outcome_variable)]

return result_list, warnings_list, error
Expand Up @@ -368,7 +368,6 @@ def _run_weighted_rv(
use_t: bool,
report_categorical_betas: bool,
) -> Tuple[List[dict], List[str], str]: # results, warnings, errors

# Initialize return values
result_list = []
warnings_list = []
Expand Down
4 changes: 1 addition & 3 deletions clarite/modules/describe.py
Expand Up @@ -66,9 +66,7 @@ def correlations(data: pd.DataFrame, threshold: float = 0.75):
.reset_index()
)
# Remove those with correlation below threshold
correlation = correlation.loc[
correlation["correlation"].abs() >= threshold,
]
correlation = correlation.loc[correlation["correlation"].abs() >= threshold,]
# Sort by absolute value
correlation = correlation.reindex(
correlation["correlation"].abs().sort_values(ascending=False).index
Expand Down

0 comments on commit a493334

Please sign in to comment.