From fbfb01672fdac7c64be7e8ba73fc2bd7bef8ec9a Mon Sep 17 00:00:00 2001 From: John McGuigan Date: Fri, 16 Oct 2020 13:54:18 -0400 Subject: [PATCH] Make R-based regression use the 'drop_unweighted' parameter and update the weight name the same way --- .../analyze/regression/r_code/ewas_r.R | 27 ++++++++++++------- .../analyze/regression/r_survey_regression.py | 1 + .../regression/weighted_glm_regression.py | 1 + 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/clarite/modules/analyze/regression/r_code/ewas_r.R b/clarite/modules/analyze/regression/r_code/ewas_r.R index 3afa513..f95070a 100644 --- a/clarite/modules/analyze/regression/r_code/ewas_r.R +++ b/clarite/modules/analyze/regression/r_code/ewas_r.R @@ -228,7 +228,7 @@ regress_cat_survey <- function(data, varying_covariates, phenotype, var_name, re # General Regression function which applies some filters/tests before calling the actual regression regress <- function(data, y, var_name, covariates, min_n, allowed_nonvarying, regression_family, var_type, - use_survey, single_weight, weights, strata, fpc, ids, subset_array, ...){ + use_survey, single_weight, weights, strata, fpc, ids, subset_array, drop_unweighted, ...){ # The result list will be used to update results for this variable result = list() @@ -263,19 +263,26 @@ regress <- function(data, y, var_name, covariates, min_n, allowed_nonvarying, re } # Get weight values, returning early if there is a problem with the weight if(!(weight %in% names(data))){ + # Weight values are missing warning(paste(var_name, " had a NULL result because its weight (", weight, ") was not found")) result$weight <- paste(weight, " (not found)") return(data.frame(result, stringsAsFactors = FALSE)) - } else if(sum(!is.na(data[var_name]) & is.na(data[weight])) > 0){ - warning(paste(var_name, " had a NULL result because its weight (", weight, ") had ", sum(is.na(data[weight])), " missing values when the variable was not missing")) - result$weight <- paste(weight, " (missing values)") - return(data.frame(result, stringsAsFactors = FALSE)) - } else { - # Get weights - weight_values <- data[weight] - # Fill NA weight values with 0 to pass an internal check by survey - weight_values[is.na(weight_values),] <- 0 } + missing_weight_count <- sum(!is.na(data[var_name]) & is.na(data[weight]) & subset_data) + if(missing_weight_count > 0){ + # Some weights in the subset are missing when the variable is not + warning(paste(var_name, " had a NULL result because its weight (", weight, ") had ", missing_weight_count, " missing values when the variable was not missing")) + result$weight <- paste0(weight, " (", missing_weight_count, " observations are missing weights)") + if (!drop_unweighted){ + # Return early with no result if dropping unweighted was not enabled + return(data.frame(result, stringsAsFactors = FALSE)) + } + } + # Get weights + weight_values <- data[weight] + # Fill NA weight values with 0 to pass an internal check by survey + weight_values[is.na(weight_values),] <- 0 + # Load strata, fpc, and ids if(!is.null(strata)){ strata_values <- data[strata] diff --git a/clarite/modules/analyze/regression/r_survey_regression.py b/clarite/modules/analyze/regression/r_survey_regression.py index 13b149a..2081a70 100644 --- a/clarite/modules/analyze/regression/r_survey_regression.py +++ b/clarite/modules/analyze/regression/r_survey_regression.py @@ -198,6 +198,7 @@ def run(self): min_n=self.min_n, weights=weights, subset=self.survey_design_spec.subset_array, + drop_unweighted=self.survey_design_spec.drop_unweighted, **kwargs) result = ewasresult2py(result) diff --git a/clarite/modules/analyze/regression/weighted_glm_regression.py b/clarite/modules/analyze/regression/weighted_glm_regression.py index 818b448..b3f3ac9 100644 --- a/clarite/modules/analyze/regression/weighted_glm_regression.py +++ b/clarite/modules/analyze/regression/weighted_glm_regression.py @@ -184,6 +184,7 @@ def run(self): weight_name, missing_weight_mask, warning = self.survey_design_spec.check_missing_weights(data, rv) if warning is not None: self.warnings[rv].append(warning) + self.results[rv]["Weight"] = weight_name # Get complete case mask complete_case_mask = self.get_complete_case_mask(data, rv) # Complete cases