# Phase 5: QC NHANES Data

Documentation: https://halllab.atlassian.net/wiki/spaces/IGEM/pages/79233025/Phase+5+QC+NHANES+Data

In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from functools import reduce
from pathlib import Path
import matplotlib.pyplot as plt
import igem

rpy2 ModuleSpec(name='rpy2', loader=<_frozen_importlib_external.SourceFileLoader object at 0x3548b1f00>, origin='/Users/andrerico/Works/Projects/pbs_igem/.venv/lib/python3.10/site-packages/rpy2/__init__.py', submodule_search_locations=['/Users/andrerico/Works/Projects/pbs_igem/.venv/lib/python3.10/site-packages/rpy2'])


In [2]:
# Defining the path to the data folder
path = Path().resolve()
path_data = path / "data"

In [3]:
# Locations of the two inputs for this phase
nhanes_csv = str(path_data) + "/step_04_02_nhanes_data_with_medications.csv"
fields_csv = str(path_data) + "/step_02_07_Fields.csv"

# NHANES data with the normalized medication columns
df_nhanes = igem.epc.load.from_csv(nhanes_csv)

# Names of the fields that go through QC (column 'field_name' of the fields file)
df_fields = pd.read_csv(fields_csv)
ls_fields = df_fields['field_name'].tolist()

# Also keep the identifier, the cycle and the two normalized outcome fields
ls_fields += ['ID', 'Cycle', 'LBDLDL_N', 'LBXTC_N']
print(f"Fields to run QC: {len(ls_fields)} records")

Loaded 101,316 observations of 917 variables
Fields to run QC: 1140 records


  return clarite.load.from_csv(filename, index_col, **kwargs)


### STEP 05_00: Columns Check

In [4]:
# Keep only the QC fields that are actually present in the NHANES data
nhanes_columns = df_nhanes.columns
existing_fields = [field for field in ls_fields if field in nhanes_columns]

# Select those fields and drop the COLUMNS (axis=1) that are entirely NaN
df_nhanes = df_nhanes[existing_fields].dropna(axis=1, how='all')

# Keep the first occurrence of every repeated column name
is_repeated = df_nhanes.columns.duplicated()
df_nhanes = df_nhanes.loc[:, ~is_repeated]

Change dummy categories to NaN

In [5]:
# TODO: Check with Nikki

# Helper that turns placeholder codes of one column into NaN
def replace_values_with_nan(df, column_name, values_to_replace):
    """Replace the given values with NaN in a single column of ``df``.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned. When the column is missing a message is
    printed and ``df`` is returned untouched.

    Args:
        df: DataFrame to update.
        column_name: Name of the column to clean.
        values_to_replace: Values passed to ``Series.replace`` as the
            entries to turn into NaN.

    Returns:
        The same ``df`` object.
    """
    if column_name not in df.columns:
        print(f"Column '{column_name}' does not exist in the DataFrame.")
        return df

    cleaned = df[column_name].replace(values_to_replace, np.nan)
    df[column_name] = cleaned
    return df

In [6]:
# Replace the codes 77 and 99 in SLD010H with NaN
# (the "dummy categories" this section is about)
column_name = 'SLD010H'
values_to_replace = [77, 99]
# Apply the function
df_nhanes = replace_values_with_nan(df_nhanes, column_name, values_to_replace)

In [7]:
# Checking the unique values of the column
df_nhanes['SLD010H'].unique()

array([ 6., nan,  8.,  5.,  7.,  9., 11.,  4., 10.,  2., 12.,  3.,  1.])

### STEP 05_01: Filter Target Samples

In [8]:
# Split the data into two age groups (RIDAGEYR < 18 and RIDAGEYR >= 18).
# A row with a missing RIDAGEYR satisfies neither comparison, so it would end
# up in neither group; the printed total shows how many rows were kept.
df_nhanes_lt = df_nhanes.loc[df_nhanes["RIDAGEYR"] < 18]
df_nhanes_gt = df_nhanes.loc[df_nhanes["RIDAGEYR"] >= 18]

print(f'< 18: {len(df_nhanes_lt)}')
print(f'>= 18: {len(df_nhanes_gt)}')
print(f'Total: {len(df_nhanes_lt)+len(df_nhanes_gt)}')

< 18: 42112
>= 18: 59204
Total: 101316


In [9]:
# Drop the adult rows that have a missing value in any of the confounders
# (age, gender, race/ethnicity code RIDRETH1, BMI)
columns_to_check = ['RIDAGEYR', 'RIAGENDR', 'RIDRETH1', 'BMXBMI']
df_nhanes_gt = df_nhanes_gt.dropna(subset=columns_to_check)
print(f"After drop NA in all Cofounders: {len(df_nhanes_gt)} records")

After drop NA in all Cofounders: 55206 records


### STEP 05_02: Categorize Fields

In [10]:
# # Categories the columns types
df_nhanes_gt = igem.epc.modify.categorize(df_nhanes_gt)

Running categorize
--------------------------------------------------------------------------------
96 of 834 variables (11.51%) are classified as constant (1 unique value).
199 of 834 variables (23.86%) are classified as binary (2 unique values).
106 of 834 variables (12.71%) are classified as categorical (3 to 6 unique values).
342 of 834 variables (41.01%) are classified as continuous (>= 15 unique values).
58 of 834 variables (6.95%) were dropped.
	58 variables had zero unique values (all NA).
33 of 834 variables (3.96%) were not categorized and need to be set manually.
	30 variables had between 6 and 15 unique values
	3 variables had >= 15 values but couldn't be converted to continuous (numeric) values


In [11]:
# Columns that categorize() left as 'object' still need a type set by hand.
# Report them and export their values for inspection.
df_object_columns = df_nhanes_gt.select_dtypes(include=['object'])
ls_object_columns = df_object_columns.columns
object_columns_csv = str(path_data) + '/step_05_01_Object_Columns.csv'

print(f"Number of object columns:  {len(ls_object_columns)} \n")
print("The list of object columns: ")
print(f"{ls_object_columns} \n")

df_object_columns.to_csv(object_columns_csv)
print("One file was create with the Object columns: ")
print(object_columns_csv)

Number of object columns:  33 

The list of object columns: 
Index(['SMD100BR', 'SMQ143', 'SMQ078', 'SMQ621', 'SMQ660', 'SSDCP', 'LBXV1D',
       'LBXV2T', 'DBD195', 'DBQ197', 'DBD235A', 'DBD235AE', 'DBD235B',
       'DBD235C', 'DBD235CE', 'CBQ050', 'CBQ800', 'LBXBFOA', 'GTDSCMMN',
       'LBX10AL', 'URXPTU', 'LBDBGESI', 'SLD010H', 'SLQ300', 'SLQ320',
       'DSQTCAFF', 'DUQ272', 'DUQ352', 'SMQ750', 'ALQ142', 'SMD780', 'URXMET',
       'Cycle'],
      dtype='object') 

One file was create with the Object columns: 
/Users/andrerico/Works/Projects/pbs_igem/data/step_05_01_Object_Columns.csv


In [12]:
# Set the cycle column to an ORDERED categorical so that comparisons such as
# `df['Cycle'] <= '2007-2008'` (used by split_datasets) follow the order below
cycle_order = [
    '1999-2000',
    '2001-2002',
    '2003-2004',
    '2005-2006',
    '2007-2008',
    '2009-2010',
    '2011-2012',
    '2013-2014',
    '2015-2016',
    '2017-2018'
]
df_nhanes_gt['Cycle'] = pd.Categorical(df_nhanes_gt['Cycle'], categories=cycle_order, ordered=True)

# Manually set the type of the columns that categorize() left as 'object'.
# First the ones treated as categorical ...
df_nhanes_gt = igem.epc.modify.make_categorical(
    df_nhanes_gt,
    only=['SMQ143','SMQ078','SMQ621','SMQ660','DBD195','DBQ197','DBD235A','DBD235AE','DBD235B','DBD235C','DBD235CE','CBQ050','CBQ800','GTDSCMMN','SLD010H','DUQ272','DUQ352','SMQ750','ALQ142','SMD780'])

# ... then the ones treated as continuous
df_nhanes_gt = igem.epc.modify.make_continuous(
    df_nhanes_gt,
    only=['SSDCP','LBXV1D','LBXV2T','LBXBFOA','LBX10AL','URXPTU','LBDBGESI','DSQTCAFF','URXMET']
    )

# TODO: Check with Nikki
# SLD010H was put in the categorical list above but is handled as continuous:
# this call overrides the earlier make_categorical for that column.
# NOTE(review): once confirmed, SLD010H could simply move to the continuous list.
df_nhanes_gt = igem.epc.modify.make_continuous(
    df_nhanes_gt,
    only=['SLD010H']
    )

# Text fields that will be dropped (colfilter keeps every column except `skip`)
df_nhanes_gt = igem.epc.modify.colfilter(
    df_nhanes_gt,
    skip=['SLQ300','SLQ320','SMD100BR',]
    )

Running make_categorical
--------------------------------------------------------------------------------
Set 20 of 776 variable(s) as categorical, each with 55,206 observations
Running make_continuous
--------------------------------------------------------------------------------
Set 9 of 776 variable(s) as continuous, each with 55,206 observations
Running colfilter
--------------------------------------------------------------------------------
Keeping 773 of 776 variables:
	199 of 199 binary variables
	127 of 127 categorical variables
	351 of 351 continuous variables
	0 of 3 unknown variables


In [13]:
# check if exist any Object Data Type
ls_object_columns = df_nhanes_gt.select_dtypes(include=['object']).columns
print(
    f"Number of object columns:  {len(ls_object_columns)} \n" 
)

igem.epc.describe.summarize(df_nhanes_gt)

Number of object columns:  0 

55,206 observations of 773 variables
	199 Binary Variables
	127 Categorical Variables
	351 Continuous Variables
	0 Unknown-Type Variables



### STEP 05_03: Phenotypes and Exposures QC

STEP 05_03_01

In [14]:
# Define the groups
#  Outcomes   = Phenotypes
#  Covariates = Confounders
#  Variants   = Exposures
# TODO: Set this list in a parameter file
list_covariats = ['RIDAGEYR', 'RIAGENDR', 'RIDRETH1', 'BMXBMI', 'Cycle']
list_outcomes_wo_adj = ['LBDLDL', 'LBXTC']
list_outcomes = ['LBDLDL_N', 'LBXTC_N', 'LBXSTR', 'LBDHDL', 'LBXHDD', 'LBDHDD']

# Define the list of exposures: every remaining column that is neither a
# covariate nor an outcome (adjusted or unadjusted)
excluded_columns = set(list_covariats + list_outcomes + list_outcomes_wo_adj)
list_variants = [col for col in df_nhanes_gt.columns if col not in excluded_columns]

### STEP 05_04: Define Functions

STEP 05_04_01

In [15]:
def sync_columns(df1, df2):
    """Restrict two DataFrames to the columns they have in common.

    Used so that the discovery and replication datasets expose the same
    columns. After the restriction the column counts are compared; a
    mismatch is reported on stdout (nothing is raised) and signalled through
    the returned flag.

    Args:
        df1: First DataFrame (reported as "DF1").
        df2: Second DataFrame (reported as "DF2").

    Returns:
        Tuple ``(ok, df1, df2)``: ``ok`` is True when both restricted frames
        have the same number of columns, followed by the restricted frames.
    """
    shared = df1.columns.intersection(df2.columns)
    df1, df2 = df1[shared], df2[shared]

    n_df1, n_df2 = len(df1.columns), len(df2.columns)

    if n_df1 == n_df2:
        print(f"---> {n_df1} columns and {len(df1)} rows on DF1 dataset")
        print(f"---> {n_df2} columns and {len(df2)} rows on DF2 dataset")
        return True, df1, df2

    # Column counts differ: report the difference and flag the failure
    print(f"---> DF1 has {n_df1} columns and DF2 has {n_df2} columns")
    print("---> Columns in DF1 but not in DF2:")
    print(set(df1.columns) - set(df2.columns))
    print("---> Columns in DF2 but not in DF1:")
    print(set(df2.columns) - set(df1.columns))
    return False, df1, df2

STEP 05_04_02

In [17]:
class ColumnSyncError(Exception):
    """Raised when discovery and replication datasets cannot be aligned on columns."""


# Function to split datasets
def split_datasets(
        df,
        outcome,
        list_covariats,
        list_variants,
        split_col='Cycle',
        split_value='2007-2008',
        random_split=False,
        discovery_percent=0.7,
        random_state=42
        ):
    """Split ``df`` into a discovery and a replication dataset for one outcome.

    Only the outcome, covariate and variant columns are kept, and rows with a
    missing outcome are dropped. The rows are then split either by comparing
    ``split_col`` with ``split_value`` (rows ``<= split_value`` go to
    discovery, rows ``> split_value`` to replication) or randomly. Columns
    that are entirely NaN in a split are removed and both splits are reduced
    to their common columns through ``sync_columns``.

    Args:
        df: Source DataFrame.
        outcome: Name of the outcome column.
        list_covariats: Covariate column names.
        list_variants: Variant (exposure) column names.
        split_col: Column compared against ``split_value``.
        split_value: Highest value of ``split_col`` kept in discovery.
        random_split: When True, ignore ``split_col`` and split the shuffled
            rows by ``discovery_percent``.
        discovery_percent: Fraction of rows assigned to discovery in a
            random split.
        random_state: Seed for the shuffle of a random split (default 42,
            the value that used to be hard-coded).

    Returns:
        ``(df_discovery, df_replication)``, or ``(None, None)`` when the split
        fails. Failures (missing columns, an empty split, a column sync
        problem, ...) are printed as ``Error: ...`` instead of raised, so a
        caller looping over outcomes can skip the failing one.
    """
    try:
        # Filter the DataFrame to keep only the columns of interest
        df = df[[outcome] + list_covariats + list_variants].dropna(subset=[outcome])

        if random_split:
            # Shuffle, then cut at the requested discovery fraction
            df = df.sample(frac=1, random_state=random_state)
            discovery_size = int(discovery_percent * len(df))

            df_discovery = df.iloc[:discovery_size]
            df_replication = df.iloc[discovery_size:]
        else:
            # Split based on the split_value in split_col
            df_discovery = df[df[split_col] <= split_value]
            df_replication = df[df[split_col] > split_value]

        # A split without rows would lose every column in the dropna below
        # (an empty column counts as all-NaN) and then break the downstream
        # QC; report it here as a failed split instead.
        if df_discovery.empty or df_replication.empty:
            raise ValueError(
                f"Split for outcome '{outcome}' produced an empty dataset "
                f"(discovery: {len(df_discovery)} rows, "
                f"replication: {len(df_replication)} rows)."
            )

        # Drop columns with all NaN values in both DataFrames
        df_discovery = df_discovery.dropna(axis=1, how='all')
        df_replication = df_replication.dropna(axis=1, how='all')

        # Syncs for replication and discovery datasets to have the same columns
        check_integrid, df_discovery, df_replication = sync_columns(
            df_discovery,
            df_replication
        )

        if not check_integrid:
            raise ColumnSyncError("Column synchronization failed between discovery and replication datasets.")

        return df_discovery, df_replication

    except Exception as e:
        # Deliberately broad: any failure is reported and turned into
        # (None, None), which callers check for.
        print(f"Error: {e}")
        return None, None

STEP 05_04_03

In [18]:
# QC to Outcome Function
def outcome_qc(df, outcome, group, plot=True):
    """Run the outcome (phenotype) QC on a single column.

    Steps:
      1. Keep only the ``outcome`` column.
      2. Apply ``colfilter_percent_zero`` with ``filter_percent=90``. If the
         filter removes the outcome, return ``None``.
      3. Compute the skewness; when its absolute value is above 0.5 the
         outcome is log-transformed and, optionally, the distributions before
         and after the transformation are plotted side by side.

    Args:
        df: DataFrame that contains the ``outcome`` column.
        outcome: Name of the outcome column.
        group: Label used in the plot titles (e.g. 'discovery').
        plot: When True, show the histograms for a transformed outcome.

    Returns:
        A one-column DataFrame (log-transformed when the skewness required
        it), or ``None`` when the outcome was removed by the filter.

    Note:
        Each call decides on the transformation from its own data, so
        discovery and replication may be treated differently when their
        skewness falls on different sides of the threshold.
    """
    df = df[[outcome]]

    # Remove the outcome when it is zero in at least 90% of its non-NA values
    df = igem.epc.modify.colfilter_percent_zero(df, filter_percent=90)
    if outcome not in df.columns:
        # Nothing left to describe; callers check for None to skip the outcome
        print(f"---> {group}: {outcome} was removed by colfilter_percent_zero")
        return None

    # Get skew value
    skew_value = igem.epc.describe.skewness(df, dropna=True).loc[0, 'skew']

    # Outcomes that are not markedly skewed are returned as they are
    if abs(skew_value) <= 0.5:
        return df

    # Log transform the outcome since abs(skewness) is greater than 0.5
    df_log = igem.epc.modify.transform(df, 'log')
    # Run skewness again after log transf
    skew_value_log = igem.epc.describe.skewness(df_log, dropna=True).loc[0, 'skew']

    if plot:
        # Plot the outcome distribution before and after the transformation
        fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

        # Plot original data
        axes[0].hist(df[outcome].dropna(), bins=100, color='blue', alpha=0.7)
        axes[0].set_title(f"{group}: Skew to {outcome} = {skew_value:.6f}")
        axes[0].set_xlabel(outcome)
        axes[0].set_ylabel('Frequency')

        # Plot log-transformed data
        axes[1].hist(df_log[outcome].dropna(), bins=100, color='green', alpha=0.7)
        axes[1].set_title(f"{group}: Skew to {outcome} after log transf = {skew_value_log:.6f}")
        axes[1].set_xlabel(outcome)

        plt.tight_layout()
        plt.show()

    return df_log
    
    

STEP 05_04_04

In [19]:
# QC to Variant Function
def variant_qc(df, list_variants):
    """Run the exposure (variant) QC and return the surviving variant columns.

    The variants present in ``df`` are separated by dtype: 'category' columns
    go through ``colfilter_min_n`` (n=200) and ``colfilter_min_cat_n``
    (n=10); numeric columns go through ``colfilter_percent_zero``
    (filter_percent=90) and ``colfilter_min_n`` (n=200). Surviving
    categorical columns with more than two distinct non-null values are
    reported on stdout so they can be reviewed.

    Args:
        df: DataFrame with the variant columns.
        list_variants: Candidate variant names; names missing from ``df`` are
            ignored.

    Returns:
        DataFrame with the filtered categorical columns followed by the
        filtered continuous columns.
    """
    # Only the variants that exist in this DataFrame
    present = [name for name in list_variants if name in df.columns]
    df_variants = df[present]

    # Separate categorical and continuous variables by dtype
    category_names = df_variants.select_dtypes(include=['category']).columns
    numeric_names = df_variants.select_dtypes(include=['number']).columns
    df_categorical = df_variants[category_names]
    df_continuous = df_variants[numeric_names]

    ### QC in Categorical Exposures
    df_categorical = igem.epc.modify.colfilter_min_n(df_categorical, n=200)
    df_categorical = igem.epc.modify.colfilter_min_cat_n(df_categorical, n=10)

    # Report the surviving categorical fields that have more than 2 values
    for col in df_categorical.select_dtypes('category').columns:
        unique_values = df[col].dropna().unique()
        if len(unique_values) <= 2:
            continue
        print("----- ATTENTION -----")
        print(f"{col} is categorical. Check if it is binary")
        print(f"Values: {unique_values}\n")

    ### QC in Continuous Exposures
    df_continuous = igem.epc.modify.colfilter_percent_zero(df_continuous, filter_percent=90)
    df_continuous = igem.epc.modify.colfilter_min_n(df_continuous, n=200)

    # Combine filtered categorical and continuous columns
    df_variants = pd.concat([df_categorical, df_continuous], axis=1)
    igem.epc.describe.summarize(df_variants)

    return df_variants

STEP 05_04_05

In [19]:
# Merge helper used with functools.reduce
def merge_dfs(left, right):
    """Left-join ``right`` onto ``left`` using the 'ID' key.

    Every row of ``left`` is kept; columns of ``right`` are filled with NaN
    where ``right`` has no matching 'ID'.
    """
    merged = pd.merge(left, right, how='left', on='ID')
    return merged

### STEP 05_05: PROCESS QC by Phenotype

In [57]:
# Loop to process each Outcome
results = []

# Split rule of every outcome, passed as keyword arguments to split_datasets.
# For the cycle-based rules the split_value cycle itself stays in the
# discovery dataset (split_datasets selects discovery with `<=`).
# LBXHDD is split randomly (has just one cycle).
split_rules = {
    'LBDLDL_N': {'split_col': 'Cycle', 'split_value': '2007-2008'},
    'LBXTC_N': {'split_col': 'Cycle', 'split_value': '2007-2008'},
    'LBXSTR': {'split_col': 'Cycle', 'split_value': '2007-2008'},
    'LBDHDL': {'split_col': 'Cycle', 'split_value': '1999-2000'},
    'LBDHDD': {'split_col': 'Cycle', 'split_value': '2011-2012'},
    'LBXHDD': {'random_split': True, 'discovery_percent': 0.7},
}

for outcome in list_outcomes:

    print(f"Start QC to {outcome}")

    # Split Datasets - Discovery and Replication
    # -------------------------------------------------------------------------
    print("\n")
    print(f"----- Start Split the Datasets to {outcome} ------")

    split_kwargs = split_rules.get(outcome)
    if split_kwargs is None:
        # Without a rule the datasets of the previous outcome would be reused
        # (or be undefined on the first iteration), so skip explicitly.
        print(f"No split rule defined for outcome: {outcome}")
        continue

    df_discovery, df_replication = split_datasets(
        df_nhanes_gt,
        outcome,
        list_covariats,
        list_variants,
        **split_kwargs
        )

    # Skip the outcome if the split failed (split_datasets returns None, None)
    if df_discovery is None or df_replication is None:
        print(f"Failed to split datasets for outcome: {outcome}")
        continue


    # Apply the Outcome QC
    # -------------------------------------------------------------------------
    print("\n")
    print(f"----- Start Outcome QC to {outcome} ------")
    print("QC to Discovery Outcome Dataset")
    df_outcome_discovery = outcome_qc(
        df_discovery,
        outcome,
        group='discovery',
        plot=False
        )
    print("QC to Replicate Outcome Dataset")
    df_outcome_replication = outcome_qc(
        df_replication,
        outcome,
        group='replication',
        plot=False
        )

    # Check Outcome QC Results
    if df_outcome_discovery is None or df_outcome_replication is None:
        print(f"outcome {outcome} removed due to high missing values.")
        continue


    ### Apply the Exposure QC
    # -------------------------------------------------------------------------
    print("\n")
    print(f"----- Start Variants QC to {outcome} ------")
    print("QC to Discovery Variants Dataset")
    df_variants_discovery = variant_qc(
        df_discovery,
        list_variants,
        )
    print("QC to Replicate Variants Dataset")
    df_variants_replication = variant_qc(
        df_replication,
        list_variants,
        )

    # The QC may drop different variants in each dataset; keep the common ones
    check_integrid, df_variants_discovery, df_variants_replication = sync_columns(
        df_variants_discovery,
        df_variants_replication
        )

    if not check_integrid:
        raise ColumnSyncError("Column synchronization in variants failed between discovery and replication datasets.")

    print(f"End QC to {outcome}")

    # Apply the Covariates QC (no filtering, the columns are taken as they are)
    # -------------------------------------------------------------------------
    df_covariants_discovery = df_discovery[list_covariats]
    df_covariants_replication = df_replication[list_covariats]


    # Aggregation QC data
    # -------------------------------------------------------------------------

    # Merge Discovery Data
    dfs_to_merge = [
        df_outcome_discovery,
        df_covariants_discovery,
        df_variants_discovery
        ]
    df_qc_discovery = reduce(merge_dfs, dfs_to_merge)

    # Merge Replication Data
    dfs_to_merge_replication = [
        df_outcome_replication,
        df_covariants_replication,
        df_variants_replication
        ]
    df_qc_replication = reduce(merge_dfs, dfs_to_merge_replication)

    # Add group column
    df_qc_discovery['group'] = 'discovery'
    df_qc_replication['group'] = 'replication'

    # Concat discovery and replication dataframes
    df_combined = pd.concat([df_qc_discovery, df_qc_replication])
    df_combined = igem.epc.modify.make_categorical(df_combined, only=["group",])

    # Check if any column is still of Object type: those were not categorized
    # and need to be set manually before the data can be saved
    ls_object_columns = df_combined.select_dtypes(include=['object']).columns
    if len(ls_object_columns) != 0:
        print(
            f"Number of object columns:  {len(ls_object_columns)} \n"
            )
        print("The list of object columns: ")
        print(f"{ls_object_columns} \n")
        igem.epc.describe.summarize(df_combined)
        print(" --- Solve this before save the data ---")
    else:
        # Save as pickle to keep the data types
        file_name = str(path_data) + "/step_05_05/QC_NHANES_" + outcome + ".pkl"
        # Make sure the output folder exists before writing
        Path(file_name).parent.mkdir(parents=True, exist_ok=True)
        df_combined.to_pickle(file_name)
        print(f"QC Data saved in {file_name}\n\n")

Start QC to LBDLDL_N


----- Start Split the Datasets to LBDLDL_N ------
---> 274 columns and 11453 rows on DF1 dataset
---> 274 columns and 12695 rows on DF2 dataset


----- Start Outcome QC to LBDLDL_N ------
QC to Discovery Outcome Dataset
Running colfilter_percent_zero
--------------------------------------------------------------------------------
Testing 1 of 1 continuous variables
	Removed 0 (0.00%) tested continuous variables which were equal to zero in at least 90.00% of non-NA observations.
Running transform
--------------------------------------------------------------------------------
Transformed 'LBDLDL_N' using 'log'
QC to Replicate Outcome Dataset
Running colfilter_percent_zero
--------------------------------------------------------------------------------
Testing 1 of 1 continuous variables
	Removed 0 (0.00%) tested continuous variables which were equal to zero in at least 90.00% of non-NA observations.
Running transform
-----------------------------------------------

  return op.get_result()
  df_qc_discovery['group'] = 'discovery'
  df_qc_replication['group'] = 'replication'


-----------------------

# DEBUG

In [20]:
results = []
outcome = 'LBXTC_N'

Split Discovery and Replicate Datasets

In [21]:
# Same cycle-based split that the main loop applies to this outcome
df_discovery, df_replication = split_datasets(
        df_nhanes_gt,
        outcome,
        list_covariats,
        list_variants,
        split_col='Cycle',
        split_value='2007-2008'
)
# split_datasets signals a failure by returning (None, None)
if df_discovery is None or df_replication is None:
    print(f"Failed to split datasets for outcome: {outcome}")
    # f-string so that the outcome name is actually interpolated
    raise ValueError(f"Failed to split datasets for outcome: {outcome}")

---> 274 columns and 24836 rows on DF1 dataset
---> 274 columns and 27023 rows on DF2 dataset


QC to the Phenotype

In [22]:
df_outcome_discovery = df_discovery[[outcome]]
df_outcome_replication = df_replication[[outcome]]

In [23]:
df = igem.epc.modify.colfilter_percent_zero(df_outcome_discovery, filter_percent=90)

Running colfilter_percent_zero
--------------------------------------------------------------------------------
Testing 1 of 1 continuous variables
	Removed 0 (0.00%) tested continuous variables which were equal to zero in at least 90.00% of non-NA observations.


In [24]:
skew_value = igem.epc.describe.skewness(
    df,
    dropna='True'
    ).loc[0, 'skew']
print(f'The skew ratio is: {skew_value}')

The skew ratio is: 0.9532952387225929


In [25]:
if abs(skew_value) > 0.5:
    group = 'discovery'
    df_log = igem.epc.modify.transform(df, 'log')
    
    # Run skewness again after log transf
    skew_value_log = igem.epc.describe.skewness(
        df_log,
        dropna='True'
    ).loc[0, 'skew']
    

Running transform
--------------------------------------------------------------------------------
Transformed 'LBXTC_N' using 'log'


In [26]:
df_outcome = df_log

QC to the Exposures

In [28]:
group = 'discovery'
plot=False
n_min=200
df = df_discovery

In [29]:
# Filtering the variant columns that exist in the discovery DataFrame
existing_variants = [var for var in list_variants if var in df.columns]
df_variants = df[existing_variants]
print(len(df_variants.columns))

268


In [30]:
# Count the number of object columns
objecy_columns = df_variants.select_dtypes(include=['object']).columns
categorical_columns = df_variants.select_dtypes(include=['category']).columns
continuous_columns = df_variants.select_dtypes(include=['number']).columns
print(f"---> Number of object columns:  {len(objecy_columns)}")
print(f"---> Number of categorical columns:  {len(categorical_columns)}")
print(f"---> Number of continuous columns:  {len(continuous_columns)}")

---> Number of object columns:  0
---> Number of categorical columns:  157
---> Number of continuous columns:  111


In [31]:
# Separate categorical and continuous variables
df_categorical = df_variants[categorical_columns]
df_continuou = df_variants[continuous_columns]
print(f"---> Number of categorical columns:  {len(df_categorical.columns)}")
print(f"---> Number of continuous columns:  {len(df_continuou.columns)}")

---> Number of categorical columns:  157
---> Number of continuous columns:  111


In [32]:
# the diff 157 - 133 (72 Bin + 61 Cat) = 24 are constants columns
igem.epc.describe.summarize(df_categorical)

24,836 observations of 157 variables
	72 Binary Variables
	61 Categorical Variables
	0 Continuous Variables
	0 Unknown-Type Variables



QC To Categorical Exposures

In [33]:
# Remove the variables that have a category with fewer than n observations.
# For instance, the field SMQ020 was dropped:
#   1     - 10,855
#   2     - 11,692
#   3     - 3
#   9     - 18
#   Blank - 2,268
# (Kept as comments: a bare string at the end of the cell is echoed as output.)
# NOTE(review): variant_qc calls this filter with n=10, this cell uses n=100 -
# confirm which threshold is intended.
df_categorical = igem.epc.modify.colfilter_min_cat_n(df_categorical, n=100)

Running colfilter_min_cat_n
--------------------------------------------------------------------------------
Testing 72 of 72 binary variables
	Removed 40 (55.56%) tested binary variables which had a category with less than 100 values.
Testing 61 of 61 categorical variables
	Removed 60 (98.36%) tested categorical variables which had a category with less than 100 values.


'\nFor Instance, the field SMQ020 was dropped\n1 - 10,855\n2 - 11,692\n3 - 3\n9 - 18\nBlank - 2,268\n'

In [34]:
df_categorical = igem.epc.modify.colfilter_min_n(df_categorical, n=200)

Running colfilter_min_n
--------------------------------------------------------------------------------
Testing 32 of 32 binary variables
	Removed 0 (0.00%) tested binary variables which had less than 200 non-null values.
Testing 1 of 1 categorical variables
	Removed 0 (0.00%) tested categorical variables which had less than 200 non-null values.
Testing 0 of 0 continuous variables


In [54]:
# Summary QC in Categorical Exposures
print(f"Started process with {len(categorical_columns)} columns\n")
print(f"After QC:")
igem.epc.describe.summarize(df_categorical)

Started process with 157 columns

After QC:
24,836 observations of 57 variables
	32 Binary Variables
	1 Categorical Variables
	0 Continuous Variables
	0 Unknown-Type Variables



In [46]:
# List the categorical fields that have more than 2 distinct values
# (candidates to be reviewed: they may be binary fields with extra codes)
categorical_columns = df_categorical.select_dtypes('category').columns

for col in categorical_columns:
    # get unique values (NaN excluded)
    # unique_values = df[col].unique()
    # unique_values = pd.Series(df[col].unique()).dropna().values
    unique_values = df[col].dropna().unique()

    # Report the field when it has more than two distinct values
    if len(unique_values) > 2:
        print(f"Field: {col}")
        print(f"Values: {unique_values}\n")

    
#     # Check whether the column contains the values 1, 2, 7 and 9
#     if set([1, 2, 7, 9]).issubset(unique_values):
#         # Replace the values 7 and 9 with np.nan
#         df[col] = df[col].replace({7: np.nan, 9: np.nan})


# # Apply the function to the df_categorical DataFrame
# df_categorical_cleaned = clean_categorical_columns(df_categorical)

# # Show the cleaned DataFrame
# print(df_categorical_cleaned.head())


Field: SMD415A
Values: [2.0, 1.0, 3.0]
Categories (3, float64): [1.0, 2.0, 3.0]



  for val, m in zip(values.ravel(), mask.ravel())


QC to Continuous Exposures

In [57]:
df_continuous = igem.epc.modify.colfilter_percent_zero(df_continuou, filter_percent=90)

Running colfilter_percent_zero
--------------------------------------------------------------------------------
Testing 111 of 111 continuous variables
	Removed 0 (0.00%) tested continuous variables which were equal to zero in at least 90.00% of non-NA observations.


In [58]:
# Summary QC in Continuous Exposures
print(f"Started process with {len(continuous_columns)} columns\n")
print(f"After QC:")
# Summarize the filtered frame (df_continuous), not the input of the filter
# (df_continuou), so the "After QC" numbers reflect the QC result
igem.epc.describe.summarize(df_continuous)

Started process with 111 columns

After QC:
24,836 observations of 111 variables
	0 Binary Variables
	0 Categorical Variables
	111 Continuous Variables
	0 Unknown-Type Variables



Merge QC Exposures Datasets

In [59]:
# Combine filtered categorical and continuous columns
df_variants = pd.concat([df_categorical, df_continuous], axis=1)
igem.epc.describe.summarize(df_variants)

24,836 observations of 168 variables
	32 Binary Variables
	1 Categorical Variables
	111 Continuous Variables
	0 Unknown-Type Variables



QC to Covariates

In [61]:
df_covariants = df_discovery[list_covariats]
igem.epc.describe.summarize(df_covariants)

24,836 observations of 5 variables
	1 Binary Variables
	2 Categorical Variables
	2 Continuous Variables
	0 Unknown-Type Variables



Merge all QC Data

In [68]:
# Create list of QC dataframes
dfs_to_merge = [
    df_outcome,
    df_covariants,
    df_variants
    ]
df_qc = reduce(merge_dfs, dfs_to_merge)
igem.epc.describe.summarize(df_qc)

24,836 observations of 174 variables
	33 Binary Variables
	3 Categorical Variables
	114 Continuous Variables
	0 Unknown-Type Variables



In [69]:
# Add group column
df_qc['group'] = 'discovery'
df_qc = igem.epc.modify.make_categorical(df_qc, only=["group",])

Running make_categorical
--------------------------------------------------------------------------------
Set 1 of 175 variable(s) as categorical, each with 24,836 observations
