In [2]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import igem

rpy2 ModuleSpec(name='rpy2', loader=<_frozen_importlib_external.SourceFileLoader object at 0x352e45ff0>, origin='/Users/andrerico/Works/Projects/pbs_igem/.venv/lib/python3.10/site-packages/rpy2/__init__.py', submodule_search_locations=['/Users/andrerico/Works/Projects/pbs_igem/.venv/lib/python3.10/site-packages/rpy2'])


In [3]:
# Resolve the working directory and point at its "data" sub-folder
path = Path(".").resolve()
path_data = path.joinpath("data")

In [6]:
# Load the NHANES table (medications already normalized to LDL and TC),
# using the 'ID' column as the index
nhanes_csv = f"{path_data}/step_04_02_nhanes_data_with_medications.csv"
df_nhanes = igem.epc.load.from_csv(nhanes_csv, index_col='ID')

# Cast the index to integers
df_nhanes.index = df_nhanes.index.astype(int)
print(f"Start Run Interactions Process with: {len(df_nhanes)} records")

Loaded 55,206 observations of 918 variables
Start Run Interactions Process with: 55206 records


  return clarite.load.from_csv(filename, index_col, **kwargs)


In [97]:
# Load the interaction models table. Interactions whose terms are not
# NHANES columns still have to be removed from it afterwards.
models_csv = f"{path_data}/step_01_05_Models.csv"
df_model = igem.epc.load.from_csv(models_csv)
print(f"Start with: {len(df_model)} interactions")

Loaded 992,923 observations of 5 variables
Start with: 992923 interactions


In [98]:
# Discard every column except the two that name the interaction terms
df_model = igem.epc.modify.colfilter(df_model, only=['field_name_1', 'field_name_2'])

Running colfilter
--------------------------------------------------------------------------------
Keeping 2 of 5 variables:
	0 of 0 binary variables
	0 of 0 categorical variables
	0 of 0 continuous variables
	2 of 5 unknown variables


In [99]:
# Keep only the interactions whose two terms are both columns of df_nhanes.
# A set of the df_nhanes column names is used for the membership check.
nhanes_columns = set(df_nhanes.columns)

# Series.isin is vectorized, so the ~1M-row table is filtered without a
# Python-level function call per row (which DataFrame.apply(axis=1) would do).
both_terms_present = (
    df_model['field_name_1'].isin(nhanes_columns)
    & df_model['field_name_2'].isin(nhanes_columns)
)

# Build the filtered table with a fresh 0..n-1 index in one non-mutating chain
# instead of resetting the index in place on a slice of df_model.
df_models_filtered = df_model[both_terms_present].reset_index(drop=True)
print(f"Now has {len(df_models_filtered)} interactions after filtering")

Now has 213059 interactions after filtering


In [100]:
# Display the filtered interaction pairs (field_name_1, field_name_2)
df_models_filtered

Unnamed: 0,field_name_1,field_name_2
0,LBDLG1LC,ARQ077
1,LBDLG1LC,ARQ034D
2,LBXLG1,ARQ077
3,LBXLG1,ARQ034D
4,SSLG1_N,ARQ077
...,...,...
213054,LBDSZNSI,SMD830
213055,LBDSZNSI,SMD770
213056,LBDSZNSI,SMD800
213057,LBDSZNSI,SMD740


## Split Cohorts

good = HDL, which is available as three different variables (LBDHDL, LBXHDD, LBDHDD)

bad = LDL, Total-C and Triglycerides, split into Discovery (1999-2008) and Replicate (2009-2018) cohorts

In [101]:
# Split the data into discovery and replicate cohorts for the "bad" group
cycles_discovery = ['1999-2000', '2001-2002', '2003-2004', '2005-2006', '2007-2008']
cycles_replicate = ['2009-2010', '2011-2012', '2013-2014', '2015-2016', '2017-2018']

# .copy() makes each cohort an independent frame. Without it, the 'Cycle'
# assignments below write into a slice of df_nhanes and pandas emits
# SettingWithCopyWarning.
df_nhanes_discovery = df_nhanes[df_nhanes['Cycle'].isin(cycles_discovery)].copy()
df_nhanes_replicate = df_nhanes[df_nhanes['Cycle'].isin(cycles_replicate)].copy()

# Re-encode 'Cycle' as an ordered categorical limited to the cycles of each cohort
df_nhanes_discovery['Cycle'] = pd.Categorical(df_nhanes_discovery['Cycle'], categories=cycles_discovery, ordered=True)
df_nhanes_replicate['Cycle'] = pd.Categorical(df_nhanes_replicate['Cycle'], categories=cycles_replicate, ordered=True)

Setting Interactions Variables

In [102]:
# Roles of the columns in the interaction analysis
list_covariates = ['RIDAGEYR', 'RIAGENDR', 'RIDRETH1', 'BMXBMI', 'Cycle']
list_bad_phenotypes = ['LBDLDL', 'LBXTC', 'LBXSTR']
list_good_phenotypes = ['LBDHDL', 'LBXHDD', 'LBDHDD']

# Every column that is neither a covariate nor a phenotype is treated as an exposure
excluded_columns = {*list_covariates, *list_bad_phenotypes, *list_good_phenotypes}
list_exposes = [name for name in df_nhanes.columns if name not in excluded_columns]


## IMPORTANT:

#### Before running, we need to adjust cholesterol values for participants who use medication to control it.
#### How to check whether both groups have the same fields

In [103]:
def columns_exist(df, cols):
    """Return True when every name in ``cols`` is a column of ``df``.

    Parameters:
        df: DataFrame whose ``columns`` are checked.
        cols: iterable of column names to look for.

    Returns:
        bool: False as soon as one name is missing, True otherwise
        (including when ``cols`` is empty).
    """
    available = df.columns
    for name in cols:
        if name not in available:
            return False
    return True

In [104]:
# Empty containers that will collect the results of each cohort
df_results_discover_final, df_results_replicate_final = pd.DataFrame(), pd.DataFrame()
list_results_discover, list_results_replicate = [], []

#### We will only run it for the LDL phenotype to test the script's integrity.

Important: we still need to define the rationale for adjusting participants who use medications to control cholesterol.

- Nikki and I are evaluating the use of statins.

In [120]:
"""
Defines Discovery and Replicate groups based on cycles.
Aligns both groups to have the same exposure factors.

Important: In production, this block is inside the phenotype loop.
"""

list_bad_phenotypes = ['LBDLDL',]
for i_outcome in list_bad_phenotypes:
    print(f"Start with: {i_outcome}")

    # Keep only the outcome, covariates and exposures; drop the rows without
    # the outcome, then the columns that are entirely NaN within the cohort.
    columns_of_interest = [i_outcome] + list_covariates + list_exposes
    df_nhanes_discovery_exe = (
        df_nhanes_discovery[columns_of_interest]
        .dropna(subset=[i_outcome])
        .dropna(axis=1, how='all')
    )
    df_nhanes_replicate_exe = (
        df_nhanes_replicate[columns_of_interest]
        .dropna(subset=[i_outcome])
        .dropna(axis=1, how='all')
    )

    # Report the columns that alignment is about to remove. This has to happen
    # BEFORE the intersection: once both frames are reduced to the common
    # columns their column counts are equal by construction, so a check made
    # afterwards can never fire.
    columns_only_in_discovery = df_nhanes_discovery_exe.columns.difference(df_nhanes_replicate_exe.columns)
    columns_only_in_replicate = df_nhanes_replicate_exe.columns.difference(df_nhanes_discovery_exe.columns)
    if len(columns_only_in_discovery) > 0 or len(columns_only_in_replicate) > 0:
        print(f"Discovery has {len(df_nhanes_discovery_exe.columns)} columns and Replicate has {len(df_nhanes_replicate_exe.columns)} columns")
        print(f"Columns in Discovery but not in Replicate: {len(columns_only_in_discovery)}")
        print(f"Columns in Replicate but not in Discovery: {len(columns_only_in_replicate)}")

    # Sync both DataFrames to the columns they share. .copy() makes the
    # aligned frames independent objects, so later in-place edits do not act
    # on a slice of the cohort frames.
    common_columns = df_nhanes_discovery_exe.columns.intersection(df_nhanes_replicate_exe.columns)
    df_nhanes_discovery_exe = df_nhanes_discovery_exe[common_columns].copy()
    df_nhanes_replicate_exe = df_nhanes_replicate_exe[common_columns].copy()

    # The outcome and every covariate must survive alignment; otherwise the
    # outcome cannot be analysed in both cohorts and is skipped.
    missing_required = [col for col in [i_outcome] + list_covariates if col not in common_columns]
    if missing_required:
        print(f"Required columns missing after alignment: {missing_required}")
        print("Skipping this outcome")
        continue

Start with: LBDLDL


In [121]:
"""
Performs the categorization of the analysis components. For constant columns, we will eliminate them.
"""

def categorize_columns(df):
    """Classify the columns of ``df`` by dtype and number of distinct values.

    Rules, applied per column on its non-null values:
      * at most one distinct value (constant, or no non-null value at all):
        listed in ``columns_to_drop``;
      * numeric dtype: ``continuous`` (two-valued numeric columns are
        deliberately treated as continuous, not binary);
      * non-numeric with exactly two distinct values: ``binary``;
      * any other non-numeric column: ``categorical``.

    Parameters:
        df: DataFrame to inspect; it is not modified.

    Returns:
        tuple of four lists of column names, in column order:
        ``(continuous, categorical, binary, columns_to_drop)``.
    """
    continuous = []
    categorical = []
    binary = []
    columns_to_drop = []

    for col in df.columns:
        series = df[col]
        # Number of distinct non-null values
        num_unique_values = series.nunique(dropna=True)

        if num_unique_values <= 1:
            # Constant columns and columns without any non-null value are
            # both uninformative, so both are flagged for removal.
            columns_to_drop.append(col)
        elif pd.api.types.is_numeric_dtype(series):
            continuous.append(col)
        elif num_unique_values == 2:
            binary.append(col)
        else:
            categorical.append(col)

    return continuous, categorical, binary, columns_to_drop

# Classify the columns of the discovery frame
continuous_cols, categorical_cols, binary_cols, columns_to_drop = categorize_columns(df_nhanes_discovery_exe)

# Report how many columns fell in each group
print(f"{len(continuous_cols)} are continuous columns: {continuous_cols}")
print(f"{len(categorical_cols)} are categorical columns: {categorical_cols}")
print(f"{len(binary_cols)} are binary columns: {binary_cols}")
print(f"{len(columns_to_drop)} are columns to drop with constant value: {columns_to_drop}")

# Remove the columns flagged as having a single unique value
df_nhanes_discovery_exe.drop(columns=columns_to_drop, inplace=True)

# Apply the type casts in order. The last step manually forces RIDRETH1 and
# Cycle to categorical, overriding whatever the automatic classification chose.
for cast_columns, selected_cols in (
    (igem.epc.modify.make_continuous, continuous_cols),
    (igem.epc.modify.make_categorical, categorical_cols),
    (igem.epc.modify.make_categorical, binary_cols),
    (igem.epc.modify.make_categorical, ["RIDRETH1", "Cycle"]),
):
    df_nhanes_discovery_exe = cast_columns(df_nhanes_discovery_exe, only=selected_cols)

191 are continuous columns: ['LBDLDL', 'RIDAGEYR', 'RIDRETH1', 'BMXBMI', 'ALQ140Q', 'ALQ150', 'AUQ231', 'CBD620', 'CBQ050', 'DBQ197', 'DBQ229', 'DBQ235A', 'DBQ235B', 'DBQ235C', 'DUQ250', 'DUQ260', 'DUQ270Q', 'DUQ272', 'DUQ290', 'DUQ300', 'DUQ310Q', 'DUQ330', 'DUQ340', 'DUQ350Q', 'DUQ352', 'DUQ380A', 'ENQ090', 'GTDSCMMN', 'GTXDRANK', 'HOQ070', 'HOQ080', 'HSQ590', 'LBDFOL', 'LBDIHGSI', 'LBDRBF', 'LBDV4CLC', 'LBDVBFLC', 'LBDVCFLC', 'LBDVDBLC', 'LBDVEBLC', 'LBDVMELC', 'LBDVOXLC', 'LBDVSTLC', 'LBDVTCLC', 'LBDVTOLC', 'LBDVXYLC', 'LBDWBFLC', 'LBDWCFLC', 'LBX2DF', 'LBXBCD', 'LBXBPB', 'LBXCOT', 'LBXIHG', 'LBXNM', 'LBXPFBS', 'LBXPFDE', 'LBXPFDO', 'LBXPFOA', 'LBXPFOS', 'LBXPFSA', 'LBXPFUA', 'LBXPLP', 'LBXRBFSI', 'LBXSF2SI', 'LBXTHG', 'LBXV1D', 'LBXV2A', 'LBXV2P', 'LBXV2T', 'LBXV3B', 'LBXV4C', 'LBXVBF', 'LBXVBM', 'LBXVBZ', 'LBXVCF', 'LBXVCM', 'LBXVCT', 'LBXVDB', 'LBXVDM', 'LBXVDP', 'LBXVEB', 'LBXVFN', 'LBXVIPB', 'LBXVMC', 'LBXVME', 'LBXVNB', 'LBXVOX', 'LBXVST', 'LBXVTC', 'LBXVTE', 'LBXVTO', 'LBXVX

Debug purpose: select two exposure factors.

In [125]:
e1 = "ALQ140Q"
e2 = "ALQ150"

# Build the analysis table only when both exposure columns exist
if columns_exist(df_nhanes_discovery_exe, [e1, e2]):
    # Covariates, outcome and the two exposures
    df_maintable_exe = df_nhanes_discovery_exe.loc[:, list_covariates + [i_outcome, e1, e2]]
    print(f"Processed with: {e1} and {e2}")

    # Diagnostics are printed only for a table built in this run
    print(len(df_maintable_exe))
    print(e1, " - ", e2)
    print(i_outcome)
    print(list_covariates)
    print(df_maintable_exe.dtypes)
    print(df_maintable_exe.head())
else:
    # Reset the table so a stale one from a previous pair is never reused
    # (and so the diagnostics cannot fail with NameError on a fresh kernel).
    df_maintable_exe = None
    print(f"Skipped: {e1} and/or {e2} not found")

Processed with: ALQ140Q and ALQ150
11453
ALQ140Q  -  ALQ150
LBDLDL
['RIDAGEYR', 'RIAGENDR', 'RIDRETH1', 'BMXBMI', 'Cycle']
RIDAGEYR     float64
RIAGENDR    category
RIDRETH1    category
BMXBMI       float64
Cycle       category
LBDLDL       float64
ALQ140Q      float64
ALQ150       float64
dtype: object
         RIDAGEYR RIAGENDR RIDRETH1  BMXBMI      Cycle  LBDLDL  ALQ140Q  \
ID                                                                        
41479.0      52.0      1.0      1.0   27.56  2007-2008   121.0      3.0   
41485.0      30.0      2.0      2.0   25.99  2007-2008   119.0      NaN   
41486.0      61.0      2.0      1.0   31.21  2007-2008   110.0      NaN   
41487.0      27.0      1.0      5.0   23.44  2007-2008   105.0      0.0   
41489.0      40.0      2.0      1.0   36.59  2007-2008   106.0      3.0   

         ALQ150  
ID               
41479.0     2.0  
41485.0     NaN  
41486.0     NaN  
41487.0     2.0  
41489.0     2.0  


In [119]:
# Run the interaction study for the single (e1, e2) pair on outcome i_outcome,
# using df_maintable_exe as data and list_covariates as covariates.
Interation_Study = igem.epc.analyze.interaction_study(
        data=df_maintable_exe,
        outcomes=i_outcome,
        interactions=[(e1, e2)],
        covariates=list_covariates,
    )


InteractionRegression
-------------------------
Continuous Outcome (family = Gaussian): 'LBDLDL'
Using 11,453 of 11,453 observations
Regressing 2 variables
	0 binary variables
	0 categorical variables
	2 continuous variables
	0 genotypes variables
Processing 1 interactions
-------------------------
[32mRunning 1 interactions using 14 processes...[0m

[32m	Finished Running 1 interactions[0m
[32m0 tests had an error[0m
Completed Interaction Study for LBDLDL

Completed association study


### Create the loop to process all the data.

In [None]:
# for terms in df_models_filtered.itertuples():

# e1 = terms.field_name_1
# e2 = terms.field_name_2
# print(f"Start with: {e1} and {e2}")

e1 = "URDUP8LC"
e2 = "OSQ160B"

# Check that both columns e1 and e2 exist in the DataFrame
if columns_exist(df_nhanes_discovery_exe, [e1, e2]):
    # Create df_maintable_exe only when both columns exist
    df_maintable_exe = df_nhanes_discovery_exe.loc[:, list_covariates + [i_outcome, e1, e2]]
    print(f"Processed with: {e1} and {e2}")
    print(len(df_maintable_exe))

    # Run Interaction Study. Uses igem.epc.analyze, the same entry point as
    # the debug cell that was executed successfully above, rather than
    # igem.analyze.
    Interation_Study = igem.epc.analyze.interaction_study(
        data=df_maintable_exe,
        outcomes=i_outcome,
        interactions=[(e1, e2)],
        covariates=list_covariates,
    )
else:
    # Nothing is run for this pair, so a table left over from a previous pair
    # is never analysed by mistake. Inside the loop this becomes `continue`.
    print(f"Skipped: {e1} and/or {e2} not found")



#     # Save results in list: outcome/e1/e2/converged/LRT_pvalue/Bonfp
#     # NOTE(review): Bonfp multiplies the p-value by len(df_maintable_exe)
#     # (number of observations); a Bonferroni correction presumably needs the
#     # number of tests instead - confirm before enabling this block.
#     list_results_discover.append(
#         [
#             Interation_Study.LRT_pvalue.index.levels[2][0],
#             Interation_Study.LRT_pvalue.index.levels[0][0],
#             Interation_Study.LRT_pvalue.index.levels[1][0],
#             Interation_Study.Converged.values[0],
#             Interation_Study.LRT_pvalue.values[0],
#             Interation_Study.LRT_pvalue.values[0] * len(df_maintable_exe),
#         ]
#     )

# # Create a DataFrame with the results
# df_results_discover = pd.DataFrame(
#     list_results_discover,
#     columns=[
#         "Outcome", "Term1", "Term2", "Converged", "LRT_pvalue", "Bonfp"
#         ],
# )

ATORVASTATIN_CALCIUM", "SIMVASTATIN", "PRAVASTATIN_SODIUM", "FLUVASTATIN_SODIUM"