# Phase 6: Run Interactions

Documentation: 

In [45]:
import pandas as pd
from pathlib import Path
import igem

In [46]:
# Locate the "data" folder underneath the (absolute) current working directory
path = Path(".").resolve()
path_data = path.joinpath("data")

#### STEP 06_00: Processing identified interaction data (GE.db)

In [47]:
# Read the model table (interactions whose terms are not in the NHANES
# column list still have to be cleaned out afterwards).
# The file path is joined with pathlib instead of concatenating a "/" by hand.
df_model = igem.epc.load.from_csv(str(path_data / "step_02_06_Models.csv"))
print(f"Start with: {len(df_model)} interactions")

Loaded 401,736 observations of 5 variables
Start with: 401736 interactions


In [48]:
# Reduce the model table to the two columns holding the interaction terms
df_model = igem.epc.modify.colfilter(df_model, only=['field_name_1', 'field_name_2'])

Running colfilter
--------------------------------------------------------------------------------
Keeping 2 of 5 variables:
	0 of 0 binary variables
	0 of 0 categorical variables
	0 of 0 continuous variables
	2 of 5 unknown variables


#### STEP 06_01: Process Interactions

In [49]:
# Helper: verify that a set of column names is present in a DataFrame
def columns_exist(df, cols):
    """Return True when every name in ``cols`` is a column of ``df``.

    Parameters
    ----------
    df : object exposing a ``columns`` attribute (e.g. a pandas DataFrame)
    cols : iterable of column labels to look up

    Returns
    -------
    bool
        False as soon as one label is missing; True otherwise (including
        when ``cols`` is empty).
    """
    for name in cols:
        if name not in df.columns:
            return False
    return True

In [51]:
# Variable groups used by the interaction loop.
# Covariate columns passed to every model:
list_covariates = [
    'RIDAGEYR',
    'RIAGENDR',
    'RIDRETH1',
    'BMXBMI',
    'Cycle',
]
# Outcome columns (one QC dataset is read per outcome):
list_outcomes = [
    'LBDLDL_N',
    'LBXTC_N',
    'LBXSTR',
    'LBDHDL',
    'LBXHDD',
    'LBDHDD',
]

In [52]:
# Empty containers that the processing cells fill in:
# two result frames plus two row accumulators (discovery / replication).
df_results_discover_final, df_results_replicate_final = pd.DataFrame(), pd.DataFrame()
list_results_discover, list_results_replicate = [], []

In [None]:
# Loop to process each outcome:
#   1. read the QC'd NHANES dataset for the outcome,
#   2. keep only the candidate interactions whose two terms are columns of it,
#   3. run a per-pair QC on the discovery subset,
#   4. fit the interaction model for every pair that passes QC,
#   5. add corrected p-values and export the FDR-significant rows.
results = []
list_outcomes = ['LBDLDL_N'] # DEBUG ONLY
# list_outcomes = ['LBDLDL_N', 'LBXTC_N', 'LBXSTR'] # DEBUG ONLY
# NOTE(review): the debug override above rebinds the notebook-level
# list_outcomes, so `excluded_columns` below is built from the debug subset
# rather than from the full outcome list. Remove the override for a real run.

# Minimum number of complete rows required to test a pair; used both by the
# QC check below and as `min_n` of the interaction study.
MIN_N = 200

# Start from an empty accumulator so that re-running this cell does not append
# duplicate rows to results left over from a previous execution. Rows still
# accumulate across the outcomes processed within one run of the cell.
list_results_discover = []

for outcome in list_outcomes:

    # Per-outcome QC counters (reported once the pair loop has finished)
    count_lt_threshold = 0
    count_no_columns = 0
    count_same_e = 0

    # Read QC Dataset
    file_name = path_data / "step_05_05" / f"QC_NHANES_{outcome}.pkl"
    df_nhanes = pd.read_pickle(file_name)

    # Check column types (Object / Unknown-Type)
    igem.epc.describe.summarize(df_nhanes)

    # Define the list of exposures: every column that is neither covariate nor outcome
    excluded_columns = set(list_covariates + list_outcomes)
    list_exposes = [col for col in df_nhanes.columns if col not in excluded_columns]

    # Keep only interactions where both terms are columns of df_nhanes.
    # Vectorised membership test instead of a row-wise apply over the whole table.
    nhanes_columns = set(df_nhanes.columns)
    both_terms_present = (
        df_model['field_name_1'].isin(nhanes_columns)
        & df_model['field_name_2'].isin(nhanes_columns)
    )
    df_models_filtered = df_model[both_terms_present].reset_index(drop=True)
    print(f"Total Interactions = {len(df_model)} / After sync = {len(df_models_filtered)}")

    # Split Dataset
    df_discovery = df_nhanes[df_nhanes['group'] == 'discovery']
    df_replicate = df_nhanes[df_nhanes['group'] == 'replication']

    for e1, e2 in zip(
        df_models_filtered["field_name_1"], df_models_filtered["field_name_2"]
    ):
        # RUN QC over Exposures Subset
        # Check if columns exist in the DataFrame
        if columns_exist(df_discovery, [e1, e2]):
            # create a DataFrame with the columns of interest
            v_list = list_covariates + [outcome, e1, e2]
            df_exe = df_discovery.loc[:, v_list]
            print(f"Processed with: {e1} and {e2}")
        else:
            print(f"Skipped: {e1} and/or {e2} not found")
            continue

        # Drop columns of object dtype
        object_columns = df_exe.select_dtypes(include=['object']).columns
        df_exe = df_exe.drop(columns=object_columns)

        # Drop all rows with any NaN
        df_exe = df_exe.dropna()

        # Drop constant columns
        non_constant_columns = df_exe.columns[df_exe.nunique() > 1]
        df_exe = df_exe[non_constant_columns]

        # Too few complete rows left
        if len(df_exe) < MIN_N:
            count_lt_threshold += 1
            continue
        # A required column was removed by the object/constant filters above
        if not columns_exist(df_exe, v_list):
            count_no_columns += 1
            continue
        # A variable cannot interact with itself
        if e1 == e2:
            count_same_e += 1
            continue
        # -- End of QC

        # Run the interaction study
        Interation_Study = igem.epc.analyze.interaction_study(
            data=df_exe,
            outcomes=outcome,
            interactions=[(e1, e2)],
            covariates=list_covariates,
            min_n=MIN_N,
        )

        # Keep the result values of the first result row.
        # The labels are taken from index[0] so they are aligned with
        # values[0]; index.levels holds the sorted unique labels of each
        # level and is not guaranteed to follow row order.
        first_label = Interation_Study.LRT_pvalue.index[0]
        list_results_discover.append(
            [
                first_label[2],
                first_label[0],
                first_label[1],
                Interation_Study.Converged.values[0],
                Interation_Study.LRT_pvalue.values[0],
                # "Bonfp" placeholder: filled in below, once the number of
                # tests is known.
                float("nan"),
            ]
        )

    # Report why pairs were dropped by the QC
    print(
        f"QC skipped for {outcome}: {count_lt_threshold} with < {MIN_N} rows, "
        f"{count_no_columns} with missing columns, {count_same_e} self-interactions"
    )

    # Create a DataFrame with the results
    df_results_discover = pd.DataFrame(
        list_results_discover,
        columns=[
            "Outcome", "Term1", "Term2", "Converged", "pvalue", "Bonfp"
            ],
    )

    # Bonferroni-adjusted p-value: p times the number of tests performed,
    # capped at 1. (Multiplying by the number of observations of a pair, as
    # was done before, is not a multiple-testing correction.)
    n_tests = len(df_results_discover)
    df_results_discover["Bonfp"] = (
        df_results_discover["pvalue"].astype(float) * n_tests
    ).clip(upper=1.0)

    # Presumably adds the corrected p-value columns in place; 'pvalue_fdr' is
    # read from the frame right below.
    igem.epc.analyze.add_corrected_pvalues(df_results_discover)

    # Keep the interactions that are significant after FDR correction
    df_results_discover_sig = df_results_discover[
        df_results_discover['pvalue_fdr'] < 0.1
        ]

    # NOTE(review): the same file is rewritten for every outcome; it ends up
    # holding the rows accumulated over all outcomes processed so far.
    df_results_discover_sig.to_csv('test_process.csv')


---------------------