# Phase 2: Create the Interaction Pairs

Documentation: https://halllab.atlassian.net/wiki/spaces/IGEM/pages/67862529/Phase+2+Create+the+Interaction+Pairs

After successfully loading all data from the 1999 to 2018 NHANES cycles into the MyNHANES system, we progress to the next phase which involves compiling a comprehensive list of fields related to exposure factors. We will utilize the detailed descriptions of these NHANES fields to operate the IGEM Search Engine, facilitating the retrieval of relevant TERMS and enabling thorough consultation of the existing relationships within our knowledge base.

In [22]:
# Importing the necessary libraries
import pandas as pd
from pathlib import Path
import glob
import os

In [2]:
# Defining the path to the data folder
path = Path().resolve()
path_data = path / 'data'

#### STEP 02_00: Extract Fields List from MyNHANES

no code

#### STEP 02_01: Search IGEM Terms from NHANES Fields Description

no code

#### STEP 02_02: Identify Exposure Factor Fields

no code

#### STEP 02_03: Generate Parameters File to setting the filter to GE.db database

no code

#### STEP 02_04: Obtaining the Terms of Relationship 

no code

#### STEP 02_05: Filter Interactions 

In [3]:
# Read data from previous steps
df_terms = pd.read_csv(
    str(path_data) + "/step_02_04_TermsRelationship.csv"
    )
ls_terms = pd.read_excel(
    (str(path_data) + "/step_02_02_Exposes_Identification.xlsx"),
    sheet_name="Exposes_Terms_Uniques",
    header=None,
    names=['term']
    )['term'].tolist()

In [4]:
# Filter the terms relationship
df_terms_filtered = df_terms[
    df_terms['term_1'].isin(ls_terms) & df_terms['term_2'].isin(ls_terms)
    ]
df_terms_filtered.drop(
    columns=[
        'datasource',
        'connector',
        'qtd_links'
        ],
    inplace=True
    )

df_terms_filtered.drop_duplicates(inplace=True)

# Save the data
df_terms_filtered.to_csv(
    str(path_data) + "/step_02_05_TermsRelationship_filtered.csv",
    index=False
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


#### STEP 02_06: Link the IGEM Terms to NHANES Field ID

In [9]:
# Read NHANES data and IGEM terms
df_NHAMES_fields = pd.read_excel(
    str(path_data) + "/step_02_00_MyNHANES_fields_list.xlsx",
    sheet_name="fields_unique"
    )
df_NHANES_terms = pd.read_csv(
    str(path_data) + "/step_02_01_word_to_term.csv"
    )

In [10]:
# Normalization of string fields for comparison
df_NHAMES_fields['field_description'] = df_NHAMES_fields['field_description'].str.lower().str.strip()
df_NHANES_terms['string'] = df_NHANES_terms['string'].str.lower().str.strip()

# Concat the DataFrames based on the 'string' column of df_NHANES_terms and 'field_description' of df_NHANES_fields
df_NHANES_fields_terms = pd.merge(df_NHANES_terms, df_NHAMES_fields, left_on='string', right_on='field_description', how='left')

In [11]:
# Drop columns 'datasource', 'connector' and 'qtd_links'
df_NHANES_fields_terms.drop(columns=['row', 'string', 'word', 'term_id', 'term_descr', 'qtd_terms', 'qtd_loops', 'time'], inplace=True)

# Drop duplicates records
df_NHANES_fields_terms.drop_duplicates(inplace=True)
df_NHANES_fields_terms.dropna(subset=['term'], inplace=True)

In [14]:
# Clean terms interactions df column
df_interactions= df_terms_filtered.drop(columns=['term_group_1', 'term_category_1', 'word_1',  'description_1', 'term_group_2', 'term_category_2', 'word_2', 'description_2'])
print(df_interactions)

Unnamed: 0,term_1,term_2
25911,chem:d012906,dise:d013471
71399,chem:d000073893,go:0030431
76760,chem:d002241,go:0030431
94162,chem:d010710,go:0030431
102837,chem:d013256,go:0030431
...,...,...
219915,meta:hmdb0014344,meta:hmdb0302501
220415,meta:hmdb0015043,meta:hmdb0015517
220557,meta:hmdb0015043,meta:hmdb0302501
220980,meta:hmdb0015517,meta:hmdb0302501


In [15]:
# create a new DataFrame
new_rows = []

for _, model_row in df_interactions.iterrows():
    term_1 = model_row['term_1']
    term_2 = model_row['term_2']
    
    # Searching for matches for term_1
    term_1_matches = df_NHANES_fields_terms[df_NHANES_fields_terms['term'] == term_1]
    
    # Searching for matches for term_2
    term_2_matches = df_NHANES_fields_terms[df_NHANES_fields_terms['term'] == term_2]
    
    for _, term_1_row in term_1_matches.iterrows():
        for _, term_2_row in term_2_matches.iterrows():
            new_row = {
                'term_1': term_1,
                'field_name_1': term_1_row['field_name'],
                'field_description_1': term_1_row['field_description'],
                'term_2': term_2,
                'field_name_2': term_2_row['field_name'],
                'field_description_2': term_2_row['field_description']
            }
            new_rows.append(new_row)

df_models = pd.DataFrame(new_rows)

In [16]:
# save interactions nhanes exposes in the models file
df_models.to_csv(
    str(path_data) + "/step_02_06_Models.csv",
    index=False
    )
print(len(df_models))

#### STEP 02_07: Create a NHANES Fields List

In [20]:
# Isolate the fields in a unique list
# This list will used the select parameters in the NHANES API
df_fields_1 = df_models.drop(columns=['term_1', 'term_2', 'field_name_2', 'field_description_2'])
df_fields_2 = df_models.drop(columns=['term_1', 'term_2', 'field_name_1', 'field_description_1'])
df_fields_1.rename(columns={'field_name_1': 'field_name', 'field_description_1': 'field_description'}, inplace=True)
df_fields_2.rename(columns={'field_name_2': 'field_name', 'field_description_2': 'field_description'}, inplace=True)
df_fields = pd.concat([df_fields_1, df_fields_2], ignore_index=True)
df_fields.drop_duplicates(inplace=True)

In [31]:
# add the Phenotypes and Covariants fields
phen_cov = {
    'field_name': [
        'LBDHDL',
        'LBXHDD',
        'LBDHDD',
        'LBDLDL',
        'LBXTC',
        'LBXSTR',
        'RIAGENDR',
        'RIDAGEYR',
        'BMXBMI',
        'RIDRETH1'
    ],
    'field_description': [
        'HDL-cholesterol (mg/dL)',
        'Direct HDL-Cholesterol (mg/dL)',
        'Direct HDL-Cholesterol (mg/dL)',
        'LDL-cholesterol (mg/dL)',
        'Total Cholesterol (mg/dL)',
        'Triglycerides (mg/dL)',
        'Gender',
        'Age in years at screening',
        'Body Mass Index (kg/m**2)',
        'Race/Ethnicity - Recode',
    ]
}

new_rows = pd.DataFrame(phen_cov)

# Concatenate Phenotypes, Covariants and Exposures fields
df_fields = pd.concat([new_rows, df_fields], ignore_index=True)


In [34]:
# Save the fields file
df_fields.to_csv(
    str(path_data) + "/step_02_07_Fields.csv",
    index=False
    )