# Filtering Interactions for TCGA-BRCA Networks
TCGA: The Cancer Genome Atlas  
BRCA: Breast Invasive Carcinoma

# Importing Libraries and Configurations

In [1]:
import os
import sys

import numpy as np
import pandas as pd

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import (
    BRCA_INTERIM_FILES_DIRS,
    BRCA_PROCESSED_FILES_DIRS,
    CYTOSCAPE_PROCESSED_FILES_DIRS,
    INTERACTION_INFERENCE_PARAMETERS,
    INTERACTION_FILTERING_PARAMETERS,
)

# Functions

In [2]:
def flag_inferred_interactions(dir_base_name):
    """
    Mark every inferred interaction of a group as being of interest or not.

    The inferred interactions are read from the group interim directory, flagged, 
    and written under the same file name to the group processed directory.

    Parameter:
    ----------
    dir_base_name : str
        Key used to look up the group directories in the global dictionaries 
        `BRCA_INTERIM_FILES_DIRS` and `BRCA_PROCESSED_FILES_DIRS`.

    Returns:
    --------
    pd.DataFrame
        The inferred interactions with the extra column 'is_interaction_of_interest', 
        set to 1 when both the correlation and the q-value are strictly below their 
        configured thresholds, and to 0 otherwise.
    """
    # Resolve the input (interim) and output (processed) directories of the group
    source_dir = BRCA_INTERIM_FILES_DIRS[dir_base_name]
    target_dir = BRCA_PROCESSED_FILES_DIRS[dir_base_name]

    # Load the inferred interactions
    csv_name = INTERACTION_INFERENCE_PARAMETERS['file-name']
    interactions = pd.read_csv(os.path.join(source_dir, csv_name))

    # Despite the `min_` prefix of the configuration keys, both values are used as 
    # exclusive upper bounds
    correlation_cutoff = INTERACTION_FILTERING_PARAMETERS['min_correlation']
    qvalue_cutoff = INTERACTION_FILTERING_PARAMETERS['min_qvalue']

    # An interaction is of interest only when it satisfies both criteria
    passes_correlation = interactions['correlation'] < correlation_cutoff
    passes_qvalue = interactions['qvalue'] < qvalue_cutoff
    interactions['is_interaction_of_interest'] = np.where(
        passes_correlation & passes_qvalue, 1, 0
    )

    # Persist the flagged interactions, keeping the original file name
    target_path = os.path.join(target_dir, csv_name)
    interactions.to_csv(target_path, index=False)

    return interactions

In [3]:
def filter_network_interactions(group):
    """
    Keep the microRNA-messenger RNA interactions of interest of a sample group and 
    format them for network analysis.

    The group interactions are first flagged with `flag_inferred_interactions`; the 
    resulting table is then filtered, renamed, and written as a CSV file to the group 
    directory listed in `CYTOSCAPE_PROCESSED_FILES_DIRS`.

    Parameter:
    ----------
    group : str
        Name of the sample group. It is lower-cased and its spaces are replaced 
        with hyphens to build the key used to locate the group directories.

    Returns:
    --------
    pd.DataFrame
        The interactions flagged as "of interest", where:
        - The flag column and the 'accession_id' column are removed.
        - 'mirna_name', 'gene_name', and 'mirtarbase' are renamed to 'mirna', 
          'mrna', and 'interaction_id'.
        - Missing values in 'interaction_id' are replaced with an empty string.
    """
    # Build the directory key from the group name and resolve the output directory
    dir_base_name = group.lower().replace(' ', '-')
    output_dir = CYTOSCAPE_PROCESSED_FILES_DIRS[dir_base_name]

    # Flag the inferred interactions of interest
    flagged = flag_inferred_interactions(dir_base_name)

    # Keep only the flagged rows and discard the columns the network does not need
    is_of_interest = flagged['is_interaction_of_interest'] == 1
    selected = flagged.loc[is_of_interest]
    selected = selected.drop(columns=['is_interaction_of_interest', 'accession_id'])
    selected = selected.reset_index(drop=True)

    # Standardize the column names and blank out the missing interaction IDs
    new_column_names = {
        'mirna_name': 'mirna',
        'gene_name': 'mrna',
        'mirtarbase': 'interaction_id',
    }
    network = selected.rename(columns=new_column_names)
    network = network.fillna(value={'interaction_id': ''})

    # Store the DataFrame of filtered network interactions
    output_name = INTERACTION_FILTERING_PARAMETERS['interactions_file_name']
    network.to_csv(os.path.join(output_dir, output_name), index=False)

    return network

# Interaction Filtering

## Basal-like

In [4]:
# Select the interactions that will be in the Basal-like tumor tissue network.
# The call also writes the selected interactions to the group's CSV file.
filter_network_interactions('Basal-like')

Unnamed: 0,mirna,mrna,interaction_id,correlation,pvalue,qvalue
0,hsa-let-7c-5p,IGF1R,MIRT053087,-0.388423,0.000101,0.028263
1,hsa-let-7c-5p,EIF2S2,,-0.416928,0.000029,0.017575
2,hsa-let-7d-5p,ITSN1,,-0.378399,0.000151,0.033543
3,hsa-let-7e-5p,CD86,,-0.367774,0.000229,0.039746
4,hsa-let-7e-5p,CD200R1,,-0.372494,0.000191,0.037333
...,...,...,...,...,...,...
291,hsa-miR-497-5p,HMGA1,,-0.392998,0.000083,0.026621
292,hsa-miR-497-5p,EIF3M,,-0.401217,0.000059,0.023656
293,hsa-miR-33b-5p,TARDBP,,-0.400115,0.000062,0.023656
294,hsa-miR-330-5p,NGFR,,-0.375501,0.000169,0.035921


## HER2-enriched

In [5]:
# Select the interactions that will be in the HER2-enriched tumor tissue network.
# The call also writes the selected interactions to the group's CSV file.
filter_network_interactions('HER2-enriched')

Unnamed: 0,mirna,mrna,interaction_id,correlation,pvalue,qvalue
0,hsa-let-7d-5p,HIF1AN,,-0.539303,8.986564e-06,0.041561
1,hsa-let-7d-5p,MEF2D,MIRT735654,-0.562406,3.225376e-06,0.032134
2,hsa-let-7d-5p,SMARCC1,,-0.546275,6.649086e-06,0.041507
3,hsa-let-7d-5p,PRKAR2A,,-0.536637,1.006593e-05,0.041561
4,hsa-miR-18a-5p,ZBTB20,,-0.558715,3.818749e-06,0.032134
5,hsa-miR-30c-5p,SNAI1,MIRT006762,-0.52905,1.382919e-05,0.043795
6,hsa-miR-182-5p,GPHN,,-0.531852,1.230922e-05,0.043795
7,hsa-miR-141-3p,IGF2,,-0.587491,9.681125e-07,0.032134
8,hsa-miR-155-5p,VAV3,,-0.536569,1.009512e-05,0.041561
9,hsa-miR-106b-5p,PLXDC2,,-0.558237,3.902691e-06,0.032134


## Luminal A

In [6]:
# Select the interactions that will be in the Luminal A tumor tissue network.
# The call also writes the selected interactions to the group's CSV file.
filter_network_interactions('Luminal A')

Unnamed: 0,mirna,mrna,interaction_id,correlation,pvalue,qvalue
0,hsa-let-7c-5p,PARD6B,,-0.305317,1.699354e-06,0.000311
1,hsa-miR-16-5p,TLE4,MIRT031643,-0.315052,7.889337e-07,0.000169
2,hsa-miR-16-5p,PLSCR4,MIRT031585,-0.338303,1.127362e-07,0.000046
3,hsa-miR-16-5p,ETV1,,-0.325088,3.474575e-07,0.000097
4,hsa-miR-16-5p,ACVR2A,MIRT000536,-0.312958,9.326604e-07,0.000193
...,...,...,...,...,...,...
254,hsa-miR-33b-5p,CROT,,-0.302597,2.095452e-06,0.000356
255,hsa-miR-33b-5p,SLC16A7,,-0.323011,4.127512e-07,0.000109
256,hsa-miR-425-5p,PPP2CB,MIRT016657,-0.310217,1.158869e-06,0.000233
257,hsa-miR-744-5p,HPSE2,,-0.378697,2.572259e-09,0.000003


## Luminal B

In [7]:
# Select the interactions that will be in the Luminal B tumor tissue network.
# The call also writes the selected interactions to the group's CSV file.
filter_network_interactions('Luminal B')

Unnamed: 0,mirna,mrna,interaction_id,correlation,pvalue,qvalue
0,hsa-let-7b-5p,SENP5,,-0.305959,0.000339,0.027519
1,hsa-let-7b-5p,PEG10,MIRT561775,-0.351302,0.000042,0.010108
2,hsa-let-7d-5p,PDE12,MIRT120920,-0.301715,0.000406,0.029633
3,hsa-let-7d-5p,CALU,MIRT123316,-0.300611,0.000425,0.030313
4,hsa-let-7d-5p,CSRNP3,,-0.316057,0.000219,0.022328
...,...,...,...,...,...,...
579,hsa-miR-378c,TOLLIP,,-0.317707,0.000203,0.021596
580,hsa-miR-378c,MEF2D,,-0.330541,0.000113,0.016482
581,hsa-miR-378c,ABI2,,-0.312810,0.000252,0.024002
582,hsa-miR-374c-5p,ATXN1,,-0.304957,0.000354,0.027953


## Paired Normal

In [8]:
# Select the interactions that will be in the Paired Normal tissue network.
# The call also writes the selected interactions to the group's CSV file.
filter_network_interactions('Paired Normal')

Unnamed: 0,mirna,mrna,interaction_id,correlation,pvalue,qvalue
0,hsa-let-7a-5p,GIPC1,,-0.427751,5.038145e-04,0.008435
1,hsa-let-7a-5p,MMP11,,-0.339986,5.178013e-03,0.044181
2,hsa-let-7b-5p,GJC1,,-0.598291,5.586415e-07,0.000054
3,hsa-let-7b-5p,MEF2C,MIRT052262,-0.340807,5.080635e-03,0.043603
4,hsa-let-7b-5p,SOX6,,-0.426589,5.218179e-04,0.008635
...,...,...,...,...,...,...
5067,hsa-miR-370-5p,KCNB1,,-0.355776,3.561907e-03,0.033726
5068,hsa-miR-370-5p,ANTXR2,,-0.337184,5.522999e-03,0.046262
5069,hsa-miR-370-5p,PIP4P2,,-0.334586,5.860281e-03,0.048118
5070,hsa-miR-370-5p,ATP1A2,,-0.339986,5.178013e-03,0.044181
