# **Construction of MicroRNA Networks Based on Inferred Interactions for TCGA-BRCA**

This notebook produces the AT_MN family of data artifacts.

- TCGA: The Cancer Genome Atlas  
- BRCA: Breast Invasive Carcinoma

# Import Libraries and Configurations

In [1]:
import os
import sys
from collections import defaultdict
from itertools import combinations

import numpy as np
import pandas as pd

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..')))

from config import (
    ASSOCIATION_FILTERING_PARAMETERS,
    BRCA_INTERIM_FILES_DIRS,
    BRCA_PROCESSED_FILES_DIRS,
    CYTOSCAPE_PROCESSED_FILES_DIRS,
    INTERACTION_INFERENCE_PARAMETERS,
    INTERACTION_FILTERING_PARAMETERS,
)

# Interaction Networks

## Functions

In [2]:
def flag_inferred_interactions(dir_base_name):
    """
    Mark every inferred interaction of a group as being of interest or not.

    An interaction is of interest when both its correlation and its q-value are
    strictly below the thresholds stored in `INTERACTION_FILTERING_PARAMETERS`
    (keys 'min_correlation' and 'min_qvalue'). The flagged table is read from
    the interim directory of the group and written, under the same file name,
    to its processed directory.

    Parameters:
    -----------
    dir_base_name : str
        Key used to look up the group directories in the global dictionaries
        `BRCA_INTERIM_FILES_DIRS` and `BRCA_PROCESSED_FILES_DIRS`.

    Returns:
    --------
    pd.DataFrame
        The inferred interactions with the extra column
        'is_interaction_of_interest' (1 when both criteria are met, 0 otherwise).
    """
    # Resolve the input (interim) and output (processed) locations of the group
    interim_dir = BRCA_INTERIM_FILES_DIRS[dir_base_name]
    processed_dir = BRCA_PROCESSED_FILES_DIRS[dir_base_name]
    file_name = INTERACTION_INFERENCE_PARAMETERS['file-name']

    # Load the table of inferred interactions
    df_flagged = pd.read_csv(os.path.join(interim_dir, file_name))

    # Evaluate each criterion separately; both use a strict "below" comparison
    correlation_threshold = INTERACTION_FILTERING_PARAMETERS['min_correlation']
    qvalue_threshold = INTERACTION_FILTERING_PARAMETERS['min_qvalue']
    passes_correlation = df_flagged['correlation'] < correlation_threshold
    passes_qvalue = df_flagged['qvalue'] < qvalue_threshold

    # An interaction is flagged only when it satisfies both criteria
    df_flagged['is_interaction_of_interest'] = np.where(
        passes_correlation & passes_qvalue, 1, 0
    )

    # Persist the flagged table next to the other processed files of the group
    output_path = os.path.join(processed_dir, file_name)
    df_flagged.to_csv(output_path, index=False)

    return df_flagged

In [3]:
def create_network_files(group):
    """
    Build and store the edges and nodes tables of a group interaction network.

    The inferred interactions of the group are flagged with
    `flag_inferred_interactions`; the ones of interest become the edges
    (microRNA as 'source', gene as 'target'), and the distinct sources and
    targets become the nodes. Both tables are written as CSV files to the
    Cytoscape processed directory of the group.

    Parameters:
    -----------
    group : str
        The name of the group for which the network files are to be created
        (e.g. 'Luminal A'); it is lower-cased and its spaces are replaced by
        hyphens to obtain the directory key.

    Returns:
    --------
    dict of pd.DataFrame
        A dictionary containing:
        - 'edges': DataFrame with the interactions of interest.
        - 'nodes': DataFrame with the columns 'id', 'label' and 'type' for the
          microRNA nodes followed by the messenger RNA nodes.
    """
    # Resolve the output directory of the group
    dir_base_name = group.lower().replace(' ', '-')
    output_dir = CYTOSCAPE_PROCESSED_FILES_DIRS[dir_base_name]

    # Flag the inferred interactions and keep only the ones of interest
    df_flagged = flag_inferred_interactions(dir_base_name)
    of_interest = df_flagged['is_interaction_of_interest'] == 1
    df_edges = df_flagged[of_interest]

    # Shape the kept interactions as an edge list
    df_edges = df_edges.drop(
        columns=['is_interaction_of_interest', 'accession_id', 'pvalue']
    )
    df_edges = df_edges.rename(
        columns={'mirna_name': 'source', 'gene_name': 'target'}
    )
    df_edges = df_edges.fillna(value={'mirtarbase': ''})
    df_edges = df_edges.reset_index(drop=True)

    def build_nodes(edge_column, id_prefix, node_type):
        # One node per distinct label, numbered from 1 in order of appearance
        nodes = df_edges[[edge_column]].copy()
        nodes = nodes.rename(columns={edge_column: 'label'})
        nodes = nodes.drop_duplicates(ignore_index=True)
        nodes['id'] = [
            f'{id_prefix}{position}' for position in range(1, len(nodes) + 1)
        ]
        nodes['type'] = node_type
        return nodes

    # Sources are microRNAs and targets are messenger RNAs
    df_mir_nodes = build_nodes('source', 'mir', 'microRNA')
    df_rna_nodes = build_nodes('target', 'gene', 'messenger RNA')

    # Stack both node tables and put the columns in their final order
    df_nodes = pd.concat([df_mir_nodes, df_rna_nodes], ignore_index=True)
    df_nodes = df_nodes[['id', 'label', 'type']]

    # Store the edges first and then the nodes
    edges_path = os.path.join(
        output_dir, INTERACTION_FILTERING_PARAMETERS['edges_file_name']
    )
    df_edges.to_csv(edges_path, index=False)

    nodes_path = os.path.join(
        output_dir, INTERACTION_FILTERING_PARAMETERS['nodes_file_name']
    )
    df_nodes.to_csv(nodes_path, index=False)

    return {'edges': df_edges, 'nodes': df_nodes}

## Basal-like Files

In [4]:
# Create the files for the construction of the Basal-like tumor tissue network:
# the call flags the inferred interactions of interest and writes the edges and
# nodes CSV files of the group, returning both tables in a dictionary
files = create_network_files('Basal-like')

In [5]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,mirtarbase,correlation,qvalue
0,hsa-let-7c-5p,IGF1R,MIRT053087,-0.388423,0.028263
1,hsa-let-7c-5p,EIF2S2,,-0.416928,0.017575
2,hsa-let-7d-5p,ITSN1,,-0.378399,0.033543
3,hsa-let-7e-5p,CD86,,-0.367774,0.039746
4,hsa-let-7e-5p,CD200R1,,-0.372494,0.037333
...,...,...,...,...,...
291,hsa-miR-497-5p,HMGA1,,-0.392998,0.026621
292,hsa-miR-497-5p,EIF3M,,-0.401217,0.023656
293,hsa-miR-33b-5p,TARDBP,,-0.400115,0.023656
294,hsa-miR-330-5p,NGFR,,-0.375501,0.035921


In [6]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-let-7c-5p,microRNA
1,mir2,hsa-let-7d-5p,microRNA
2,mir3,hsa-let-7e-5p,microRNA
3,mir4,hsa-miR-17-5p,microRNA
4,mir5,hsa-miR-18a-5p,microRNA
...,...,...,...
286,gene229,HMGA1,messenger RNA
287,gene230,EIF3M,messenger RNA
288,gene231,TARDBP,messenger RNA
289,gene232,NGFR,messenger RNA


## HER2-enriched Files

In [7]:
# Create the files for the construction of the HER2-enriched tumor tissue network:
# the call flags the inferred interactions of interest and writes the edges and
# nodes CSV files of the group, returning both tables in a dictionary
files = create_network_files('HER2-enriched')

In [8]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,mirtarbase,correlation,qvalue
0,hsa-let-7d-5p,HIF1AN,,-0.539303,0.041561
1,hsa-let-7d-5p,MEF2D,MIRT735654,-0.562406,0.032134
2,hsa-let-7d-5p,SMARCC1,,-0.546275,0.041507
3,hsa-let-7d-5p,PRKAR2A,,-0.536637,0.041561
4,hsa-miR-18a-5p,ZBTB20,,-0.558715,0.032134
5,hsa-miR-30c-5p,SNAI1,MIRT006762,-0.52905,0.043795
6,hsa-miR-182-5p,GPHN,,-0.531852,0.043795
7,hsa-miR-141-3p,IGF2,,-0.587491,0.032134
8,hsa-miR-155-5p,VAV3,,-0.536569,0.041561
9,hsa-miR-106b-5p,PLXDC2,,-0.558237,0.032134


In [9]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-let-7d-5p,microRNA
1,mir2,hsa-miR-18a-5p,microRNA
2,mir3,hsa-miR-30c-5p,microRNA
3,mir4,hsa-miR-182-5p,microRNA
4,mir5,hsa-miR-141-3p,microRNA
5,mir6,hsa-miR-155-5p,microRNA
6,mir7,hsa-miR-106b-5p,microRNA
7,mir8,hsa-miR-378a-3p,microRNA
8,mir9,hsa-miR-320b,microRNA
9,gene1,HIF1AN,messenger RNA


## Luminal A Files

In [10]:
# Create the files for the construction of the Luminal A tumor tissue network:
# the call flags the inferred interactions of interest and writes the edges and
# nodes CSV files of the group, returning both tables in a dictionary
files = create_network_files('Luminal A')

In [11]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,mirtarbase,correlation,qvalue
0,hsa-let-7c-5p,PARD6B,,-0.305317,0.000311
1,hsa-miR-16-5p,TLE4,MIRT031643,-0.315052,0.000169
2,hsa-miR-16-5p,PLSCR4,MIRT031585,-0.338303,0.000046
3,hsa-miR-16-5p,ETV1,,-0.325088,0.000097
4,hsa-miR-16-5p,ACVR2A,MIRT000536,-0.312958,0.000193
...,...,...,...,...,...
254,hsa-miR-33b-5p,CROT,,-0.302597,0.000356
255,hsa-miR-33b-5p,SLC16A7,,-0.323011,0.000109
256,hsa-miR-425-5p,PPP2CB,MIRT016657,-0.310217,0.000233
257,hsa-miR-744-5p,HPSE2,,-0.378697,0.000003


In [12]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-let-7c-5p,microRNA
1,mir2,hsa-miR-16-5p,microRNA
2,mir3,hsa-miR-17-5p,microRNA
3,mir4,hsa-miR-18a-5p,microRNA
4,mir5,hsa-miR-20a-5p,microRNA
...,...,...,...
246,gene203,KIT,messenger RNA
247,gene204,CASP10,messenger RNA
248,gene205,TSHZ2,messenger RNA
249,gene206,PPP2CB,messenger RNA


## Luminal B Files

In [13]:
# Create the files for the construction of the Luminal B tumor tissue network:
# the call flags the inferred interactions of interest and writes the edges and
# nodes CSV files of the group, returning both tables in a dictionary
files = create_network_files('Luminal B')

In [14]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,mirtarbase,correlation,qvalue
0,hsa-let-7b-5p,SENP5,,-0.305959,0.027519
1,hsa-let-7b-5p,PEG10,MIRT561775,-0.351302,0.010108
2,hsa-let-7d-5p,PDE12,MIRT120920,-0.301715,0.029633
3,hsa-let-7d-5p,CALU,MIRT123316,-0.300611,0.030313
4,hsa-let-7d-5p,CSRNP3,,-0.316057,0.022328
...,...,...,...,...,...
579,hsa-miR-378c,TOLLIP,,-0.317707,0.021596
580,hsa-miR-378c,MEF2D,,-0.330541,0.016482
581,hsa-miR-378c,ABI2,,-0.312810,0.024002
582,hsa-miR-374c-5p,ATXN1,,-0.304957,0.027953


In [15]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-let-7b-5p,microRNA
1,mir2,hsa-let-7d-5p,microRNA
2,mir3,hsa-let-7f-5p,microRNA
3,mir4,hsa-miR-15a-5p,microRNA
4,mir5,hsa-miR-16-5p,microRNA
...,...,...,...
554,gene450,NOL4L,messenger RNA
555,gene451,CDK6,messenger RNA
556,gene452,USP46,messenger RNA
557,gene453,TOLLIP,messenger RNA


## Paired Normal Files

In [16]:
# Create the files for the construction of the Paired Normal tissue network:
# the call flags the inferred interactions of interest and writes the edges and
# nodes CSV files of the group, returning both tables in a dictionary
files = create_network_files('Paired Normal')

In [17]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,mirtarbase,correlation,qvalue
0,hsa-let-7a-5p,GIPC1,,-0.427751,0.008435
1,hsa-let-7a-5p,MMP11,,-0.339986,0.044181
2,hsa-let-7b-5p,GJC1,,-0.598291,0.000054
3,hsa-let-7b-5p,MEF2C,MIRT052262,-0.340807,0.043603
4,hsa-let-7b-5p,SOX6,,-0.426589,0.008635
...,...,...,...,...,...
5067,hsa-miR-370-5p,KCNB1,,-0.355776,0.033726
5068,hsa-miR-370-5p,ANTXR2,,-0.337184,0.046262
5069,hsa-miR-370-5p,PIP4P2,,-0.334586,0.048118
5070,hsa-miR-370-5p,ATP1A2,,-0.339986,0.044181


In [18]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-let-7a-5p,microRNA
1,mir2,hsa-let-7b-5p,microRNA
2,mir3,hsa-let-7c-5p,microRNA
3,mir4,hsa-let-7d-5p,microRNA
4,mir5,hsa-let-7e-5p,microRNA
...,...,...,...
2910,gene2745,MICOS10,messenger RNA
2911,gene2746,PDE3B,messenger RNA
2912,gene2747,ELMOD3,messenger RNA
2913,gene2748,UVRAG,messenger RNA


# Association Networks

## Functions

In [19]:
def infer_and_flag_associations(processed_dir_path):
    """
    Infer associations between source nodes from their shared targets.

    The filtered interactions (edges) stored in `processed_dir_path` are read,
    the set of targets of every source node is collected, and the Jaccard index
    between the target sets is computed for every unordered pair of distinct
    source nodes. An association is flagged as being of interest when its index
    is strictly greater than `ASSOCIATION_FILTERING_PARAMETERS['min_index']`.

    Parameters:
    -----------
    processed_dir_path : str
        Directory that contains the interaction edges file, whose name is given
        by `INTERACTION_FILTERING_PARAMETERS['edges_file_name']`. The file must
        have the columns 'source' and 'target'.

    Returns:
    --------
    pd.DataFrame
        One row per pair of source nodes with the columns 'node_a', 'node_b',
        'association_index' (Jaccard index) and 'is_association_of_interest'
        (1 when the index exceeds the minimum, 0 otherwise). Pairs without
        shared targets are kept, with an index of 0.
    """
    # Create a DataFrame for the filtered interactions
    file_path = os.path.join(
        processed_dir_path,
        INTERACTION_FILTERING_PARAMETERS['edges_file_name']
    )
    df_interactions = pd.read_csv(file_path)

    # Determine the set of targets (neighbors) of each source node. Iterating
    # over the two columns directly avoids building a Series for every row, as
    # `iterrows` does, while visiting the rows in the same order.
    neighbors = defaultdict(set)
    for source, target in zip(
        df_interactions['source'], df_interactions['target']
    ):
        neighbors[source].add(target)

    # Compute the Jaccard index between each combination of different nodes;
    # pairs follow the order in which the sources first appear in the file
    results = []
    for node_a, node_b in combinations(neighbors, 2):
        # Get the sets of neighbors from both nodes
        set_a = neighbors[node_a]
        set_b = neighbors[node_b]

        # Compute the Jaccard index between both sets; the guard keeps the
        # division safe even though every collected set has at least one item
        intersection = len(set_a & set_b)
        union = len(set_a | set_b)
        jaccard = intersection / union if union else 0

        # Insert the result for this combination in the list
        results.append((node_a, node_b, jaccard))

    # Create a DataFrame for the inferred associations
    columns = ['node_a', 'node_b', 'association_index']
    df_associations = pd.DataFrame(results, columns=columns)

    # Flag the associations of interest, i.e. those above the minimum index
    min_index = ASSOCIATION_FILTERING_PARAMETERS['min_index']
    df_associations['is_association_of_interest'] = np.where(
        df_associations['association_index'] > min_index, 1, 0
    )

    return df_associations

In [20]:
def create_network_files(group):
    """
    Build and store the edges and nodes tables of a group association network.

    NOTE: this definition reuses the name of the interaction-network function
    defined earlier in the notebook and therefore replaces it from this cell on.

    The associations of the group are inferred with
    `infer_and_flag_associations`; the ones of interest become the edges, with
    the association index rounded to two decimals as 'weight', and the distinct
    nodes appearing in them become the nodes. Both tables are written as CSV
    files to the Cytoscape processed directory of the group.

    Parameters:
    -----------
    group : str
        The name of the group for which the network files are to be created
        (e.g. 'Luminal A'); it is lower-cased and its spaces are replaced by
        hyphens to obtain the directory key.

    Returns:
    --------
    dict of pd.DataFrame
        A dictionary containing:
        - 'edges': DataFrame with the columns 'source', 'target' and 'weight'.
        - 'nodes': DataFrame with the columns 'id', 'label' and 'type'.
    """
    # Resolve the processed directory of the group
    dir_base_name = group.lower().replace(' ', '-')
    output_dir = CYTOSCAPE_PROCESSED_FILES_DIRS[dir_base_name]

    # Infer the associations and keep only the associations of interest
    df_associations = infer_and_flag_associations(output_dir)
    of_interest = df_associations['is_association_of_interest'] == 1
    df_edges = df_associations[of_interest]

    # Shape the kept associations as a weighted edge list
    df_edges = df_edges.drop(columns=['is_association_of_interest'])
    df_edges = df_edges.rename(columns={
        'node_a': 'source',
        'node_b': 'target',
        'association_index': 'weight',
    })
    df_edges = df_edges.round(2)
    df_edges = df_edges.reset_index(drop=True)

    # Collect the labels of both edge ends: all sources first, then all targets
    df_labels = pd.concat(
        [
            df_edges[['source']].rename(columns={'source': 'label'}),
            df_edges[['target']].rename(columns={'target': 'label'}),
        ],
        ignore_index=True,
    )

    # One node per distinct label, numbered from 1 in order of appearance
    df_nodes = df_labels.drop_duplicates(ignore_index=True)
    df_nodes['id'] = [
        f'mir{position}' for position in range(1, len(df_nodes) + 1)
    ]
    df_nodes['type'] = 'microRNA'
    df_nodes = df_nodes[['id', 'label', 'type']]

    # Store the edges first and then the nodes
    edges_path = os.path.join(
        output_dir, ASSOCIATION_FILTERING_PARAMETERS['edges_file_name']
    )
    df_edges.to_csv(edges_path, index=False)

    nodes_path = os.path.join(
        output_dir, ASSOCIATION_FILTERING_PARAMETERS['nodes_file_name']
    )
    df_nodes.to_csv(nodes_path, index=False)

    return {'edges': df_edges, 'nodes': df_nodes}

## Basal-like Files

In [21]:
# Compute the associations that will be in the Basal-like tumor tissue network
# NOTE: `create_network_files` here is the association version defined in this
# section, which replaces the interaction version of the same name
files = create_network_files('Basal-like')

In [22]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,weight
0,hsa-miR-17-5p,hsa-miR-20a-5p,0.57
1,hsa-miR-17-5p,hsa-miR-106a-5p,0.14
2,hsa-miR-18a-5p,hsa-miR-19a-3p,0.12
3,hsa-miR-18a-5p,hsa-miR-19b-3p,0.14
4,hsa-miR-19a-3p,hsa-miR-19b-3p,0.67
5,hsa-miR-20a-5p,hsa-miR-93-5p,0.12
6,hsa-miR-20a-5p,hsa-miR-130b-3p,0.17
7,hsa-miR-23a-3p,hsa-miR-151a-3p,0.5
8,hsa-miR-27a-3p,hsa-miR-128-3p,0.25
9,hsa-miR-29a-3p,hsa-miR-29b-3p,0.17


In [23]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-miR-17-5p,microRNA
1,mir2,hsa-miR-18a-5p,microRNA
2,mir3,hsa-miR-19a-3p,microRNA
3,mir4,hsa-miR-20a-5p,microRNA
4,mir5,hsa-miR-23a-3p,microRNA
5,mir6,hsa-miR-27a-3p,microRNA
6,mir7,hsa-miR-29a-3p,microRNA
7,mir8,hsa-miR-93-5p,microRNA
8,mir9,hsa-miR-221-3p,microRNA
9,mir10,hsa-miR-200b-3p,microRNA


## HER2-enriched Files

In [24]:
# Compute the associations that will be in the HER2-enriched tumor tissue network
# NOTE: `create_network_files` here is the association version defined in this
# section, which replaces the interaction version of the same name
files = create_network_files('HER2-enriched')

In [25]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,weight


In [26]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type


## Luminal A Files

In [27]:
# Compute the associations that will be in the Luminal A tumor tissue network
# NOTE: `create_network_files` here is the association version defined in this
# section, which replaces the interaction version of the same name
files = create_network_files('Luminal A')

In [28]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,weight
0,hsa-miR-16-5p,hsa-miR-192-5p,0.11
1,hsa-miR-17-5p,hsa-miR-20a-5p,0.27
2,hsa-miR-20a-5p,hsa-miR-92a-3p,0.25
3,hsa-miR-33a-5p,hsa-miR-33b-5p,0.2
4,hsa-miR-29b-3p,hsa-miR-29c-3p,0.11
5,hsa-miR-30c-5p,hsa-miR-324-5p,0.5
6,hsa-miR-141-3p,hsa-miR-200a-3p,0.2
7,hsa-miR-365a-3p,hsa-miR-365b-3p,1.0
8,hsa-miR-148b-3p,hsa-miR-193b-3p,0.17
9,hsa-miR-148b-3p,hsa-miR-33b-5p,0.12


In [29]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-miR-16-5p,microRNA
1,mir2,hsa-miR-17-5p,microRNA
2,mir3,hsa-miR-20a-5p,microRNA
3,mir4,hsa-miR-33a-5p,microRNA
4,mir5,hsa-miR-29b-3p,microRNA
5,mir6,hsa-miR-30c-5p,microRNA
6,mir7,hsa-miR-141-3p,microRNA
7,mir8,hsa-miR-365a-3p,microRNA
8,mir9,hsa-miR-148b-3p,microRNA
9,mir10,hsa-miR-193b-3p,microRNA


## Luminal B Files

In [30]:
# Compute the associations that will be in the Luminal B tumor tissue network
# NOTE: `create_network_files` here is the association version defined in this
# section, which replaces the interaction version of the same name
files = create_network_files('Luminal B')

In [31]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,weight
0,hsa-let-7d-5p,hsa-miR-107,0.25
1,hsa-let-7d-5p,hsa-miR-181b-5p,0.14
2,hsa-let-7d-5p,hsa-let-7i-5p,0.17
3,hsa-let-7d-5p,hsa-miR-155-5p,0.11
4,hsa-miR-15a-5p,hsa-miR-30e-5p,1.0
5,hsa-miR-18a-5p,hsa-miR-181b-5p,0.18
6,hsa-miR-18a-5p,hsa-miR-502-3p,0.15
7,hsa-miR-26a-5p,hsa-miR-26b-5p,0.33
8,hsa-miR-96-5p,hsa-miR-182-5p,0.11
9,hsa-miR-30c-5p,hsa-miR-429,0.14


In [32]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-let-7d-5p,microRNA
1,mir2,hsa-miR-15a-5p,microRNA
2,mir3,hsa-miR-18a-5p,microRNA
3,mir4,hsa-miR-26a-5p,microRNA
4,mir5,hsa-miR-96-5p,microRNA
5,mir6,hsa-miR-30c-5p,microRNA
6,mir7,hsa-miR-181b-5p,microRNA
7,mir8,hsa-miR-212-3p,microRNA
8,mir9,hsa-miR-221-3p,microRNA
9,mir10,hsa-miR-222-3p,microRNA


## Paired Normal Files

In [33]:
# Compute the associations that will be in the Paired Normal tissue network
# NOTE: `create_network_files` here is the association version defined in this
# section, which replaces the interaction version of the same name
files = create_network_files('Paired Normal')

In [34]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,weight
0,hsa-let-7b-5p,hsa-let-7c-5p,0.38
1,hsa-miR-15a-5p,hsa-miR-16-5p,0.23
2,hsa-miR-17-5p,hsa-miR-93-5p,0.12
3,hsa-miR-17-5p,hsa-miR-106a-5p,0.12
4,hsa-miR-17-5p,hsa-miR-20b-5p,0.12
5,hsa-miR-26b-5p,hsa-miR-1271-5p,0.25
6,hsa-miR-29a-3p,hsa-miR-29c-3p,0.25
7,hsa-miR-30a-5p,hsa-miR-30d-5p,0.14
8,hsa-miR-30a-5p,hsa-miR-30e-5p,0.21
9,hsa-miR-33a-5p,hsa-miR-130a-3p,0.12


In [35]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-let-7b-5p,microRNA
1,mir2,hsa-miR-15a-5p,microRNA
2,mir3,hsa-miR-17-5p,microRNA
3,mir4,hsa-miR-26b-5p,microRNA
4,mir5,hsa-miR-29a-3p,microRNA
...,...,...,...
56,mir57,hsa-miR-365b-3p,microRNA
57,mir58,hsa-miR-378c,microRNA
58,mir59,hsa-miR-503-5p,microRNA
59,mir60,hsa-miR-299-5p,microRNA
