# **Create Files for the Association Networks of TCGA-BRCA**
TCGA: The Cancer Genome Atlas  
BRCA: Breast Invasive Carcinoma

# Importing Libraries and Configurations

In [1]:
import os
import sys
from collections import defaultdict
from itertools import combinations

import numpy as np
import pandas as pd

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import (
    ASSOCIATION_FILTERING_PARAMETERS,
    CYTOSCAPE_PROCESSED_FILES_DIRS,
    INTERACTION_FILTERING_PARAMETERS,
)

# Functions

In [2]:
def infer_and_flag_associations(processed_dir_path):
    """Infer pairwise node associations from the filtered interactions file.

    Reads the interactions CSV named by
    ``INTERACTION_FILTERING_PARAMETERS['edges_file_name']`` inside
    ``processed_dir_path``, collects the set of targets of every source node
    and scores each unordered pair of distinct source nodes with the Jaccard
    index of their target sets.

    Args:
        processed_dir_path: Directory that contains the interactions file.

    Returns:
        A DataFrame with one row per pair of distinct source nodes and the
        columns ``node_a``, ``node_b``, ``association_index`` (the Jaccard
        index) and ``is_association_of_interest`` (1 when the index is
        strictly greater than
        ``ASSOCIATION_FILTERING_PARAMETERS['min_index']``, otherwise 0).
    """
    # Create a DataFrame for the filtered interactions
    file_path = os.path.join(
        processed_dir_path,
        INTERACTION_FILTERING_PARAMETERS['edges_file_name']
    )
    df_interactions = pd.read_csv(file_path)

    # Determine the set of neighbors (targets) for each source node.
    # Zipping the two columns avoids building a Series per row, which is what
    # iterrows() does, and keeps each column's own dtype.
    neighbors = defaultdict(set)
    for source, target in zip(
        df_interactions['source'], df_interactions['target']
    ):
        neighbors[source].add(target)

    # Compute the Jaccard index between each combination of different nodes
    results = []
    for (node_a, set_a), (node_b, set_b) in combinations(neighbors.items(), 2):
        intersection = len(set_a & set_b)
        union = len(set_a | set_b)
        # Guard against an empty union so the division can never fail
        jaccard = intersection / union if union else 0
        results.append((node_a, node_b, jaccard))

    # Create a DataFrame for the inferred associations
    columns = ['node_a', 'node_b', 'association_index']
    df_associations = pd.DataFrame(results, columns=columns)

    # Flag the associations of interest (index strictly above the threshold)
    min_index = ASSOCIATION_FILTERING_PARAMETERS['min_index']
    df_associations['is_association_of_interest'] = np.where(
        df_associations['association_index'] > min_index, 1, 0
    )

    return df_associations

In [3]:
def create_network_files(group):
    """Build and store the association network files of a group.

    The group name is lower-cased and its spaces are replaced by hyphens to
    look up the processed directory in ``CYTOSCAPE_PROCESSED_FILES_DIRS``.
    The associations flagged as being of interest become the edges (weights
    rounded to two decimals) and every distinct edge endpoint becomes a node.
    Both tables are written as CSV files, named by
    ``ASSOCIATION_FILTERING_PARAMETERS``, into the processed directory.

    Args:
        group: Name of the group, e.g. ``'Luminal A'``.

    Returns:
        A dict with the edges DataFrame under ``'edges'`` and the nodes
        DataFrame under ``'nodes'``.
    """
    # Resolve the processed directory of the group
    group_key = group.lower().replace(' ', '-')
    processed_dir_path = CYTOSCAPE_PROCESSED_FILES_DIRS[group_key]

    # Infer the associations and flag the associations of interest
    df_associations = infer_and_flag_associations(processed_dir_path)

    # Keep only the flagged associations and reshape them into edges
    edge_column_names = {
        'node_a': 'source',
        'node_b': 'target',
        'association_index': 'weight',
    }
    flagged = df_associations.query('is_association_of_interest == 1')
    flagged = flagged.drop(columns=['is_association_of_interest'])
    flagged = flagged.rename(columns=edge_column_names)
    df_edges = flagged.round(2).reset_index(drop=True)

    # Gather the distinct endpoints: sources first, then targets
    source_labels = df_edges[['source']].rename(columns={'source': 'label'})
    target_labels = df_edges[['target']].rename(columns={'target': 'label'})
    df_nodes = pd.concat([source_labels, target_labels], ignore_index=True)
    df_nodes = df_nodes.drop_duplicates(ignore_index=True)

    # Number the nodes from 1 and tag them with their type
    df_nodes['id'] = [
        f'mir{position}' for position in range(1, len(df_nodes) + 1)
    ]
    df_nodes['type'] = 'microRNA'
    df_nodes = df_nodes[['id', 'label', 'type']]

    # Store the edges first and the nodes afterwards
    outputs = (
        ('edges_file_name', df_edges),
        ('nodes_file_name', df_nodes),
    )
    for name_key, frame in outputs:
        file_name = ASSOCIATION_FILTERING_PARAMETERS[name_key]
        frame.to_csv(os.path.join(processed_dir_path, file_name), index=False)

    return {'edges': df_edges, 'nodes': df_nodes}

# Association Network Files

## Basal-like

In [4]:
# Build the Basal-like tumor tissue association network: infer the associations
# of interest and write the group's edges and nodes CSV files
files = create_network_files('Basal-like')

In [5]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,weight
0,hsa-miR-17-5p,hsa-miR-20a-5p,0.57
1,hsa-miR-17-5p,hsa-miR-106a-5p,0.14
2,hsa-miR-18a-5p,hsa-miR-19a-3p,0.12
3,hsa-miR-18a-5p,hsa-miR-19b-3p,0.14
4,hsa-miR-19a-3p,hsa-miR-19b-3p,0.67
5,hsa-miR-20a-5p,hsa-miR-93-5p,0.12
6,hsa-miR-20a-5p,hsa-miR-130b-3p,0.17
7,hsa-miR-23a-3p,hsa-miR-151a-3p,0.5
8,hsa-miR-27a-3p,hsa-miR-128-3p,0.25
9,hsa-miR-29a-3p,hsa-miR-29b-3p,0.17


In [6]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-miR-17-5p,microRNA
1,mir2,hsa-miR-18a-5p,microRNA
2,mir3,hsa-miR-19a-3p,microRNA
3,mir4,hsa-miR-20a-5p,microRNA
4,mir5,hsa-miR-23a-3p,microRNA
5,mir6,hsa-miR-27a-3p,microRNA
6,mir7,hsa-miR-29a-3p,microRNA
7,mir8,hsa-miR-93-5p,microRNA
8,mir9,hsa-miR-221-3p,microRNA
9,mir10,hsa-miR-200b-3p,microRNA


## HER2-enriched

In [7]:
# Build the HER2-enriched tumor tissue association network: infer the
# associations of interest and write the group's edges and nodes CSV files
files = create_network_files('HER2-enriched')

In [8]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,weight


In [9]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type


## Luminal A

In [10]:
# Build the Luminal A tumor tissue association network: infer the associations
# of interest and write the group's edges and nodes CSV files
files = create_network_files('Luminal A')

In [11]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,weight
0,hsa-miR-16-5p,hsa-miR-192-5p,0.11
1,hsa-miR-17-5p,hsa-miR-20a-5p,0.27
2,hsa-miR-20a-5p,hsa-miR-92a-3p,0.25
3,hsa-miR-33a-5p,hsa-miR-33b-5p,0.2
4,hsa-miR-29b-3p,hsa-miR-29c-3p,0.11
5,hsa-miR-30c-5p,hsa-miR-324-5p,0.5
6,hsa-miR-141-3p,hsa-miR-200a-3p,0.2
7,hsa-miR-365a-3p,hsa-miR-365b-3p,1.0
8,hsa-miR-148b-3p,hsa-miR-193b-3p,0.17
9,hsa-miR-148b-3p,hsa-miR-33b-5p,0.12


In [12]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-miR-16-5p,microRNA
1,mir2,hsa-miR-17-5p,microRNA
2,mir3,hsa-miR-20a-5p,microRNA
3,mir4,hsa-miR-33a-5p,microRNA
4,mir5,hsa-miR-29b-3p,microRNA
5,mir6,hsa-miR-30c-5p,microRNA
6,mir7,hsa-miR-141-3p,microRNA
7,mir8,hsa-miR-365a-3p,microRNA
8,mir9,hsa-miR-148b-3p,microRNA
9,mir10,hsa-miR-193b-3p,microRNA


## Luminal B

In [13]:
# Build the Luminal B tumor tissue association network: infer the associations
# of interest and write the group's edges and nodes CSV files
files = create_network_files('Luminal B')

In [14]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,weight
0,hsa-let-7d-5p,hsa-miR-107,0.25
1,hsa-let-7d-5p,hsa-miR-181b-5p,0.14
2,hsa-let-7d-5p,hsa-let-7i-5p,0.17
3,hsa-let-7d-5p,hsa-miR-155-5p,0.11
4,hsa-miR-15a-5p,hsa-miR-30e-5p,1.0
5,hsa-miR-18a-5p,hsa-miR-181b-5p,0.18
6,hsa-miR-18a-5p,hsa-miR-502-3p,0.15
7,hsa-miR-26a-5p,hsa-miR-26b-5p,0.33
8,hsa-miR-96-5p,hsa-miR-182-5p,0.11
9,hsa-miR-30c-5p,hsa-miR-429,0.14


In [15]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-let-7d-5p,microRNA
1,mir2,hsa-miR-15a-5p,microRNA
2,mir3,hsa-miR-18a-5p,microRNA
3,mir4,hsa-miR-26a-5p,microRNA
4,mir5,hsa-miR-96-5p,microRNA
5,mir6,hsa-miR-30c-5p,microRNA
6,mir7,hsa-miR-181b-5p,microRNA
7,mir8,hsa-miR-212-3p,microRNA
8,mir9,hsa-miR-221-3p,microRNA
9,mir10,hsa-miR-222-3p,microRNA


## Paired Normal

In [16]:
# Build the Paired Normal tissue association network: infer the associations
# of interest and write the group's edges and nodes CSV files
files = create_network_files('Paired Normal')

In [17]:
# Print the DataFrame of edges for the group network
files['edges']

Unnamed: 0,source,target,weight
0,hsa-let-7b-5p,hsa-let-7c-5p,0.38
1,hsa-miR-15a-5p,hsa-miR-16-5p,0.23
2,hsa-miR-17-5p,hsa-miR-93-5p,0.12
3,hsa-miR-17-5p,hsa-miR-106a-5p,0.12
4,hsa-miR-17-5p,hsa-miR-20b-5p,0.12
5,hsa-miR-26b-5p,hsa-miR-1271-5p,0.25
6,hsa-miR-29a-3p,hsa-miR-29c-3p,0.25
7,hsa-miR-30a-5p,hsa-miR-30d-5p,0.14
8,hsa-miR-30a-5p,hsa-miR-30e-5p,0.21
9,hsa-miR-33a-5p,hsa-miR-130a-3p,0.12


In [18]:
# Print the DataFrame of nodes for the group network
files['nodes']

Unnamed: 0,id,label,type
0,mir1,hsa-let-7b-5p,microRNA
1,mir2,hsa-miR-15a-5p,microRNA
2,mir3,hsa-miR-17-5p,microRNA
3,mir4,hsa-miR-26b-5p,microRNA
4,mir5,hsa-miR-29a-3p,microRNA
...,...,...,...
56,mir57,hsa-miR-365b-3p,microRNA
57,mir58,hsa-miR-378c,microRNA
58,mir59,hsa-miR-503-5p,microRNA
59,mir60,hsa-miR-299-5p,microRNA
