# Defining the Associations for TCGA-BRCA MicroRNA Networks
TCGA: The Cancer Genome Atlas  
BRCA: Breast Invasive Carcinoma

# Importing Libraries and Configurations

In [1]:
import os
import sys
from collections import defaultdict
from itertools import combinations

import numpy as np
import pandas as pd

# Add project root to Python's path
sys.path.append(os.path.abspath(os.path.join('..', '..')))

from config import (
    CYTOSCAPE_PROCESSED_FILES_DIRS,
    INTERACTION_FILTERING_PARAMETERS,
)

# Function

In [2]:
def _pairwise_jaccard(df_interactions):
    """Compute the Jaccard index for every pair of distinct miRNA nodes.

    The neighbors of a miRNA are the values of the ``mrna`` column on the
    rows where it appears in the ``mirna`` column.

    Args:
        df_interactions: DataFrame with (at least) the columns ``mirna`` and
            ``mrna``, one interaction per row.

    Returns:
        A list of ``(node_a, node_b, jaccard_index)`` tuples, one for each
        unordered pair of miRNAs, in order of first appearance in the input.
    """
    # Determine the set of neighbors for each node. Iterating the two columns
    # directly avoids building a Series per row, as ``iterrows`` would.
    neighbors = defaultdict(set)
    for mirna, mrna in zip(df_interactions['mirna'], df_interactions['mrna']):
        neighbors[mirna].add(mrna)

    # Compute the Jaccard index between each combination of different nodes
    results = []
    for node_a, node_b in combinations(neighbors, 2):
        set_a = neighbors[node_a]
        set_b = neighbors[node_b]

        # |A ∪ B| = |A| + |B| - |A ∩ B|, so the union set is never built
        intersection = len(set_a & set_b)
        union = len(set_a) + len(set_b) - intersection
        jaccard = intersection / union if union else 0.0

        results.append((node_a, node_b, jaccard))

    return results


def define_network_associations(group, *, min_jaccard=0.1):
    """Define the miRNA-miRNA associations of a group and store them as CSV.

    The group's processed directory is looked up in
    ``CYTOSCAPE_PROCESSED_FILES_DIRS`` under the lower-cased group name with
    spaces replaced by hyphens. The filtered interactions file is read from
    that directory and the associations file is written to it; both file
    names come from ``INTERACTION_FILTERING_PARAMETERS``.

    Args:
        group: Group name, e.g. ``'Luminal A'``.
        min_jaccard: Exclusive lower bound on the Jaccard index for a pair to
            be kept. Defaults to ``0.1``.

    Returns:
        DataFrame with the columns ``node_a``, ``node_b`` and
        ``jaccard_index``, restricted to pairs whose index is greater than
        ``min_jaccard`` and sorted by ascending index.

    Raises:
        KeyError: If the group has no entry in
            ``CYTOSCAPE_PROCESSED_FILES_DIRS`` or the interactions file lacks
            the ``mirna`` / ``mrna`` columns.
    """
    # Define the group processed directory path
    dir_base_name = group.lower().replace(' ', '-')
    processed_dir_path = CYTOSCAPE_PROCESSED_FILES_DIRS[dir_base_name]

    # Create a DataFrame for the filtered interactions
    file_path = os.path.join(
        processed_dir_path,
        INTERACTION_FILTERING_PARAMETERS['interactions_file_name'],
    )
    df_interactions = pd.read_csv(file_path)

    # Compute the Jaccard index of every pair of miRNAs
    results = _pairwise_jaccard(df_interactions)

    # Create a DataFrame for the associations of interest
    columns = ['node_a', 'node_b', 'jaccard_index']
    df_associations = (
        pd.DataFrame(results, columns=columns)
        .loc[lambda df: df['jaccard_index'] > min_jaccard]
        .sort_values('jaccard_index', ascending=True)
        .reset_index(drop=True)
    )

    # Store the DataFrame of associations of interest
    file_name = INTERACTION_FILTERING_PARAMETERS['associations_file_name']
    df_associations.to_csv(os.path.join(processed_dir_path, file_name), index=False)

    return df_associations

# Association Definition

## Basal-like

In [3]:
# Compute the associations that will be in the Basal-like tumor tissue network.
# The call also writes the associations file to the group's processed
# directory; the returned DataFrame is shown as the cell output.
define_network_associations('Basal-like')

Unnamed: 0,node_a,node_b,jaccard_index
0,hsa-miR-18a-5p,hsa-miR-19a-3p,0.125
1,hsa-miR-20a-5p,hsa-miR-93-5p,0.125
2,hsa-miR-93-5p,hsa-miR-106b-5p,0.130435
3,hsa-miR-17-5p,hsa-miR-106a-5p,0.136364
4,hsa-miR-18a-5p,hsa-miR-19b-3p,0.142857
5,hsa-miR-200b-3p,hsa-miR-200c-3p,0.157895
6,hsa-miR-20a-5p,hsa-miR-130b-3p,0.166667
7,hsa-miR-29a-3p,hsa-miR-29b-3p,0.166667
8,hsa-miR-221-3p,hsa-miR-222-3p,0.25
9,hsa-miR-27a-3p,hsa-miR-128-3p,0.25


## HER2-enriched

In [4]:
# Compute the associations that will be in the HER2-enriched tumor tissue network.
# The call also writes the associations file to the group's processed
# directory; the returned DataFrame is shown as the cell output.
define_network_associations('HER2-enriched')

Unnamed: 0,node_a,node_b,jaccard_index


## Luminal A

In [5]:
# Compute the associations that will be in the Luminal A tumor tissue network.
# The call also writes the associations file to the group's processed
# directory; the returned DataFrame is shown as the cell output.
define_network_associations('Luminal A')

Unnamed: 0,node_a,node_b,jaccard_index
0,hsa-miR-16-5p,hsa-miR-192-5p,0.111111
1,hsa-miR-193b-3p,hsa-miR-33b-5p,0.111111
2,hsa-miR-29b-3p,hsa-miR-29c-3p,0.114286
3,hsa-miR-148b-3p,hsa-miR-33b-5p,0.125
4,hsa-miR-148b-3p,hsa-miR-193b-3p,0.166667
5,hsa-miR-141-3p,hsa-miR-200a-3p,0.2
6,hsa-miR-33a-5p,hsa-miR-33b-5p,0.2
7,hsa-miR-20a-5p,hsa-miR-92a-3p,0.25
8,hsa-miR-17-5p,hsa-miR-20a-5p,0.272727
9,hsa-miR-30c-5p,hsa-miR-324-5p,0.5


## Luminal B

In [6]:
# Compute the associations that will be in the Luminal B tumor tissue network.
# The call also writes the associations file to the group's processed
# directory; the returned DataFrame is shown as the cell output.
define_network_associations('Luminal B')

Unnamed: 0,node_a,node_b,jaccard_index
0,hsa-miR-96-5p,hsa-miR-182-5p,0.105263
1,hsa-let-7d-5p,hsa-miR-155-5p,0.111111
2,hsa-miR-181b-5p,hsa-miR-155-5p,0.111111
3,hsa-miR-221-3p,hsa-miR-222-3p,0.111111
4,hsa-miR-193a-3p,hsa-miR-193b-3p,0.111111
5,hsa-miR-193b-3p,hsa-miR-374c-5p,0.111111
6,hsa-miR-106b-5p,hsa-miR-502-3p,0.111111
7,hsa-let-7i-5p,hsa-miR-155-5p,0.125
8,hsa-miR-30b-5p,hsa-miR-590-5p,0.133333
9,hsa-miR-128-3p,hsa-miR-362-5p,0.142857


## Paired Normal

In [7]:
# Compute the associations that will be in the Paired Normal tissue network.
# The call also writes the associations file to the group's processed
# directory; the returned DataFrame is shown as the cell output.
define_network_associations('Paired Normal')

Unnamed: 0,node_a,node_b,jaccard_index
0,hsa-miR-181b-5p,hsa-miR-200c-3p,0.103448
1,hsa-miR-205-5p,hsa-miR-34c-5p,0.105263
2,hsa-miR-34c-5p,hsa-miR-146b-5p,0.105263
3,hsa-miR-181c-5p,hsa-miR-181d-5p,0.106383
4,hsa-miR-200a-3p,hsa-miR-429,0.106383
5,hsa-miR-96-5p,hsa-miR-9-5p,0.107143
6,hsa-miR-204-5p,hsa-miR-125b-5p,0.111111
7,hsa-miR-195-5p,hsa-miR-424-5p,0.111111
8,hsa-miR-200b-3p,hsa-miR-200a-3p,0.114754
9,hsa-miR-17-5p,hsa-miR-20b-5p,0.118644
