In [1]:
import networkx as nx
import numpy as np
import pandas as pd
from collections import defaultdict
import sys
sys.path.append('./utils/')


import tqdm
import separation

In [3]:
def connected_component_subgraphs(G, copy=True):
    """Yield one subgraph of ``G`` per connected component.

    Stand-in for the helper of the same name that newer networkx
    releases no longer ship.

    Parameters
    ----------
    G : graph
        Graph passed to ``nx.connected_components``.
    copy : bool, default True
        When true, each yielded subgraph is the result of ``.copy()``;
        otherwise the object returned by ``G.subgraph`` is yielded as is.

    Yields
    ------
    One subgraph per component, in the order ``nx.connected_components``
    produces them.
    """
    for component_nodes in nx.connected_components(G):
        component = G.subgraph(component_nodes)
        yield component.copy() if copy else component

In [4]:
# Input files, all relative to the notebook's working directory.

# PPI edge list (CSV); read below via its proteinA_entrezid / proteinB_entrezid columns.
interactome_file = "./data/PPI_Dataset.csv"

# Protein annotation table (CSV); read below via its Symbol / GeneID columns.
mapping_file = "./data/interactome_2019_merged_protAnnots.csv"

# Tab-separated gene sets: name in the 2nd field, integer gene IDs from the 3rd field on.
disease_genes_file = "./data/TCM_Syndrome 5.tsv"

In [5]:
# Build the undirected PPI graph: one edge per row of the interactome table,
# with nodes keyed by the proteinA_entrezid / proteinB_entrezid columns.
ppi_table = pd.read_csv(interactome_file)
edges = zip(ppi_table.proteinA_entrezid, ppi_table.proteinB_entrezid)
hi = nx.Graph()
hi.add_edges_from(edges)

## consider only LCC
# nx.connected_components makes no promise about ordering by size, so taking
# element [0] is not guaranteed to be the largest component; pick it explicitly.
lcc_nodes = max(nx.connected_components(hi), key=len)
G = hi.subgraph(lcc_nodes).copy()
print (len(G.nodes()), len(G.edges()))

17341 311497


In [6]:
# Annotation table, using its first column as the index.
mapping = pd.read_csv(mapping_file, index_col=0)

# Lookup from the Symbol column to the GeneID column; if a symbol repeats,
# the last row wins.
symbol2entrez = dict(zip(mapping.Symbol, mapping.GeneID))

In [7]:
# Parse the tab-separated gene-set file into {name: [gene IDs as int]}.
# Field 0 is ignored, field 1 is the set name, fields 2+ are the gene IDs.
# Only sets with more than 19 genes are kept.
disease2genes = {}
with open(disease_genes_file) as handle:  # context manager closes the file
    for line in handle:
        fields = line.rstrip().split('\t')
        if len(fields) < 3:
            # Blank line or a record with no genes: it could never pass the
            # size filter, and indexing fields[1] on a blank line would raise.
            continue
        disease = fields[1]
        genes = fields[2:]
        if len(genes) > 19:
            disease2genes[disease] = [int(gene) for gene in genes]

In [8]:
# disease2genes

In [9]:
# Pool the genes of every retained set into one de-duplicated list.
# A set union avoids the quadratic list copying of sum(list_of_lists, []).
diseasegenes = list(set().union(*disease2genes.values()))
len(diseasegenes)

2835

In [10]:
# Hand-entered symbol -> Entrez ID pairs merged into the lookup
# (presumably symbols absent from the mapping file; confirm).
dic = dict(CEP43=11116, MTARC1=64757)
symbol2entrez.update(dic)

In [11]:
# HCC gene table; its EntrezID column is used as gene set A further down.
dt = pd.read_csv(filepath_or_buffer='./data/HCC_Dataset.csv')


# Network separation $S_{AB}$ between the HCC gene set and each syndrome gene set

In [12]:
# Separation s_AB = d_AB - (d_A + d_B) / 2 between the HCC gene set (A) and
# each syndrome gene set (B), with both restricted to nodes present in G.
network_nodes = set(G.nodes())

# Set A and its within-set distance are the same for every syndrome, so they
# are computed once rather than on every iteration.
# NOTE(review): assumes separation.calc_single_set_distance is deterministic
# and does not modify G; confirm against utils/separation.py.
genes_A = set(dt.EntrezID) & network_nodes
d_A = separation.calc_single_set_distance(G,genes_A)

sab = {}
for disease in tqdm.tqdm(disease2genes.keys()):
    genes_B = set(disease2genes[disease]) & network_nodes
    d_B = separation.calc_single_set_distance(G,genes_B)

    # distances BETWEEN the two gene sets:
    d_AB = separation.calc_set_pair_distances(G,genes_A,genes_B)

    # calculate separation
    s_AB = d_AB - (d_A + d_B)/2.
    sab[disease] = s_AB

100%|██████████| 5/5 [06:05<00:00, 73.03s/it] 


In [13]:
# One row per entry of sab: the dict key goes to 'disease', its value to 'sab'.
dx = (
    pd.DataFrame.from_dict(sab, orient='index')
    .reset_index()
    .set_axis(['disease', 'sab'], axis=1)
)
dx.head()

Unnamed: 0,disease,sab
0,syndrome of liver depression and spleen defici...,-0.028361
1,syndrome of liver-gallbladder dampness heat,0.429812
2,syndrome of liver heat and blood stasis,0.008063
3,syndrome of spleen deficiency and dampness sta...,0.011633
4,syndrome of liver-kidney yin deficiency,0.082561


In [18]:
# Write the separation table to disk (the row index is written as the first column).
dx.to_csv(path_or_buf='./output/sab_all_syndrome_to_hcc.csv')