# Attaching HUGO symbols

In [4]:
import networkx as nx
import networkx.algorithms.community as cm
from modules import cytoscape as cs 

# Build the gene graph from the tab-separated weighted edge list.
edge_list_path = '../datasets/gene_edges.tsv'
G = nx.read_weighted_edgelist(edge_list_path, delimiter="\t")
# Keep the graph's nodes; they are the ids looked up on NCBI further down.
genes = G.nodes()

In [12]:
# Copy the nodes into a plain list (the batching code slices it by index).
genes = list(genes)
len(genes)

3498

It is necessary to fetch the name of each gene from the NCBI database and attach this information to the partitions.
This will be done using the e-util esummary from NCBI.

In [30]:
import requests

from xml.etree import ElementTree

def get_genes_names(geneIDs):
    """
    Look up the given gene ids on NCBI (esummary e-util) and
    return the id -> name mapping parsed from the XML answer.
    """
    raw_response = fetch_from_ncbi(geneIDs)
    return extract_genes_name(ElementTree.fromstring(raw_response))
       

def fetch_from_ncbi(geneIDs):
    """
    Call the NCBI esummary endpoint (db=gene) for the given ids and
    return the raw XML response body.

    Parameters
    ----------
    geneIDs : iterable
        Gene identifiers. Each one is converted with ``str`` and the ids
        are sent as a single comma-separated ``id`` query parameter.

    Returns
    -------
    bytes
        The undecoded response body (``response.content``).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If no response arrives within the timeout.
    """
    headers = {'Content-Type': 'application/xml'}
    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=gene&'
    # Join the ids so the list carries no trailing comma.
    gene_url = base_url + 'id=' + ','.join(str(gene) for gene in geneIDs)
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(gene_url, headers=headers, timeout=60)
    # Fail here on an HTTP error instead of handing an error page to the XML parser.
    response.raise_for_status()
    return response.content
    
    
def extract_genes_name(tree):
    """
    Build a gene-id -> name map from a parsed esummary response.

    Parameters
    ----------
    tree : xml.etree.ElementTree.Element
        Root element of the response. Its ``DocumentSummarySet`` child is
        expected to hold one ``DocumentSummary`` element per gene.

    Returns
    -------
    dict
        Maps each summary's ``uid`` attribute to the text of its ``Name``
        child. Summaries without a ``Name`` child are left out.

    Raises
    ------
    ValueError
        If the root has no ``DocumentSummarySet`` child.
    """
    dss = tree.find('DocumentSummarySet')
    if dss is None:
        # Without this check the failure would be an opaque AttributeError on None.
        raise ValueError(
            f'response <{tree.tag}> has no DocumentSummarySet element'
        )

    gene_map = {}
    for summary in dss.findall('DocumentSummary'):
        name = summary.find('Name')
        if name is None:
            # One record without a name must not abort the whole batch.
            continue
        gene_map[summary.get('uid')] = name.text
    return gene_map

Retrieve all the gene names using batch requests of 100 IDs each.

In [36]:
# Scratch check: unpacking two dicts into one literal merges them.
a = dict(x=1)
b = dict(y=2)
{**a, **b}

{'x': 1, 'y': 2}

In [58]:
import math

gene_map = {}
number_of_genes = len(genes)
batchsize = 100

# Walk the gene list in consecutive slices of `batchsize` ids. Slicing past the
# end simply yields the shorter final batch, so it needs no special case.
for batch_start in range(0, number_of_genes, batchsize):
    batch = genes[batch_start:batch_start + batchsize]
    # Merge in place rather than rebuilding the whole dict for every batch.
    gene_map.update(get_genes_names(batch))
    

In [None]:
# Show the collected gene-id -> name mapping.
gene_map

In [62]:
import pickle

# Serialize the id -> name map to disk.
# The context manager closes the file even if pickling raises.
with open('hugo-map/gene-id-symbol-map.pickle', 'wb') as f:
    pickle.dump(gene_map, f)

Read the serialization (just to make sure it works)

In [83]:
# Load the id -> name map back from the pickle file.
# NOTE: unpickling can execute arbitrary code, so only load a file that this
# notebook (or another trusted source) produced.
# The context manager closes the file even if loading raises.
with open('hugo-map/gene-id-symbol-map.pickle', 'rb') as f:
    gene_map = pickle.load(f)

For each cluster of each partition, add the gene name (symbol) to its nodes.

In [95]:
from os import listdir
from os.path import isfile, join

partition_path = '../partitions/backup/'

# Full paths of the regular files found directly inside `partition_path`.
partitions_files = [
    candidate
    for candidate in (join(partition_path, entry) for entry in listdir(partition_path))
    if isfile(candidate)
]

In [99]:
import json
import os

# Read every backup partition, tag its elements with the gene symbol and
# write the result under ../partitions/ using the same file name.
for filepath in partitions_files:
    with open(filepath, 'r') as f:
        print(f'analyzing {filepath}')
        partition = json.load(f)
        print('partition size: ', len(partition))

        for cluster in partition:
            for element in cluster:
                element_id = str(element.get('data').get('id'))
                # Ids containing '-' are left untouched
                # (presumably edge ids rather than gene ids; TODO confirm).
                if '-' in element_id:
                    continue
                # Ids missing from the map get a symbol of None.
                element['data']['symbol'] = gene_map.get(element_id)

        serialized = json.dumps(partition)
        target_path = f'../partitions/{os.path.basename(filepath)}'
        with open(target_path, 'w+') as nf:
            nf.write(serialized)

analyzing ../partitions/backup/lvn.json
partition size:  72
analyzing ../partitions/backup/sc.json
partition size:  63
analyzing ../partitions/backup/kl.json
partition size:  128
analyzing ../partitions/backup/gn.json
partition size:  69
analyzing ../partitions/backup/fluidc.json
partition size:  63
analyzing ../partitions/backup/cnm.json
partition size:  92
analyzing ../partitions/backup/n2v.json
partition size:  60
