### Recreate GIANT Network (Tissue Specific Edges Only, with Hugo Annotations)

In [1]:
# Unzip Tissue specific (Top Edges) network from NETwas paper 
# (http://hb.flatironinstitute.org/download)

import gzip
import shutil

with gzip.open('brain_top.gz', 'rb') as f_in:
    with open('brain_top.txt', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
        f_in.close()
        f_out.close()


In [1]:
import pandas as pd
import numpy as np

In [2]:
import ndex2
import matplotlib.pyplot as plt
import networkx as nx

In [3]:
# first import network table
with open('brain_top.txt', 'r') as f:
    nw_net = pd.read_table(f, sep = '\t', header=None)
f.close()
    
nw_net.shape

(41668864, 3)

In [4]:
# sanity check to make sure it imported correctly 
# nw_net.rename(columns={0:"Node1", 1:"Node2", 2:"Post_Prob"}, inplace=True)
nw_net.iloc[0:1,:]

Unnamed: 0,0,1,2
0,1,100008589,0.101838


In [5]:
# From https://www.genenames.org/cgi-bin/download get Entrez to Hugo mappings, create dictionary

# define new dict hugo to ensembl
h_e = {}

# start adding to dict
with open('hugo_to_entrez.txt', 'r') as f:
    for line in f:
        (val,key) = line.split("\t")
        keys = key.strip("\n")
        h_e[keys] = val
        
f.close()

In [6]:
list(h_e.values())[1:5]

['A1BG', 'A1BG-AS1', 'A1CF', 'A2M']

deal with a few weird cases...

In [7]:
# get rid of key from header line 
del h_e['Entrez Gene ID']

In [8]:
h_e['']
# manual lookup gives gene ID 106480342

'ZYXP1'

In [9]:
# delete this key
del h_e['']

In [10]:
# make sure ID not already in values
106480342 in list(h_e.values())

False

In [11]:
# manually re-assign
h_e[106480342] = 'ZYXP1'

In [12]:
# make sure no duplicate genes in h_e
len(np.unique(list(h_e.values()))),len(h_e.values())

(38719, 38719)

In [13]:
# free up memory
# del nw_net

In [14]:
# make new network for networkX object for nw_net 
with open('brain_top.txt', 'rb') as f:
    nwnet = nx.read_edgelist(f, delimiter='\t', data=(('post_prob',float),))
f.close()

In [15]:
# get h_e in nw_net
not_there = []
there = {}

for n in list(nwnet.nodes()):
    
    # if in is a key of h_e, add to dict
    if n in list(h_e.keys()):
        there[n] = h_e[n]
    
    # if not, add to list 
    else:
        not_there.append(n)

len(not_there)
# there are 2650 nodes not in the h_e dict

2650

In [16]:
# check on these... 
not_there[0:5]

['100124402', '100126583', '100127889', '100127910', '100127955']

In [17]:
# add node attribute with fake names
nx.set_node_attributes(nwnet, values='', name='name')

In [18]:
# update node attribute with node names that are there
nx.set_node_attributes(nwnet, values=there, name='name')

In [19]:
# check that nodes now have names - list nodes 
list(nwnet.nodes())[1:10]

['100008589',
 '100009613',
 '10002',
 '100033414',
 '100037417',
 '100049587',
 '100101121',
 '100101933',
 '100124332']

In [21]:
[n for n in nw_nodelist if nw_nodelist.count(n) >1]

NameError: name 'nw_nodelist' is not defined

In [20]:
# check that nodes now have names - list corresponding names
nodenames = list(nx.get_node_attributes(nwnet, "name").values())
nodenames[1:10]

['RNA28SN5',
 'LINC02584',
 'NR2E3',
 'SNORD116-2',
 'DDTL',
 'SIGLEC14',
 'TTTY23B',
 'GNG5P4',
 'YBX2P1']

In [21]:
# iterate over nodes and add its node name 
#not_there = []

#for n in list(nwnet.nodes()):
    
    #if n in list(h_e.keys()):
        #nx.set_node_attributes(nwnet, values=h_e[n], name='name')
    #else:
        #not_there.append(n)

#len(not_there)

In [22]:
# save the not_there list to check ncbi for any updated IDs 
# (the network is 1 year old, so there may have neen changes.)
not_there = pd.DataFrame(not_there)

with open('not_there.txt', 'w') as f:
    not_there.to_csv(f, sep='\n',header=False, index=False)
    
f.close()

In [23]:
# used rentrez in R to generate the following file (see notebook) 
# to analyze the HGNC ID and the ENSEMBL gene ID for each of the not_there genes.

# upload file
with open('now_there.csv', 'r') as f:
    now_there = pd.read_csv(f, sep=' ', header=None)
    
f.close()

In [24]:
# check file
now_there.iloc[0:3,:]

Unnamed: 0,0,1
0,100124402,LOC100124402
1,100126583,LOC100126583
2,100127889,C10orf131


In [25]:
# make sure we still have 2650 
len(now_there)

2650

In [26]:
# make dict wit these ids
h_en = {}

# start adding to dict
for line in range(0,len(now_there)):
    h_en[str(now_there.iloc[line,0])] = now_there.iloc[line,1]
        
f.close()
len(list(h_en.keys()))

2650

In [27]:
# update node attribute with node names that are now there
nx.set_node_attributes(nwnet, values=h_en, name='name')

In [28]:
# get dict of nodenames
nw_nodedict = nx.get_node_attributes(nwnet, "name")
# get list of node names
nw_nodelist = list(nw_nodedict.values())

In [29]:
# make sure no duplicate node names
# there must be some duplicates that arose after adding the second round of names  
len(np.unique(nw_nodelist)),len(nw_nodelist)

(25668, 25689)

In [30]:
# check that some of the ones that weren't there are now there 
nodenames = nx.get_node_attributes(nwnet, "name")
nodenames['100124402'] # looks good

'LOC100124402'

In [31]:
# check to make sure post_prob (edge wts) there
# probs = list(nx.get_edge_attributes(nwnet, "post_prob").values())
# probs[1:10]

In [32]:
# del probs

In [33]:
# write pickle 
with open('nwnet', 'wb') as f:
    nx.write_gpickle(nwnet, f)
    
f.close()

In [None]:
# create cx to upload to ndex 
# nwnet_2upload = ndex2.create_nice_cx_from_networkx(nwnet)

In [None]:
# from ndex2 import niceCXNetwork as ndw

In [None]:
# upload to ndex
# nwnet_2upload.upload_to('http://test.ndexbio.org', 'ensilva', 'strgrl18')