In [1]:
import networkx as nx
import gzip
from scipy.sparse import csc_array
from newnetprop import netprop
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from cluster_drought_module_greedy import participation_coefficient
from stringInteractions2namedInteractions import create_aliasdict

In [2]:
# Interaction network input. The previous cell content assigned the raw
# STRING download first and then immediately overwrote it, so only the
# TSV path below was ever used. The alternative is kept here for reference:
#   gz/39947.protein.links.v11.5.txt.gz
proteindatafile = "txt/ppi.tsv"

# One seed protein identifier per line.
seedfile = "txt/string_seeds.txt"

In [3]:
# Create graph from file
graph = nx.Graph()

# Open in text mode ("rt") for both cases: gzip.open defaults to binary and
# would hand back bytes, which the str-based parsing below cannot handle.
opener = gzip.open if proteindatafile.endswith(".gz") else open
with opener(proteindatafile, "rt") as file:
    # The first line is discarded (treated as a header row).
    next(file, None)
    for line in file:
        fields = line.split()
        if not fields:
            # Tolerate blank lines instead of failing on fields[-1].
            continue
        # First two columns are the edge endpoints; the last column is
        # parsed as an integer edge weight.
        graph.add_edge(fields[0], fields[1], weight=int(fields[-1]))

# Get seed list (one identifier per line, blank lines ignored)
with open(seedfile) as file:
    seeds = {line.strip() for line in file if line.strip()}

# Mark seed proteins; seeds that are not nodes of the graph are skipped
for node in seeds:
    if node in graph:
        graph.nodes[node]['isSeed'] = True

In [4]:
# Do RWR and assign probability scores
# NOTE(review): netprop is a project helper; presumably it returns one score
# per node in the same order as graph.nodes - confirm against newnetprop.
output_probs = netprop(graph,seeds)
# Flatten to a 1-D vector with one entry per row of the result.
output_probs = output_probs.reshape(output_probs.shape[0])

# Bound method kept for compatibility; it is not called in the visible cells.
graphnodes = graph.nodes.items

# Pair each node with its score positionally and store it as the 'rwr'
# node attribute.
outputs = {node: float(score) for node, score in zip(graph.nodes, output_probs)}
nx.set_node_attributes(graph,outputs,'rwr')

# Sparse float64 adjacency matrix of the graph. This line previously carried
# stray leading whitespace, which is an IndentationError at cell top level.
A = csc_array(nx.adjacency_matrix(graph),dtype="float64")


In [5]:
# Alias table built by the project helper from the STRING aliases file.
aliases = create_aliasdict("gz/39947.protein.aliases.v11.5.txt.gz")


def func(x):
    """Return (key, BLAST_UniProt_ID) pairs for every entry of mapping ``x``."""
    return [(key, val['BLAST_UniProt_ID']) for key, val in x.items()]


# Lookup from each alias-table key to its 'BLAST_UniProt_ID' value.
uniprots = dict(func(aliases))

In [6]:
def top_nodes_subgraph(graph,scorelist,topnum=0,minnum=0.05,attr='rwr'):
    """Return the subgraph induced by the best-scoring nodes of ``graph``.

    Parameters
    ----------
    graph : graph-like object
        Must expose ``nodes`` (iterable and indexable by node, yielding an
        attribute dict) and ``subgraph(nodes)``.
    scorelist : sequence of numbers
        Scores used for ranking, matched positionally to ``list(graph.nodes)``.
    topnum : int
        How many of the highest-ranked surviving nodes to keep. With the
        default ``0`` the slice ``[-0:]`` keeps every surviving node.
    minnum : float
        Nodes whose ``attr`` value is below this threshold are dropped.
    attr : str
        Node attribute compared against ``minnum``.

    Returns
    -------
    Whatever ``graph.subgraph`` returns for the selected nodes.
    """
    names = list(graph.nodes)

    # Walk nodes from lowest to highest score, keeping those above threshold.
    ranked = []
    for position in np.argsort(scorelist):
        name = names[position]
        if graph.nodes[name][attr] >= minnum:
            ranked.append(name)

    # The tail of the ascending ranking holds the top-scoring nodes.
    return graph.subgraph(ranked[-topnum:])

In [7]:
# Louvain community detection is randomized; a fixed seed makes the
# clustering (and every ranking derived from it) reproducible on re-run.
LOUVAIN_SEED = 42

# Per-cutoff results: node labels and their participation coefficients,
# both ordered from highest to lowest coefficient.
nodeslist = []
scoreslist = []

# Sweep the size of the top-RWR subgraph from 100 to 500 nodes in steps of 50.
for cutoff in range(100,501,50):
    sub = top_nodes_subgraph(graph,output_probs,cutoff,minnum=0.01)
    louvain = nx.community.louvain_communities(sub, seed=LOUVAIN_SEED)

    # Label every node with the 1-based index of its community.
    attrdict = {
        node: i+1
        for i,nodes_in_clust in enumerate(louvain)
        for node in nodes_in_clust
    }
    # sub comes from graph.subgraph; per the networkx documentation node
    # attributes of such a view are shared with the parent graph.
    nx.set_node_attributes(sub,attrdict,'louvain')

    # NOTE(review): project helper; the code below relies on it returning a
    # graph whose nodes carry a 'louvain_PC' attribute - confirm.
    clustered = participation_coefficient(sub,'louvain')

    # Rename nodes using the uniprots mapping where an entry exists.
    relabeled = nx.relabel_nodes(clustered,uniprots)

    # Sort by (coefficient, label) descending so ties break on the label.
    temp = [(relabeled.nodes[node]['louvain_PC'],node) for node in relabeled.nodes]
    temp.sort(reverse=True)

    nodeslist.append([pair[1] for pair in temp])
    scoreslist.append([pair[0] for pair in temp])

In [8]:
# Shallow copies of the per-cutoff results (the inner lists are shared).
trimmednodes = list(nodeslist)
trimmedscores = list(scoreslist)

# Every distinct node label that appears in at least one run.
allnodes = list({item for nodelist in trimmednodes for item in nodelist})

In [9]:
# Build one row per run with a column for every node in allnodes; nodes that
# are absent from a run keep the default score 0.
# Column positions are precomputed once so each lookup is constant time
# instead of a linear allnodes.index() scan inside the inner loop.
column_of = {node: col for col, node in enumerate(allnodes)}

scores = []
for run_nodes, run_scores in zip(trimmednodes, trimmedscores):
    runlist = [0]*len(allnodes)
    for node, value in zip(run_nodes, run_scores):
        runlist[column_of[node]] = value
    scores.append(runlist)

In [10]:
# Rows are runs, columns are node labels.
df = pd.DataFrame(scores)
df.columns = allnodes

# Column totals divided by the number of runs, i.e. the per-node average
# where a node missing from a run contributes 0.
test = df.sum(axis=0).div(len(df.index))

# Highest average first.
bestones = test.sort_values(ascending=False)

In [11]:
# Invert the uniprots mapping. If several keys share one value, the key seen
# last wins.
reversed_uniprots = {value: key for key, value in uniprots.items()}

# Translate each ranked label back to its original identifier.
# nx.relabel_nodes leaves nodes that are missing from the mapping under their
# original label, so such labels are already original identifiers: fall back
# to the label itself instead of raising KeyError.
stringIDsOfHubs = [reversed_uniprots.get(i, i) for i in bestones.index]

# Flag the ranked nodes that were part of the seed set.
isSeed = [hub in seeds for hub in stringIDsOfHubs]

In [12]:
# Single-column frame of the averaged scores; the unnamed Series becomes
# column 0.
avgdf = bestones.to_frame()
# Column 2 holds the seed flag for each row.
avgdf.loc[:, 2] = isSeed

In [13]:
# Show only the rows whose averaged score (column 0) exceeds 0.5.
avgdf.loc[avgdf[0] > 0.5]

Unnamed: 0,0,2
SIK1_ORYSJ,1.0,True
Q0J3I9_ORYSJ,0.637768,False
Q6ZLF1_ORYSJ,0.637768,False
Q650T3_ORYSJ,0.608993,False
Q3V826_ORYSJ,0.567654,False
B7F9I5_ORYSJ,0.561168,False
SLR1_ORYSJ,0.561168,False
A0A0P0VDV3_ORYSJ,0.54938,False
A0A0N7KEK0_ORYSJ,0.54938,False
Q7XR67_ORYSJ,0.548405,False


In [14]:
# Number of rows in avgdf.
avgdf.shape[0]

313