In [11]:
import csv
import plotly.offline as py
import plotly.graph_objects as go
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from networkx.algorithms.community import k_clique_communities
from networkx.algorithms import community

## Load PPI edges

In [3]:
# Folder whose entries are scanned for PPI database files. The path is
# relative, so it is resolved against the notebook's working directory.
ppi_folder = "PPI_databases/"

In [4]:
# Edge accumulator: maps the column-0 token of each parsed record to the set
# of column-5 tokens seen together with it (filled by the loading loop below).
ppi_edges_dict = {}

In [5]:
# Merge the edges of every ".sig" file found in ppi_folder into ppi_edges_dict.
# Only whitespace-separated records with exactly 13 fields are used; field 0 is
# the key and field 5 the value added to that key's set (downstream cells treat
# them as the two genes of an interaction).
import os

for file_name in os.listdir(ppi_folder):
    # Progress log: the entry being considered and the number of distinct
    # keys collected so far.
    print(file_name)
    print(len(ppi_edges_dict))
    # Filter on the name *before* opening, so stray entries such as .DS_Store
    # (or sub-directories) are never opened at all.
    if not file_name.endswith(".sig"):
        continue
    # os.path.join works whether or not ppi_folder ends with a separator.
    with open(os.path.join(ppi_folder, file_name), "r") as file:
        # Stream line by line instead of loading the whole file into memory.
        for line in file:
            data = line.split()
            if len(data) != 13:
                # Blank or malformed record: skip it.
                continue
            ppi_edges_dict.setdefault(data[0], set()).add(data[5])

MIPS.sig
0
BioGRID.sig
346
.DS_Store
7013
figeys.sig
7013
HPRD.sig
7174
huMAP.sig
9818
IntAct_filtered_ppi_2017_07_13.sig
12056
KEGG.sig
12335
iREF_02-2018_Mouse-Human_KINASES_PPI.sig
12595
innateDB_filtered_ppi_2017_07_12.sig
13462
BioGRID_filtered_ppi_2018_02_26.sig
13515
DIP_filtered_ppi_2017_07_12.sig
13515
BioPLEX_ppi_2017_06_07.sig
13583
SNAVI.sig
14826
predictedPPI.sig
14888
Stelzl.sig
14934
BIND.sig
15410
pdzbase.sig
15813
ppid.sig
15813
MINT-03-2018_Mouse-Human_PPI.sig
15829
Biocarta.sig
16905


In [7]:
# Flatten the adjacency dict into a list of (key, neighbour) tuples, one tuple
# per entry of each key's set, in dict / set iteration order.
ppi_edges_list = [
    (source, target)
    for source, targets in ppi_edges_dict.items()
    for target in targets
]

print(len(ppi_edges_list))

282532


In [8]:
# write the resulting list to a file
# newline="" leaves line endings entirely to csv.writer, so the requested "\n"
# terminator is written unchanged on every platform (without it, text mode on
# Windows would turn each "\n" into "\r\n").
with open("ppi_edges_list.csv", "w", newline="") as f_out:
    writer = csv.writer(f_out, delimiter=',', lineterminator='\n')
    writer.writerows(ppi_edges_list)



In [10]:
# Read the edge list back from disk (presumably as a sanity check of the file
# just written). header=None because the CSV holds data rows only, so pandas
# labels the two columns 0 and 1.
# NOTE: the name `df` is rebound to the correlation matrix further down.
df = pd.read_csv("ppi_edges_list.csv", header=None)
df.head()

Unnamed: 0,0,1
0,S100A8,LGALS7B
1,S100A8,IGSF21
2,S100A8,IVL
3,S100A8,SERPINB3
4,S100A8,NCF2


## Load gene-gene coexpression edges

In [12]:
# Load the gene-gene correlation table from a Feather file (this rebinds `df`).
# The printed shape should be square: the ranking cell below labels the rows
# with the column names, which requires as many rows as columns.
df = pd.read_feather("human_correlation_feather")
print(df.shape)
df.head()

(26415, 26415)


Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A2MP1,A4GALT,A4GNT,AAAS,AACS,AACSP1,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,1.0,0.311017,0.074197,0.011767,0.015465,-0.083539,0.02419,-0.023043,0.116815,0.005853,...,-0.020775,-0.065653,-0.034081,-0.002724,-0.02202,0.101489,0.020808,-0.093609,-0.02596,0.003271
1,0.311017,1.0,0.314577,-0.001876,-0.000682,-0.065426,0.020387,-0.047918,0.046401,-0.004944,...,-0.079724,-0.07168,-0.018917,0.047886,0.00318,0.002353,-0.007614,-0.086449,0.018525,0.003854
2,0.074197,0.314577,1.0,-0.028479,-0.017056,0.014158,0.014082,-0.021786,-0.059868,-0.022951,...,-0.097068,-0.067448,0.00196,0.018967,0.046186,-0.111606,-0.013873,0.028855,0.021196,-0.0371
3,0.011767,-0.001876,-0.028479,1.0,0.007315,0.038877,-0.005643,-0.02481,0.058989,0.031632,...,-0.021674,-0.047888,0.005581,0.0093,-0.008702,0.031462,0.01379,-0.05519,0.001249,0.018487
4,0.015465,-0.000682,-0.017056,0.007315,1.0,-0.035422,-0.008135,-0.002369,-0.003118,0.0114,...,-0.039716,-0.023186,0.007086,-0.010465,0.026512,0.061912,0.003802,-0.027152,0.010331,0.035132


In [13]:
# create array of gene names
names = df.columns.tolist()

# The ranking below labels each column's rows with the column names, which is
# only meaningful when the table has one row per column, in the same order.
assert df.shape[0] == len(names), "correlation matrix must be square"

# initialize dict: gene -> names of its most correlated genes, best first
top_corr = {}

top_n = 10

for gene in names:
    # Work on a copy so the Series handed out by df[...] is not re-labelled
    # in place.
    col = df[gene].copy()
    col.index = names
    col = col.sort_values(ascending=False)
    # Inspect top_n + 1 candidates so the gene's own entry can be dropped,
    # then cap at top_n. Without the cap, a gene that is missing from its own
    # top_n + 1 (a tie at the top, or a NaN self-correlation sorted last)
    # would get one extra entry, and lists of unequal length cannot be
    # assembled into a DataFrame afterwards.
    top = [x for x in col.index.values[0:top_n + 1] if x != gene][:top_n]
    top_corr[gene] = top

In [14]:
# One column per gene; rows run from the most to the least correlated of the
# retained genes. Building the frame requires every list in top_corr to have
# the same length.
df_top = pd.DataFrame(top_corr)
# Persist the table; index=False keeps the positional row index out of the CSV.
df_top.to_csv(f"top_{top_n}_correlation.csv", encoding='utf-8', index=False)
df_top.head()

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A2MP1,A4GALT,A4GNT,AAAS,AACS,AACSP1,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,ITIH1,PAH,ITIH2,PPL,ARRDC5,ALPPL2,METTL7B,SNRPA,SLC35E3,TP53I13,...,CSE1L,MCM6,YBX1P4,CPS1,NSA2,CA5A,PPP1R3A,GNAI2,GAREM1,MAP2K6
1,AGXT,SLC2A2,AHSG,TGM1,RP11.927P21.9,KLRG2,SSTR5,RUVBL2,PEX26,RENBP,...,NPM1,TIMELESS,RP11.475I24.1,CKAP2P1,CICP27,FAM98C,TXLNB,TGFB1,DHRS12,HSBP1
2,CYP4A11,CPB2,IGFBP1,KLK13,CSPG4P11,FGF4,C2ORF82,PLK1,ZNF793,PRRT4,...,RAN,SAA3P,REM2,GLP2R,RPL10AP5,PCDHB1,TRDN,PXN,ATP5LP3,POT1
3,C8B,APOH,ITIH3,SCEL,SMIM2,NLRP7,KLHL4,CDC20,RPS6KA5,PPOX,...,LDHB,PSMC5,RP5.1147A1.1,RP11.91J3.1,RPL21P3,USHBP1,XIRP2,VASP,GPR162,SNRPGP14
4,SLC25A47,ALB,AGT,KRT78,RP11.123K3.4,DNMT3L,SPIC,KIF22,ZNF556,AQP7P1,...,PAICS,RFC3,SMCR8,MTNR1A,RPL37P1,ZBTB8B,PYGM,SPI1,RPL27A,GOLGA2P2Y


In [21]:
# Convert back to a plain dict: column name -> list of that column's values,
# in row order.
gene_edges_dict = df_top.to_dict('list')