In [1]:
%load_ext autoreload
%autoreload 2

import sys
# Prepend the local project checkouts to sys.path so they take precedence on
# import (presumably where `openomics` and `moge` live -- TODO confirm).
sys.path.insert(0, "../openTCGA/")
sys.path.insert(0, "../MultiOmicsGraphEmbedding/")

import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse as sps
import pickle
import matplotlib.pyplot as plt

from openomics import MultiOmics, Protein, MessengerRNA, MicroRNA, LncRNA
from openomics.database import *
from openomics.genomics import *

from moge.visualization.data import matrix_heatmap, plot_coo_matrix
from moge.visualization.embedding import visualize_embedding
from moge.network.semantic_similarity import *

In [2]:
# Instantiate the ProteinAtlas resource and pull its expression table, indexed
# by protein name.
# NOTE(review): type="RNA - " presumably selects the columns carrying that
# prefix -- confirm against ProteinAtlas.get_expressions.
proteinatlas = ProteinAtlas()
protein_expressions = proteinatlas.get_expressions(
    index="protein_name",
    type="RNA - ",
)

In [3]:
# Wrap the expression table in a MultiOmics container holding a single Protein
# omic. NOTE: despite the `gtex_data` name, the cohort built here is
# "ProteinAtlas"; the name is kept because later cells read it.
cohort_name = "ProteinAtlas"
gtex_data = MultiOmics(cohort_name)

protein_omic = Protein(
    cohort_name,
    data=protein_expressions,
    gene_index_by="protein_name",
    transposed=True,
    columns=None,
)
gtex_data.add_omic(protein_omic)

# build_samples() is not called here; the previously commented-out call
# referenced a stale `luad_data` name.

Protein (154, 19651) , indexed by: protein_name


In [4]:
# Interaction and annotation resources used by the cells below. The edge_attr
# lists name the interaction columns requested from each database.
string = STRING(edge_attr=["score"])
biogrid = BioGRID(
    edge_attr=[
        "Score",
        "Throughput",
        "Experimental System",
        "Experimental System Type",
    ],
)
disgenet = DisGeNet(
    path="https://www.disgenet.org/static/disgenet_ap1/files/downloads/",
    curated=True,
)
go = GeneOntology()

STRING: ['item_id_a', 'item_id_b', 'mode', 'action', 'is_directional', 'a_is_acting', 'score']



Columns (9,10,19,20) have mixed types.Specify dtype option on import or set low_memory=False.



BioGRID: ['#BioGRID Interaction ID', 'Entrez Gene Interactor A', 'Entrez Gene Interactor B', 'BioGRID ID Interactor A', 'BioGRID ID Interactor B', 'Systematic Name Interactor A', 'Systematic Name Interactor B', 'Official Symbol Interactor A', 'Official Symbol Interactor B', 'Synonyms Interactor A', 'Synonyms Interactor B', 'Experimental System', 'Experimental System Type', 'Author', 'Pubmed ID', 'Organism Interactor A', 'Organism Interactor B', 'Throughput', 'Score', 'Modification', 'Phenotypes', 'Qualifications', 'Tags', 'Source Database']


In [5]:
# Protein: attach STRING attributes and sequences, then GO terms and DisGeNet
# disease associations.
protein_omic = gtex_data.Protein

protein_omic.annotate_genomics(
    string,
    index="protein_name",
    columns=["protein_size", "protein_id", "annotation"],
)
protein_omic.annotate_sequences(string, index="protein_name")

# The GO and DisGeNet lookups below are keyed on "gene_name", so copy the
# protein_name index level into a column of that name first.
protein_omic.annotations["gene_name"] = (
    protein_omic.annotations.index.get_level_values("protein_name")
)
protein_omic.annotate_genomics(database=go, index="gene_name", columns=["go_id"])
protein_omic.annotate_diseases(database=disgenet, index="gene_name")

# Store the transposed expression matrix alongside the annotations.
protein_omic.annotation_expressions = protein_omic.expressions.T

Seq protein_name collisions: 10


In [6]:
# Add ProteinAtlas annotation columns, matched on protein name.
proteinatlas_columns = [
    "Protein class",
    "Uniprot",
    "Chromosome",
    "Position",
    "Antibody",
    "Subcellular location",
]
gtex_data.Protein.annotate_genomics(
    database=proteinatlas,
    index="protein_name",
    columns=proteinatlas_columns,
)

In [7]:
# Split each "|"-delimited go_id string into a list of terms and pass the lists
# through the GeneOntology filter. The (misspelled) result name is kept because
# the following cell reads `filted_go_annotations`.
go_term_lists = gtex_data.Protein.annotations["go_id"].str.split("|")
filted_go_annotations = go.filter_annotation(go_term_lists)
filted_go_annotations

protein_name
A1BG       [GO:0005576, GO:0002576, GO:0008150, GO:004331...
A1CF       [GO:0003725, GO:0005634, GO:0003729, GO:000573...
A2M        [GO:0002576, GO:0007597, GO:0002020, GO:000551...
A2ML1      [GO:0005576, GO:0004867, GO:0002020, GO:000561...
A3GALT2    [GO:0030259, GO:0016757, GO:0032580, GO:004687...
                                 ...                        
ZXDC       [GO:0005634, GO:0030275, GO:0003700, GO:004687...
ZYG11A                                          [GO:0031462]
ZYG11B                  [GO:0031462, GO:0006515, GO:0032436]
ZYX        [GO:0043149, GO:0005634, GO:0005737, GO:000716...
ZZEF1                               [GO:0005509, GO:0008270]
Name: go_id, Length: 19651, dtype: object

In [8]:
# Overwrite go_id with the filtered terms plus their predecessor terms; with
# return_str=True the recorded output shows "|"-joined strings per protein.
gtex_data.Protein.annotations["go_id"] = go.add_predecessor_terms(
    filted_go_annotations,
    return_str=True,
)
gtex_data.Protein.annotations

Unnamed: 0_level_0,gene_name,protein_size,protein_id,annotation,sequence,go_id,disease_associations,Protein class,Uniprot,Chromosome,Position,Antibody,Subcellular location
protein_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
A1BG,A1BG,495,9606.ENSP00000263100,Alpha-1B-glycoprotein; Immunoglobulin like dom...,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,GO:0005576|GO:0002576|GO:0008150|GO:0043312|GO...,"[hepatomegaly, schizophrenia]","Plasma proteins, Predicted intracellular prote...",P04217,19,58345178-58353499,"CAB016673, HPA044252",
A1CF,A1CF,602,9606.ENSP00000378868,APOBEC1 complementation factor; Essential comp...,MEAVCLGTCPEPEASMSTAIPGLKKGNNALQSIILQTLLEKENGQR...,GO:0003725|GO:0005634|GO:0003729|GO:0005737|GO...,,Predicted intracellular proteins,Q9NQ94,10,50799409-50885675,"HPA037779, HPA044079",Nucleoplasm
A2M,A2M,1474,9606.ENSP00000323929,Alpha-2-macroglobulin; Is able to inhibit all ...,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,GO:0002576|GO:0007597|GO:0002020|GO:0005515|GO...,"[alzheimer's disease, malignant tumor of colon...","Cancer-related genes, Candidate cardiovascular...",P01023,12,9067664-9116229,"HPA002265, CAB017621",
A2ML1,A2ML1,1454,9606.ENSP00000299698,Alpha-2-macroglobulin-like protein 1; Is able ...,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...,GO:0005576|GO:0004867|GO:0002020|GO:0005615|GO...,"[noonan syndrome, otitis media, intellectual d...","Predicted intracellular proteins, Predicted se...",A8K2U0,12,8822472-8887001,"HPA038847, HPA038848",
A3GALT2,A3GALT2,340,9606.ENSP00000475261,"Alpha-1,3-galactosyltransferase 2; Synthesizes...",MALKEGLRAWKRIFWRQILLTLGLLGLFLYGLPKFRHLEALIPMGV...,GO:0030259|GO:0016757|GO:0032580|GO:0046872|GO...,,"Enzymes, Predicted membrane proteins",U3KPV4,1,33306766-33321098,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,ZXDC,858,9606.ENSP00000374359,Zinc finger protein ZXDC; Cooperates with CIIT...,MDLPALLPAPTARGGQHGGGPGPLRRAPAPLGASPARRRLLLVRGP...,GO:0005634|GO:0030275|GO:0003700|GO:0046872|GO...,,"Predicted intracellular proteins, Transcriptio...",Q2QGD7,3,126437601-126475919,HPA049593,Nucleoli
ZYG11A,ZYG11A,759,9606.ENSP00000360583,Protein zyg-11 homolog A; Probably acts as tar...,MVHFLHPGHTPRNIVPPDAQKDALGCCVVQEEASPYTLVNICLNVL...,GO:0031462|GO:0031461|GO:1990234|GO:1902494|GO...,,Predicted intracellular proteins,Q6WRX3,1,52842511-52894998,"HPA030378, HPA030379",Nucleoplasm
ZYG11B,ZYG11B,744,9606.ENSP00000294353,Protein zyg-11 homolog B; Probably acts as tar...,MPEDQAGAAMEEASPYSLLDICLNFLTTHLEKFCSARQDGTLCLQE...,GO:0031462|GO:0006515|GO:0032436|GO:1990234|GO...,,Predicted intracellular proteins,Q9C0D3,1,52726467-52827342,HPA028156,"Golgi apparatus,Intermediate filaments"
ZYX,ZYX,572,9606.ENSP00000324422,Zyxin; Adhesion plaque protein. Binds alpha-ac...,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,GO:0043149|GO:0005634|GO:0005737|GO:0007165|GO...,,"Plasma proteins, Predicted intracellular proteins",Q15942,7,143381080-143391111,"HPA004835, CAB009321, HPA073497, CAB075747",Focal adhesion sites


In [9]:
def _join_disease_terms(terms):
    """Collapse one disease_associations cell to a "|"-delimited string.

    Args:
        terms: a list of disease names, an already-joined string, or a
            missing value (e.g. NaN/None).

    Returns:
        "|".join(terms) for a list, the string unchanged if it is already a
        string, and None for anything else.
    """
    if isinstance(terms, list):
        return "|".join(terms)
    if isinstance(terms, str):
        # Already converted by a previous run of this cell: keep it, so that
        # re-running the cell does not wipe the column to None.
        return terms
    return None


gtex_data.Protein.annotations["disease_associations"] = (
    gtex_data.Protein.annotations["disease_associations"].map(_join_disease_terms)
)

In [10]:
# Display the disease_associations column to check the list -> "|"-joined
# string conversion (missing entries show as None).
gtex_data.Protein.annotations["disease_associations"]

protein_name
A1BG                              hepatomegaly|schizophrenia
A1CF                                                    None
A2M        alzheimer's disease|malignant tumor of colon|c...
A2ML1      noonan syndrome|otitis media|intellectual disa...
A3GALT2                                                 None
                                 ...                        
ZXDC                                                    None
ZYG11A                                                  None
ZYG11B                                                  None
ZYX                                                     None
ZZEF1                                                   None
Name: disease_associations, Length: 19651, dtype: object

In [11]:
import pickle

# Persist the annotated MultiOmics object. To reload it instead, open the same
# path with 'rb' and call pickle.load (only on files you trust).
with open('data/proteinatlas_data.pickle', 'wb') as out_file:
    pickle.dump(gtex_data, out_file)

# Build the protein interaction network (STRING)

In [13]:
from moge.network.heterogeneous import HeterogeneousNetwork

# Network over the Protein modality only; nodes come from the MultiOmics object.
network = HeterogeneousNetwork(
    multiomics=gtex_data,
    modalities=["Protein"],
)

Protein  nodes: 19651
Total nodes: 19651
Annotation columns: ['gene_name', 'protein_size', 'protein_id', 'annotation', 'sequence', 'go_id', 'disease_associations', 'Protein class', 'Uniprot', 'Chromosome', 'Position', 'Antibody', 'Subcellular location', 'omic']


In [None]:
# Inspect the network's annotations (no output was recorded for this cell).
network.annotations

In [None]:
# Disabled: BioGRID edges for the single-layer network. STRING edges are added
# in the following cell instead; uncomment to add the BioGRID interactions.
# network.add_edges(biogrid.get_interactions(network.node_list, data=True, inclusive=True), 
#                   directed=False, 
#                   modalities=["Protein", "Protein"], database=biogrid.name())

In [14]:
# Fetch the STRING interactions for the network's nodes (with edge data) and
# add them as undirected Protein-Protein edges.
string_edges = string.get_interactions(network.node_list, data=True, inclusive=True)
network.add_edges(
    string_edges,
    directed=False,
    modalities=["Protein", "Protein"],
    database=string.name(),
)

513817 edges added.


In [15]:
# Create 10 train/test splits stratified on the go_id label.
# NOTE(review): the recorded output carries a scikit-learn warning that
# random_state has no effect when shuffle is False, so `seed` may not influence
# the folds -- confirm inside split_stratified.
network.split_stratified(
    stratify_label="go_id",
    stratify_omic=False,
    directed=False,
    n_splits=10,
    dropna=False,
    seed=42,
    verbose=True,
)

# Persist the split network. To reload it instead, open the same path with 'rb'
# and call pickle.load (only on files you trust).
with open('data/proteinatlas_string_network.pickle', 'wb') as out_file:
    pickle.dump(network, out_file)

full_network 19651 513817
label go_id filtered: 13576 with min_count=10



Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.



train 16508 test 1835
train 16505 test 1838
train 16518 test 1825
train 16508 test 1835
train 16519 test 1824
train 16521 test 1822
train 16508 test 1835
train 16480 test 1863
train 16504 test 1839
train 16516 test 1827


# Build the multiplex network (BioGRID physical/genetic layers + expression-correlation layer)

In [16]:
from moge.network.multiplex import MultiplexAttributedNetwork
from networkx import Graph, DiGraph

# One undirected Protein-Protein layer per relation type; each layer key is a
# (source modality, target modality, relation) tuple mapped to its graph class.
multi_network = MultiplexAttributedNetwork(
    multiomics=gtex_data,
    modalities=["Protein"],
    layers={
        (Protein.name(), Protein.name(), relation): Graph
        for relation in ("physical", "genetic", "correlation")
    },
)

Protein  nodes: 19651 protein_name
Total nodes: 19651
All annotation columns (union): {'protein_id', 'annotation', 'Position', 'protein_size', 'Subcellular location', 'omic', 'Chromosome', 'Uniprot', 'sequence', 'Antibody', 'gene_name', 'disease_associations', 'Protein class', 'go_id'}


In [17]:
# Fetch the BioGRID interactions (with edge data) for the multiplex network's
# nodes, then show how many were returned.
biogrid_edges = biogrid.get_interactions(
    multi_network.node_list,
    data=True,
    inclusive=True,
)
len(biogrid_edges)

343404

In [18]:
# Route each BioGRID interaction to the layer named after its
# "Experimental System Type" value: physical first, then genetic.
for system_type in ("physical", "genetic"):
    typed_edges = [
        (u, v, d)
        for u, v, d in biogrid_edges
        if d["Experimental System Type"] == system_type
    ]
    multi_network.add_edges(
        typed_edges,
        database=biogrid.name(),
        directed=False,
        layer=(Protein.name(), Protein.name(), system_type),
    )

335401 edges added to self.networks[('Protein', 'Protein', 'physical')]
8003 edges added to self.networks[('Protein', 'Protein', 'genetic')]


In [19]:
# Candidate edges from the network's correlation helper for the Protein
# modality, requested with threshold=0.8; then show how many were returned.
ebunch = multi_network.get_correlation_edges(
    modality="Protein",
    node_list=multi_network.node_list,
    threshold=0.8,
)
len(ebunch)

309436

In [20]:
# Add the correlation edges as undirected edges on the "correlation" layer,
# tagged with the ProteinAtlas database name.
correlation_layer = (Protein.name(), Protein.name(), "correlation")
multi_network.add_edges(
    ebunch,
    database="ProteinAtlas",
    directed=False,
    layer=correlation_layer,
)

309436 edges added to self.networks[('Protein', 'Protein', 'correlation')]


In [21]:
# Create 10 train/test splits of the multiplex network stratified on go_id.
# NOTE(review): the recorded output carries a scikit-learn warning that
# random_state has no effect when shuffle is False, so `seed` may not influence
# the folds -- confirm inside split_stratified.
multi_network.split_stratified(
    stratify_label="go_id",
    stratify_omic=False,
    n_splits=10,
    dropna=False,
    seed=42,
    verbose=True,
)

# Persist the split multiplex network. To reload it instead, open the same path
# with 'rb' and call pickle.load (only on files you trust).
with open('data/proteinatlas_biogrid_multi_network.pickle', 'wb') as out_file:
    pickle.dump(multi_network, out_file)

label go_id filtered: 13576 with min_count=10



Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.



train 16508 test 1835
train 16511 test 1832
train 16502 test 1841
train 16505 test 1838
train 16499 test 1844
train 16519 test 1824
train 16509 test 1834
train 16524 test 1819
train 16509 test 1834
train 16501 test 1842
