In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, "../openTCGA/")

import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse as sps
import pickle
import matplotlib.pyplot as plt

from openomics import MultiOmics, Protein, MessengerRNA, MicroRNA, LncRNA
from openomics.database import *
from openomics.genomics import *

from moge.visualization.data import matrix_heatmap, plot_coo_matrix
from moge.visualization.embedding import visualize_embedding, plot_bokeh_graph
from moge.network.semantic_similarity import *

In [2]:
# Read the rna_tissue_gtex.tsv table and rename its gene columns in one
# expression, so the loaded frame is never mutated in place.
gtex = pd.read_table(
    "/home/jonny/Bioinformatics_ExternalData/ProteinAtlas/rna_tissue_gtex.tsv"
).rename(columns={"Gene": "protein_id", "Gene name": "protein_name"})

# Reshape to one row per protein_name and one column per Tissue, filled with
# the "NX" values.
gtex_nx = gtex.pivot_table(
    values="NX",
    index=["protein_name"],
    columns="Tissue",
)
gtex_nx

Tissue,adipose tissue,adrenal gland,amygdala,basal ganglia,breast,cerebellum,cerebral cortex,"cervix, uterine",colon,endometrium,...,skeletal muscle,skin,small intestine,spinal cord,spleen,stomach,testis,thyroid gland,urinary bladder,vagina
protein_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0
A1CF,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,2.4,0.1,...,0.1,0.1,4.4,0.1,0.1,0.2,0.1,0.1,0.1,0.1
A2M,49.8,17.7,9.7,12.2,39.6,3.3,6.7,30.5,22.6,41.1,...,12.6,10.2,23.1,16.1,25.3,25.8,5.1,29.5,63.7,23.6
A2ML1,0.0,0.0,0.1,0.2,0.0,0.0,0.0,18.1,0.0,0.0,...,0.0,10.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,28.7
A3GALT2,0.4,0.1,0.1,0.2,0.4,1.3,0.1,0.4,0.2,0.5,...,0.2,0.0,0.1,0.2,0.8,0.3,0.0,0.4,0.2,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,13.4,9.7,5.8,6.5,12.6,9.9,5.4,11.8,9.2,14.4,...,28.9,18.0,9.7,7.4,10.4,9.8,9.0,11.2,14.5,11.2
ZYG11A,0.1,0.1,0.1,0.2,0.8,0.7,0.1,0.6,0.1,0.1,...,0.1,0.4,0.2,0.1,0.3,0.1,6.7,4.2,0.5,1.1
ZYG11B,7.8,9.1,10.8,11.1,6.5,21.7,16.1,7.7,10.4,9.9,...,49.8,6.1,5.8,8.4,4.7,6.3,6.1,8.3,9.2,7.0
ZYX,33.0,11.7,7.7,7.2,31.8,17.5,17.1,36.0,56.8,103.6,...,17.2,21.1,23.4,8.1,36.7,28.3,15.0,15.0,42.6,31.0


In [3]:
# Create the cohort container and register the pivoted NX table as its
# Protein modality.
cohort_name = "GTEx"
gtex_data = MultiOmics(cohort_name)
gtex_data.add_omic(
    Protein(
        cohort_name,
        file_path=gtex_nx,  # the in-memory pivot table is passed where a path is expected
        gene_index_by="protein_name",
        transposed=True,
        columns=None,
        genes_col_name=None,
    )
)

Protein (34, 18815) , indexed by: protein_name


In [4]:
# Instantiate the interaction / annotation databases used by the cells below.
# The edge_attr lists name the attributes requested for each interaction.
string = STRING(edge_attr=["score"])
biogrid = BioGRID(
    edge_attr=[
        'Score',
        'Throughput',
        'Experimental System',
        'Experimental System Type',
    ]
)
disgenet = DisGeNet(
    path="https://www.disgenet.org/static/disgenet_ap1/files/downloads/",
    curated=True,
)
go = GeneOntology()

STRING: ['item_id_a', 'item_id_b', 'mode', 'action', 'is_directional', 'a_is_acting', 'score']



Columns (9,10,19,20) have mixed types.Specify dtype option on import or set low_memory=False.



BioGRID: ['#BioGRID Interaction ID', 'Entrez Gene Interactor A', 'Entrez Gene Interactor B', 'BioGRID ID Interactor A', 'BioGRID ID Interactor B', 'Systematic Name Interactor A', 'Systematic Name Interactor B', 'Official Symbol Interactor A', 'Official Symbol Interactor B', 'Synonyms Interactor A', 'Synonyms Interactor B', 'Experimental System', 'Experimental System Type', 'Author', 'Pubmed ID', 'Organism Interactor A', 'Organism Interactor B', 'Throughput', 'Score', 'Modification', 'Phenotypes', 'Qualifications', 'Tags', 'Source Database']


In [10]:
# Protein annotations: every step below acts on the same Protein modality,
# so bind it once.
protein_omic = gtex_data.Protein

# STRING-derived columns and sequences, matched on protein_name.
protein_omic.annotate_genomics(
    string,
    index="protein_name",
    columns=['protein_size', 'protein_id', 'annotation'],
)
protein_omic.annotate_sequences(string, index="protein_name")

# Copy the protein_name index level into a gene_name column; the GO and
# DisGeNet lookups below are keyed on it.
protein_names = protein_omic.annotations.index.get_level_values("protein_name")
protein_omic.annotations["gene_name"] = protein_names

protein_omic.annotate_genomics(database=go, index="gene_name", columns=['go_id'])
protein_omic.annotate_diseases(database=disgenet, index="gene_name")

# Store the transposed expression matrix alongside the annotations.
protein_omic.annotation_expressions = protein_omic.expressions.T

In [6]:
# Split each "|"-delimited go_id string into a list of GO ids, then pass the
# lists through the GeneOntology object's filter.
go_id_lists = gtex_data.Protein.annotations["go_id"].str.split("|")
filted_go_annotations = go.filter_annotation(go_id_lists)
filted_go_annotations

protein_name
A1BG       [GO:1904813, GO:0005615, GO:0002576, GO:004331...
A1CF       [GO:0005783, GO:0005634, GO:0003727, GO:001060...
A2M        [GO:0005615, GO:0002576, GO:0022617, GO:000186...
A2ML1      [GO:0030414, GO:0005615, GO:0002020, GO:005254...
A3GALT2    [GO:0030259, GO:0005794, GO:0005975, GO:003198...
                                 ...                        
ZXDC       [GO:0005634, GO:0070742, GO:0046872, GO:000551...
ZYG11A                                          [GO:0031462]
ZYG11B                  [GO:0031462, GO:0032436, GO:0006515]
ZYX        [GO:0005634, GO:0007160, GO:0007165, GO:000591...
ZZEF1                               [GO:0005509, GO:0008270]
Name: go_id, Length: 18815, dtype: object

In [7]:
# Extend the filtered GO ids with their predecessor terms (requested back as
# strings via return_str=True) and overwrite the go_id column with the result.
go_ids_with_predecessors = go.add_predecessor_terms(
    filted_go_annotations,
    return_str=True,
)
gtex_data.Protein.annotations["go_id"] = go_ids_with_predecessors
gtex_data.Protein.annotations

Unnamed: 0_level_0,gene_name,protein_size,protein_id,annotation,Transcript sequence,go_id,disease_associations
protein_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A1BG,A1BG,495,9606.ENSP00000263100,Alpha-1B-glycoprotein; Immunoglobulin like dom...,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,GO:1904813|GO:0005615|GO:0002576|GO:0043312|GO...,"[hepatomegaly, schizophrenia]"
A1CF,A1CF,602,9606.ENSP00000378868,APOBEC1 complementation factor; Essential comp...,MEAVCLGTCPEPEASMSTAIPGLKKGNNALQSIILQTLLEKENGQR...,GO:0005783|GO:0005634|GO:0003727|GO:0010609|GO...,
A2M,A2M,1474,9606.ENSP00000323929,Alpha-2-macroglobulin; Is able to inhibit all ...,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,GO:0005615|GO:0002576|GO:0022617|GO:0001869|GO...,"[alzheimer's disease, malignant tumor of colon..."
A2ML1,A2ML1,1454,9606.ENSP00000299698,Alpha-2-macroglobulin-like protein 1; Is able ...,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...,GO:0030414|GO:0005615|GO:0002020|GO:0052548|GO...,"[noonan syndrome, otitis media, intellectual d..."
A3GALT2,A3GALT2,340,9606.ENSP00000475261,"Alpha-1,3-galactosyltransferase 2; Synthesizes...",MALKEGLRAWKRIFWRQILLTLGLLGLFLYGLPKFRHLEALIPMGV...,GO:0030259|GO:0005794|GO:0005975|GO:0031982|GO...,
...,...,...,...,...,...,...,...
ZXDC,ZXDC,858,9606.ENSP00000374359,Zinc finger protein ZXDC; Cooperates with CIIT...,MDLPALLPAPTARGGQHGGGPGPLRRAPAPLGASPARRRLLLVRGP...,GO:0005634|GO:0070742|GO:0046872|GO:0005515|GO...,
ZYG11A,ZYG11A,759,9606.ENSP00000360583,Protein zyg-11 homolog A; Probably acts as tar...,MVHFLHPGHTPRNIVPPDAQKDALGCCVVQEEASPYTLVNICLNVL...,GO:0031462|GO:1990234|GO:0005622|GO:0000151|GO...,
ZYG11B,ZYG11B,744,9606.ENSP00000294353,Protein zyg-11 homolog B; Probably acts as tar...,MPEDQAGAAMEEASPYSLLDICLNFLTTHLEKFCSARQDGTLCLQE...,GO:0031462|GO:0032436|GO:0006515|GO:1903362|GO...,
ZYX,ZYX,572,9606.ENSP00000324422,Zyxin; Adhesion plaque protein. Binds alpha-ac...,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,GO:0005634|GO:0007160|GO:0007165|GO:0005912|GO...,


In [12]:
def _join_disease_terms(terms):
    """Collapse one cell's disease associations into a "|"-delimited string.

    Args:
        terms: A cell value from the ``disease_associations`` column.

    Returns:
        ``"|".join(terms)`` for a list, the value unchanged if it is already a
        string, and ``None`` for anything else (e.g. a missing value).
    """
    if isinstance(terms, list):
        return "|".join(terms)
    # Already-joined strings are passed through so that re-running this cell
    # does not replace every association with None.
    if isinstance(terms, str):
        return terms
    return None


gtex_data.Protein.annotations["disease_associations"] = (
    gtex_data.Protein.annotations["disease_associations"].map(_join_disease_terms)
)

In [13]:
# Display the disease_associations column, which at this point is expected to
# hold "|"-joined strings or None.
gtex_data.Protein.annotations["disease_associations"]

protein_name
A1BG                              hepatomegaly|schizophrenia
A1CF                                                    None
A2M        alzheimer's disease|malignant tumor of colon|c...
A2ML1      noonan syndrome|otitis media|intellectual disa...
A3GALT2                                                 None
                                 ...                        
ZXDC                                                    None
ZYG11A                                                  None
ZYG11B                                                  None
ZYX                                                     None
ZZEF1                                                   None
Name: disease_associations, Length: 18815, dtype: object

In [14]:
# Persist the annotated MultiOmics object (pickle is imported in the first
# cell). To restore it in a later session, open the same path with 'rb' and
# call pickle.load -- only on files you trust.
with open('moge/data/ppi_gtex_data.pickle', 'wb') as pickle_file:
    pickle.dump(gtex_data, pickle_file)

# Build network

In [22]:
from moge.network.heterogeneous import HeterogeneousNetwork

# Build a network whose nodes come from the Protein modality of gtex_data.
network = HeterogeneousNetwork(
    multiomics=gtex_data,
    modalities=["Protein"],
)

Protein  nodes: 18815
Total nodes: 18815
Annotation columns: ['gene_name', 'protein_size', 'protein_id', 'annotation', 'Transcript sequence', 'go_id', 'disease_associations', 'omic']


In [16]:
# Fetch the BioGRID interactions for the network's nodes, then add them as
# undirected Protein-Protein edges tagged with the BioGRID database name.
biogrid_interactions = biogrid.get_interactions(
    network.node_list,
    data=True,
    inclusive=True,
)
network.add_edges(
    biogrid_interactions,
    directed=False,
    modalities=["Protein", "Protein"],
    database=biogrid.name(),
)

333187 edges added.


In [23]:
# Fetch the STRING interactions for the network's nodes, then add them as
# undirected Protein-Protein edges tagged with the STRING database name.
string_interactions = string.get_interactions(
    network.node_list,
    data=True,
    inclusive=True,
)
network.add_edges(
    string_interactions,
    directed=False,
    modalities=["Protein", "Protein"],
    database=string.name(),
)

494114 edges added.


In [24]:
# Persist the network. To restore it later, open the same path with 'rb' and
# call pickle.load -- only on files you trust.
# NOTE(review): the recorded execution counts suggest `network` was rebuilt
# after the BioGRID edges were added; under Restart & Run All this file would
# hold both BioGRID and STRING edges -- confirm which contents are intended.
with open('moge/data/gtex_string_network.pickle', 'wb') as pickle_file:
    pickle.dump(network, pickle_file)

# Build multiplex network

In [18]:
from moge.network import MultiplexAttributedNetwork
from networkx import Graph, DiGraph

# One networkx Graph layer per interaction kind; the layer keys are
# (source modality, target modality, kind) tuples.
multi_network = MultiplexAttributedNetwork(
    multiomics=gtex_data,
    modalities=["Protein"],
    layers={
        (Protein.name(), Protein.name(), layer_kind): Graph
        for layer_kind in ("physical", "genetic")
    },
)

Protein  nodes: 18815
Total nodes: 18815
All annotation columns (union): {'Transcript sequence', 'disease_associations', 'annotation', 'omic', 'protein_size', 'go_id', 'protein_id', 'gene_name'}
Annotation columns: ['gene_name', 'protein_size', 'protein_id', 'annotation', 'Transcript sequence', 'go_id', 'disease_associations', 'omic']


In [19]:
# Fetch the BioGRID interactions (with edge data) for the multiplex network's
# nodes once, so they can be split across layers afterwards.
biogrid_edges = biogrid.get_interactions(
    multi_network.node_list,
    data=True,
    inclusive=True,
)
len(biogrid_edges)

333187

In [20]:
# Route each BioGRID interaction to the layer whose kind equals the edge's
# "Experimental System Type" attribute: physical first, then genetic.
for system_type in ("physical", "genetic"):
    layer_edges = [
        (u, v, d)
        for u, v, d in biogrid_edges
        if d["Experimental System Type"] == system_type
    ]
    multi_network.add_edges(
        layer_edges,
        database=biogrid.name(),
        directed=False,
        layer=(Protein.name(), Protein.name(), system_type),
    )

325264 edges added to self.networks[('Protein', 'Protein', 'physical')]
7923 edges added to self.networks[('Protein', 'Protein', 'genetic')]


In [21]:
# Persist the multiplex network. To restore it later, open the same path with
# 'rb' and call pickle.load -- only on files you trust.
with open('moge/data/gtex_biogrid_multi_network.pickle', 'wb') as pickle_file:
    pickle.dump(multi_network, pickle_file)