In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, "../openTCGA/")

import os
import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse as sps
import pickle
import matplotlib.pyplot as plt

from openomics import MultiOmics, Protein, MessengerRNA, MicroRNA, LncRNA
from openomics.database import *
from openomics.genomics import *

from moge.visualization.data_viz import matrix_heatmap, plot_coo_matrix
from moge.visualization.embedding_viz import visualize_embedding, plot_bokeh_graph
from moge.visualization.network_viz import graph_viz
from moge.network.semantic_similarity import *

# Import the TCGA cancer data

In [2]:
# Turn pandas' chained-assignment warning into a hard error so accidental
# writes to a temporary copy fail loudly instead of being silently lost.
pd.options.mode.chained_assignment = 'raise'

In [3]:
# Load the TCGA-Assembler LUAD cohort: clinical table first, then one omic per
# entry of `omic_specs`, and finally align the samples across omics.
cohort_name = "LUAD"
cohort_folder = "/data/datasets/Bioinformatics_ExternalData/tcga-assembler/LUAD/"

luad_data = MultiOmics(cohort_name)
clinical_path = os.path.join(cohort_folder, "clinical/nationwidechildrens.org_clinical_patient.txt")
luad_data.add_clinical_data(clinical_data=clinical_path)

# (omic class, path parts below cohort_folder, gene_index_by, columns regex, genes_col_name)
omic_specs = [
    (Protein, ("protein_rppa/protein_RPPA.txt",), "protein_name", "GeneSymbol|TCGA", "GeneSymbol"),
    (MessengerRNA, ("gene_exp", "geneExp.txt"), "gene_name", "GeneSymbol|TCGA", "GeneSymbol"),
    (MicroRNA, ("mirna/", "miRNAExp__RPM.txt"), "gene_name", "GeneSymbol|TCGA", "GeneSymbol"),
    (LncRNA, ("lncrna", "TCGA-rnaexpr.tsv"), "gene_id", "Gene_ID|TCGA", "Gene_ID"),
]
for omic_cls, path_parts, index_key, column_pattern, gene_column in omic_specs:
    # Each omic is constructed and registered before the next one is read,
    # in the same order as the recorded output (Protein, mRNA, miRNA, lncRNA).
    luad_data.add_omic(omic_cls(cohort_name,
                                file_path=os.path.join(cohort_folder, *path_parts),
                                gene_index_by=index_key,
                                columns=column_pattern,
                                genes_col_name=gene_column))

luad_data.build_samples()





Protein (364, 154) , indexed by: protein_name
MessengerRNA (576, 20472) , indexed by: gene_name
MicroRNA (494, 1870) , indexed by: gene_name
LncRNA (546, 12727) , indexed by: gene_id


In [7]:
# Annotation databases used by the per-omic annotation cells below.
# GENCODE release 32: GTF annotations plus transcript FASTA files.
gencode_files = {
    "long_noncoding_RNAs.gtf": "gencode.v32.long_noncoding_RNAs.gtf.gz",
    "basic.annotation.gtf": "gencode.v32.basic.annotation.gtf.gz",
    "lncRNA_transcripts.fa": "gencode.v32.lncRNA_transcripts.fa.gz",
    "transcripts.fa": "gencode.v32.transcripts.fa.gz",
}
gencode = GENCODE(
    path="ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/",
    file_resources=gencode_files,
    remove_version_num=True,
    replace_U2T=True,
)

# RNAcentral: Rfam/GO annotations and the GENCODE id mapping.
rnacentral_files = {
    "rnacentral_rfam_annotations.tsv": "go_annotations/rnacentral_rfam_annotations.tsv.gz",
    "gencode.tsv": "id_mapping/database_mappings/gencode.tsv",
}
rnacentral = RNAcentral(
    path="ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/",
    file_resources=rnacentral_files,
)

# miRBase hairpin sequences for human (taxonomy id 9606).
mirbase = MirBase(
    path="ftp://mirbase.org/pub/mirbase/CURRENT/",
    sequence="hairpin",
    species="Homo sapiens",
    species_id=9606,
    replace_U2T=True,
)

# These two are built with their default resources.
ensembl = EnsemblGenes()
go = GeneOntology()

# GTEx v8 RNA-seq data, used later by annotate_expressions.
gtex = GTEx(path="https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/")

Fetching file from: ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/ gencode.v32.long_noncoding_RNAs.gtf.gz
Fetching file from: ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/ gencode.v32.basic.annotation.gtf.gz
Fetching file from: ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/ gencode.v32.lncRNA_transcripts.fa.gz
Fetching file from: ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/ gencode.v32.transcripts.fa.gz


INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'tag', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'havana_transcript', 'exon_number', 'exon_id', 'ont']
INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


GENCODE: ['index', 'seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'tag', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']
Fetching file from: ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/ go_annotations/rnacentral_rfam_annotations.tsv.gz
Fetching file from: ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/ id_mapping/database_mappings/gencode.tsv
RNAcentral: ['index', 'RNAcentral id', 'database', 'transcript_id', 'species', 'RNA type', 'gene_name', 'go_id', 'Rfams']
Fetching file from: ftp://mirbase.org/pub/mirbase/CURRENT/ aliases.txt.gz
Fetching file from: ftp://mirbase.org/pub/mirbase/CURRENT/ mature.fa.gz
Fetching file from: ftp://mirbase.org/pub/mirbase/CURRENT/ hairpin.fa.gz
Fetching file from: ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/curr


Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.



EnsemblGenes ['gene_id', 'gene_name', 'transcript_id', 'transcript_name', 'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length', 'gene_biotype', 'transcript_biotype', 'Rfams', 'go_id']
Fetching file from: http://geneontology.org/gene-associations/ goa_human.gaf.gz
Fetching file from: http://purl.obolibrary.org/obo/go/ go-basic.obo
Fetching file from: http://geneontology.org/gene-associations/ goa_human_rna.gaf.gz
Fetching file from: http://geneontology.org/gene-associations/ goa_human_isoform.gaf.gz
GeneOntology: ['index', 'DB', 'gene_id', 'gene_name', 'Qualifier', 'go_id', 'DB:Reference', 'Evidence', 'With', 'Aspect', 'DB_Object_Name', 'Synonym', 'DB_Object_Type', 'Taxon_ID', 'Date', 'Assigned_By', 'Annotation_Extension', 'Gene_Product_Form_ID']
Fetching file from: https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/ GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz
Fetching file from: https://storage.googleapis.com/gtex_analysis_v8/annot

In [3]:
# Interaction datasets, read from local copies under Bioinformatics_ExternalData
# (STRING is built with its default resources).
lncbase = LncBase(
    "/data/datasets/Bioinformatics_ExternalData/lncBase/",
    strip_mirna_name=True,
)
mirtarbase = MiRTarBase(
    path="/data/datasets/Bioinformatics_ExternalData/miRTarBase/",
    strip_mirna_name=True,
)
string = STRING()

# LncRNA2Target: only the high-throughput experiment table is registered.
lncrna2target_files = {
    "lncRNA_target_from_high_throughput_experiments.txt":
        "/data/datasets/Bioinformatics_ExternalData/lncrna2target/lncRNA_target_from_high_throughput_experiments.txt",
}
lncrna2target = LncRNA2Target(
    path="/data/datasets/Bioinformatics_ExternalData/lncrna2target/",
    file_resources=lncrna2target_files,
    version="high_throughput",
)
# biogrid = BioGRID()


Columns (6) have mixed types.Specify dtype option on import or set low_memory=False.



Name: LncRNA2Target
Type: DiGraph
Number of nodes: 28911
Number of edges: 65655
Average in degree:   2.2709
Average out degree:   2.2709


In [9]:
# Disease association datasets, one per omic type annotated below.
# DisGeNet (curated gene-disease associations) is used for MessengerRNA.
disgenet = DisGeNet(path="https://www.disgenet.org/static/disgenet_ap1/files/downloads/", curated=True)
# HMDD is used for MicroRNA.
hmdd = HMDD(path="http://www.cuilab.cn/static/hmdd3/data/")
# LncRNADisease (human entries only) is used for LncRNA.
lncrnadisease = LncRNADisease(path="http://www.cuilab.cn/files/images/ldd/", species="Human")
# MalaCards must be instantiated here: the Protein annotation cell further down
# calls annotate_diseases(malacards, ...) and raises NameError on a fresh kernel
# if this line is left commented out. The recorded output of this cell shows
# that MalaCards was loaded when the notebook was last executed.
malacards = MalaCards()

Fetching file from: https://www.disgenet.org/static/disgenet_ap1/files/downloads/ curated_gene_disease_associations.tsv.gz
Fetching file from: https://www.disgenet.org/static/disgenet_ap1/files/downloads/ all_gene_disease_associations.tsv.gz
DisGeNet: ['index', 'gene_name', 'disease_associations', 'score']
Fetching file from: http://www.cuilab.cn/static/hmdd3/data/ alldata.txt
HMDD: ['index', 'category', 'gene_name', 'disease_associations', 'pmid', 'description']
Fetching file from: http://www.cuilab.cn/files/images/ldd/ data_v2017.txt
LncRNADisease: ['index', 'gene_name', 'disease_associations', 'Dysfunction type', 'Description', 'Chr', 'Start', 'End', 'Strand', 'Species', 'Alias', 'Sequence', 'Reference']
Fetching file from: http://zdzlab.einstein.yu.edu/1/hedd/ download.action.php?filename=DataDownload/MalaCards.csv
MalaCards: ['index', 'id', 'gene_name', 'maladySymbol', 'maladySlug', 'disease_associations', 'score']


In [10]:
# LncRNA: attach genomic, disease and sequence annotations to the lncRNA omic,
# whose rows are indexed by gene_id.
# The steps are order-dependent: the RNAcentral and LncRNADisease joins are keyed
# on `gene_name`, a column requested from Ensembl in the step before them.
# 1) GENCODE genomic features, joined on gene_id.
luad_data.LncRNA.annotate_genomics(gencode, index="gene_id", 
                                   columns=['feature', 'start', 'end', 'strand', 'tag', 'havana_gene'])
# 2) Ensembl gene/transcript attributes, joined on gene_id (brings in gene_name).
luad_data.LncRNA.annotate_genomics(database=ensembl, index='gene_id', 
                                   columns=['gene_name', 'transcript_id', 'transcript_name', 
                                         'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length',
                                         'Rfams', 'go_id', 'gene_biotype', 'transcript_biotype'])
# 3) RNAcentral Rfam / GO annotations, joined on gene_name.
luad_data.LncRNA.annotate_genomics(database=rnacentral, index='gene_name',
                                   columns=['Rfams', 'go_id', 'gene_name'])
# 4) Disease associations from LncRNADisease, joined on gene_name.
luad_data.LncRNA.annotate_diseases(lncrnadisease, index="gene_name", )
# 5) Transcript sequences from GENCODE; agg_sequences="longest" presumably keeps
#    the longest transcript per gene — confirm against openomics.
luad_data.LncRNA.annotate_sequences(gencode, index="gene_id", omic="LncRNA", agg_sequences="longest")
# Print a column / non-null summary of the resulting annotation table.
luad_data.LncRNA.annotations.info()

INFO: You can pass in a list of transcript biotypes to filter using the argument 'biotypes'.
<class 'pandas.core.frame.DataFrame'>
Index: 12727 entries, ENSG00000005206 to ENSGR0000270726
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   gene_name             11848 non-null  object
 1   feature               11840 non-null  object
 2   start                 11840 non-null  object
 3   end                   11840 non-null  object
 4   strand                11840 non-null  object
 5   tag                   11840 non-null  object
 6   havana_gene           11840 non-null  object
 7   transcript_id         11848 non-null  object
 8   transcript_name       11848 non-null  object
 9   chromosome_name       11848 non-null  object
 10  transcript_start      11848 non-null  object
 11  transcript_end        11848 non-null  object
 12  transcript_length     11848 non-null  object
 13  Rfams                 71

In [11]:
# MicroRNA: attach genomic, disease and sequence annotations to the miRNA omic.
# Order matters: the miRBase step requests the `RNAcentral id` column that the
# RNAcentral step then uses as its join key.
luad_data.MicroRNA.annotate_genomics(
    database=mirbase, index="gene_name", 
    columns=['mirbase id', 'RNAcentral id', 'database'],)
# RNAcentral transcript / GO / Rfam annotations, joined on `RNAcentral id`.
luad_data.MicroRNA.annotate_genomics(
    database=rnacentral, index="RNAcentral id",
    columns=['transcript_id', 'RNA type', 'go_id', 'Rfams']
)
# Ensembl attributes joined on gene_name. In the recorded output only 2 of the
# 1870 rows have non-null Ensembl columns, so this join matches very few miRNAs.
luad_data.MicroRNA.annotate_genomics(
    database=ensembl, index='gene_name',
    columns=['gene_name', 'transcript_id', 'transcript_name', 
             'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length',
             'Rfams', 'go_id', 'gene_biotype', 'transcript_biotype'])
# Disease associations from HMDD, joined on gene_name.
luad_data.MicroRNA.annotate_diseases(hmdd, index="gene_name", )
# Sequences from miRBase; agg_sequences="all" presumably keeps every sequence
# per miRNA rather than a single one — confirm against openomics.
luad_data.MicroRNA.annotate_sequences(mirbase, index="gene_name", omic="MicroRNA", agg_sequences="all")
# Print a column / non-null summary of the resulting annotation table.
luad_data.MicroRNA.annotations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1870 entries, hsa-let-7a-1 to hsa-mir-99b
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   RNAcentral id         1839 non-null   object
 1   mirbase id            1839 non-null   object
 2   database              1839 non-null   object
 3   transcript_id         487 non-null    object
 4   RNA type              487 non-null    object
 5   go_id                 487 non-null    object
 6   Rfams                 487 non-null    object
 7   transcript_name       2 non-null      object
 8   chromosome_name       2 non-null      object
 9   transcript_start      2 non-null      object
 10  transcript_end        2 non-null      object
 11  transcript_length     2 non-null      object
 12  gene_biotype          2 non-null      object
 13  transcript_biotype    2 non-null      object
 14  disease_associations  934 non-null    object
 15  Transcript sequence   182

In [12]:
# MessengerRNA: attach genomic, sequence and disease annotations to the mRNA
# omic. All joins in this cell are keyed on gene_name.
# RNAcentral transcript / GO / Rfam annotations.
luad_data.MessengerRNA.annotate_genomics(database=rnacentral, index="gene_name",
                                columns=['gene_name', 'transcript_id', 'RNA type', 'go_id', 'Rfams'])
# Ensembl gene/transcript attributes (adds gene_id and genomic coordinates).
luad_data.MessengerRNA.annotate_genomics(database=ensembl, index='gene_name',
                                columns=['gene_id', 'transcript_id', 'transcript_name', 
                                         'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length',
                                         'Rfams', 'go_id', 'gene_biotype', 'transcript_biotype'])
# Transcript sequences from GENCODE, restricted to the protein_coding biotype;
# agg_sequences="longest" presumably keeps the longest transcript per gene — confirm.
luad_data.MessengerRNA.annotate_sequences(gencode, index="gene_name", omic="MessengerRNA", 
                                          agg_sequences="longest", biotypes=["protein_coding"])
# Disease associations from DisGeNet.
luad_data.MessengerRNA.annotate_diseases(disgenet, index="gene_name", )
# Print a column / non-null summary of the resulting annotation table.
luad_data.MessengerRNA.annotations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20472 entries, A1BG to tAKR
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   transcript_id         19443 non-null  object
 1   RNA type              374 non-null    object
 2   go_id                 18242 non-null  object
 3   Rfams                 386 non-null    object
 4   gene_id               19443 non-null  object
 5   transcript_name       19443 non-null  object
 6   chromosome_name       19443 non-null  object
 7   transcript_start      19443 non-null  object
 8   transcript_end        19443 non-null  object
 9   transcript_length     19443 non-null  object
 10  gene_biotype          19443 non-null  object
 11  transcript_biotype    19443 non-null  object
 12  Transcript sequence   18260 non-null  object
 13  disease_associations  8883 non-null   object
dtypes: object(14)
memory usage: 2.3+ MB


In [13]:
# Protein: attach sequence, genomic, GO and disease annotations to the protein
# omic, whose rows are indexed by protein_name.
# Sequences and protein attributes both come from STRING.
luad_data.Protein.annotate_sequences(string, index="protein_name")
luad_data.Protein.annotate_genomics(string, index="protein_name", 
                                    columns=['protein_size', 'protein_id', 'annotation'])
# Copy the protein_name index level into a gene_name column so that the GO and
# disease joins below, which are keyed on gene_name, have a column to match on.
luad_data.Protein.annotations["gene_name"] = luad_data.Protein.annotations.index.get_level_values("protein_name")
luad_data.Protein.annotate_genomics(database=go, index="gene_name", columns=['go_id'])
# Requires a `malacards` database object to be defined before this cell runs.
luad_data.Protein.annotate_diseases(malacards, index="gene_name", )
# Print a column / non-null summary of the resulting annotation table.
luad_data.Protein.annotations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 154 entries, ABL1 to YWHAZ
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   gene_name             154 non-null    object
 1   Transcript sequence   150 non-null    object
 2   protein_size          150 non-null    object
 3   protein_id            150 non-null    object
 4   annotation            150 non-null    object
 5   go_id                 152 non-null    object
 6   disease_associations  132 non-null    object
dtypes: object(7)
memory usage: 9.6+ KB


In [14]:
# luad_data.LncRNA.annotate_diseases(malacards, index="gene_name", )
# luad_data.MicroRNA.annotate_diseases(malacards, index="gene_name", )
# luad_data.MessengerRNA.annotate_diseases(malacards, index="gene_name", )
# luad_data.Protein.annotate_diseases(malacards, index="gene_name", )

In [15]:
# Attach GTEx expression annotations to each RNA omic, using the identifier
# column that omic is indexed by. Protein is left out (call kept disabled below).
expression_targets = (
    (luad_data.MessengerRNA, "gene_name"),
    (luad_data.LncRNA, "gene_id"),
    (luad_data.MicroRNA, "gene_name"),
)
for rna_omic, join_key in expression_targets:
    rna_omic.annotate_expressions(database=gtex, index=join_key)
# luad_data.Protein.annotate_expressions(database=gtex, index="gene_name")

In [16]:
# Fetch the annotation table of every omic under a short alias.
LNC = luad_data.LncRNA.get_annotations()
MIR = luad_data.MicroRNA.get_annotations()
GE = luad_data.MessengerRNA.get_annotations()
PRO = luad_data.Protein.get_annotations()

# DataFrame.info() writes its summary to stdout itself and returns None, so the
# previous print(df.info()) wrapper only appended a stray "None" line after each
# summary. Call it directly, in the same order as before.
for omic_annotations in (GE, MIR, LNC, PRO):
    omic_annotations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20472 entries, A1BG to tAKR
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   transcript_id         19443 non-null  object
 1   RNA type              374 non-null    object
 2   go_id                 18242 non-null  object
 3   Rfams                 386 non-null    object
 4   gene_id               19443 non-null  object
 5   transcript_name       19443 non-null  object
 6   chromosome_name       19443 non-null  object
 7   transcript_start      19443 non-null  object
 8   transcript_end        19443 non-null  object
 9   transcript_length     19443 non-null  object
 10  gene_biotype          19443 non-null  object
 11  transcript_biotype    19443 non-null  object
 12  Transcript sequence   18260 non-null  object
 13  disease_associations  8883 non-null   object
dtypes: object(14)
memory usage: 3.0+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 187

In [29]:
# Sanity check on the RNA omics: no annotated transcript sequence may still
# contain a "U" (the sequence databases were loaded with replace_U2T=True).
for rna_omic in (luad_data.LncRNA, luad_data.MicroRNA, luad_data.MessengerRNA):
    has_uracil = rna_omic.annotations["Transcript sequence"].str.contains("U")
    assert not has_uracil.any()

In [30]:
# Disabled: cache the annotated MultiOmics object to disk.
# import pickle
# with open('moge/data/luad_rna_ppi_data.pickle', 'wb') as file:
#     pickle.dump(luad_data, file)

# Build Heterogeneous Network

In [2]:
# Restore the annotated MultiOmics object from its on-disk cache.
#
# This cell is also used as a restart entry point, so it keeps its own imports.
# Previously it only read the cache, while the sole code that writes it is
# commented out; on a machine without the file, Restart & Run All died here
# with FileNotFoundError. Now the cache is created from the in-memory object
# when it is missing.
#
# SECURITY: pickle.load can execute arbitrary code. Only load cache files that
# were produced by this notebook on a trusted machine.
import os
import pickle

LUAD_DATA_PICKLE = 'moge/data/luad_rna_ppi_data.pickle'

if os.path.exists(LUAD_DATA_PICKLE):
    with open(LUAD_DATA_PICKLE, 'rb') as file:
        luad_data = pickle.load(file)
else:
    # Resolve the name BEFORE opening the file for writing: if the build cells
    # were skipped this raises NameError without leaving an empty cache behind.
    luad_data_to_cache = luad_data
    cache_dir = os.path.dirname(LUAD_DATA_PICKLE)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(LUAD_DATA_PICKLE, 'wb') as file:
        pickle.dump(luad_data_to_cache, file)

In [5]:
# luad_data.Protein.annotations["gene_biotype"] = "protein"
# luad_data.Protein.annotations["Rfams"] = luad_data.Protein.annotations["annotation"].str.split(";", expand=True)[0]

In [3]:
from networkx import Graph, DiGraph
from moge.network import HeterogeneousNetwork, MultiplexAttributedNetwork

# Omics whose genes become nodes of the network.
network_modalities = ["MicroRNA", "MessengerRNA", "LncRNA", "Protein"]

# One layer per (source omic, target omic) pair, with the networkx graph class
# backing it: the three regulatory layers are directed, protein-protein is not.
layer_graph_types = {
    (MicroRNA.name(), MessengerRNA.name()): DiGraph,
    (MicroRNA.name(), LncRNA.name()): DiGraph,
    (LncRNA.name(), MessengerRNA.name()): DiGraph,
    (Protein.name(), Protein.name()): Graph,
}

# Alternative single-graph construction, kept for reference:
# network = HeterogeneousNetwork(multiomics=luad_data,
#                                modalities=["MicroRNA", "MessengerRNA", "LncRNA", "Protein"], )
network = MultiplexAttributedNetwork(
    multiomics=luad_data,
    modalities=network_modalities,
    layers=layer_graph_types,
)
# network.annotations.groupby("omic").count()

MicroRNA  nodes: 1870
MessengerRNA  nodes: 20472
LncRNA  nodes: 12727
Protein  nodes: 154
Total nodes: 35071
All annotation columns (union): {'transcript_start', 'annotation', 'Rfams', 'gene_biotype', 'transcript_length', 'feature', 'RNAcentral id', 'chromosome_name', 'transcript_end', 'start', 'havana_gene', 'tag', 'disease_associations', 'go_id', 'gene_id', 'transcript_name', 'RNA type', 'end', 'Transcript sequence', 'gene_name', 'protein_size', 'strand', 'mirbase id', 'protein_id', 'transcript_id', 'database', 'transcript_biotype'}
Annotation columns: ['go_id', 'disease_associations', 'Transcript sequence', 'omic']
INFO: Label go_id is split by delim '\||;' transformed by MultiLabelBinarizer
INFO: Label disease_associations is split by delim '\||;' transformed by MultiLabelBinarizer
INFO: Label omic is split by delim '\||;' transformed by MultiLabelBinarizer


In [7]:
# Populate each network layer from its interaction database.
# Every row is (database, source omic, target omic, edge-list loader). The
# loaders are lambdas so that each edge list is only fetched right before it is
# added, exactly as in a sequence of separate add_edges calls.
interaction_layers = [
    (mirtarbase, MicroRNA, MessengerRNA,
     lambda: mirtarbase.get_interactions(nodelist=network.node_list, data=True)),
    (lncbase, MicroRNA, LncRNA,
     lambda: lncbase.get_interactions(nodelist=network.node_list, data=True)),
    (lncrna2target, LncRNA, MessengerRNA,
     lambda: lncrna2target.get_interactions(nodelist=network.node_list, data=True)),
    # STRING takes the node list positionally and is queried with inclusive=True.
    (string, Protein, Protein,
     lambda: string.get_interactions(network.node_list, data=True, inclusive=True)),
]

# NOTE(review): directed=True is passed for every layer, including
# (Protein, Protein), which is declared as an undirected Graph when the network
# is constructed — confirm that this is intended for the STRING edges.
for interaction_db, source_omic, target_omic, load_edges in interaction_layers:
    network.add_edges(load_edges(),
                      database=interaction_db.name(), directed=True,
                      source=source_omic.name(), target=target_omic.name())

335989 edges added to self.networks[(MicroRNA, MessengerRNA)]
19040 edges added to self.networks[(MicroRNA, LncRNA)]
7613 edges added to self.networks[(LncRNA, MessengerRNA)]
5412501 edges added to self.networks[(Protein, Protein)]


In [9]:
# Sanity check: the (MicroRNA, LncRNA) layer must contain at least one edge
# after the lncBase interactions were added.
assert network.networks[('MicroRNA', 'LncRNA')].number_of_edges() > 0

19040

In [8]:
# Adjacency matrix for the LncRNA -> MessengerRNA edge type. node_list=None is
# passed; in the recorded output the result is a 35071x35071 sparse matrix,
# i.e. one row/column per node of the whole network.
network.get_adjacency_matrix(edge_types=("LncRNA", "MessengerRNA"), node_list=None)

<35071x35071 sparse matrix of type '<class 'numpy.float64'>'
	with 7610 stored elements in Compressed Sparse Row format>

In [47]:
# Beam-search BFS (beam width 10) from the miRNA "hsa-let-7d". The value
# function is constant, so no neighbour is preferred over another.
beam_edges = nx.algorithms.bfs_beam_edges(network.G, "hsa-let-7d", lambda _node: 1, width=10)
# Each edge is a (parent, child) pair; column 1 holds the newly reached node.
sampled_nodes = pd.DataFrame(list(beam_edges))[1]
sampled_nodes

0          HMGA2
1            APP
2         DICER1
3        SLC11A2
4          PDGFA
          ...   
5506      MMADHC
5507    IRAK1BP1
5508      ARMCX2
5509       NELFE
5510       TAF10
Name: 1, Length: 5511, dtype: object

In [18]:
# Persist the assembled multiplex network so later sessions can reload it
# instead of rebuilding every layer.
with open('moge/data/luad_rna_ppi_multiplex_network.pickle', "wb") as network_file:
    # To restore instead of save: network = pickle.load(network_file) in "rb" mode.
    pickle.dump(network, network_file)

In [23]:
# Split the network into train and test parts with 6 splits, stratifying on the
# go_id label and on omic type. The recorded output reports a train_network and
# a test_network per layer.
# NOTE(review): in that output the ('LncRNA', 'MessengerRNA') train layer ends up
# with 0 edges — check whether this split is usable for that layer.
network.split_stratified(stratify_label="go_id", stratify_omic=True, n_splits=6, 
                         dropna=False, verbose=True)

labels_filtered: 6921



Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.



test nodes 2576 , edges 8636
train nodes 12895 , edges 228235
removed 107754 edges, and  2931 nodes.
('MicroRNA', 'MessengerRNA') layer train_network 12895 228235
('MicroRNA', 'MessengerRNA') layer test_network 2576 8636
test nodes 439 , edges 405
train nodes 2350 , edges 11669
removed 7371 edges, and  690 nodes.
('MicroRNA', 'LncRNA') layer train_network 2350 11669
('MicroRNA', 'LncRNA') layer test_network 439 405
test nodes 1005 , edges 22
train nodes 5154 , edges 0
removed 7613 edges, and  1201 nodes.
('LncRNA', 'MessengerRNA') layer train_network 5154 0
('LncRNA', 'MessengerRNA') layer test_network 1005 22
test nodes 2927 , edges 140162
train nodes 14712 , edges 3727804
removed 1684697 edges, and  3233 nodes.
('Protein', 'Protein') layer train_network 14712 3727804
('Protein', 'Protein') layer test_network 2927 140162


# Visualize Network

In [53]:
# Select the non-coding RNA nodes (omic label matches MicroRNA or LncRNA) ...
is_noncoding = network.annotations["omic"].str.contains("MicroRNA|LncRNA")
noncoding_index = network.annotations[is_noncoding].index
# ... and keep only those that have at least one edge to another selected node.
noncoding_subgraph = network.G.subgraph(noncoding_index)
nodelist = [
    node
    for node, degree in noncoding_subgraph.degree()
    if degree > 0 and node in noncoding_index
]
len(nodelist)

3040

In [None]:
# Draw the subgraph induced by `nodelist`, colouring nodes by their omic type
# and labelling edges with the database they came from; at most 3000 edges are
# requested. `iterations` is presumably the layout iteration count — confirm
# against moge.visualization.network_viz.graph_viz.
graph_viz(network.G, nodelist=nodelist, 
#           node_symbol=network.annotations.loc[nodelist, "disease_associations"], 
          node_color=network.annotations.loc[nodelist, "omic"], 
          edge_label="database",
          iterations=100,
          max_edges=3000)

# Add Attribute Affinity Positive Edges

In [None]:
# Affinity edges among "GE" nodes scored from GO-term and disease-association
# features, equally weighted, without expression correlation.
# NOTE(review): this cell was never executed in the saved notebook and uses the
# short modality key "GE" with network.nodes[...], whereas the network built
# earlier is keyed by "MessengerRNA" etc. — presumably written against an older
# moge API; confirm before running.
affinities_GE = network.add_edges_from_nodes_similarity(modality="GE", node_list=network.nodes["GE"], 
    similarity_threshold=0.80, dissimilarity_threshold=0.01,
    negative_sampling_ratio=5.0, nanmean=True,
    features=["GO Terms", "Disease association"],
    weights=[1, 1],
    compute_correlation=False)

In [None]:
# Affinity edges among "GE" nodes scored from locus type, gene family and
# location, plus tissue-expression correlation. Rebinds affinities_GE.
# NOTE(review): `weights` has four entries for three features; presumably the
# extra weight applies to the expression-correlation term — confirm.
# NOTE(review): GE_tissue_exp is not defined in the visible part of this notebook.
affinities_GE = network.add_edges_from_nodes_similarity(modality="GE", node_list=network.nodes["GE"], 
    similarity_threshold=0.85, dissimilarity_threshold=0.01,
    negative_sampling_ratio=5.0, nanmean=False,
    features=["locus_type", "gene_family_id", "location"], 
    weights=[0.5, 1, 0.5, 1],
    compute_correlation=True, tissue_expression=GE_tissue_exp)

In [None]:
# Affinity edges among "MIR" nodes scored from GO-term and disease-association
# features, without expression correlation and without explicit weights.
# NOTE(review): uses the short modality key "MIR"; never executed in the saved
# notebook — confirm it matches the current network API.
affinities_MIR = network.add_edges_from_nodes_similarity(modality="MIR", node_list=network.nodes["MIR"], 
    similarity_threshold=0.70, dissimilarity_threshold=0.01,
    negative_sampling_ratio=5.0, nanmean=True,
    features=["GO Terms", "Disease association"], 
    compute_correlation=False)

In [None]:
# Affinity edges among "MIR" nodes scored from miRNA family and Rfam features,
# plus tissue-expression correlation. Rebinds affinities_MIR.
# NOTE(review): MIR_tissue_exp is not defined in the visible part of this notebook.
affinities_MIR = network.add_edges_from_nodes_similarity(modality="MIR", node_list=network.nodes["MIR"], 
    similarity_threshold=0.70, dissimilarity_threshold=0.01,
    negative_sampling_ratio=5.0, nanmean=True,
    features=["Family", "Rfams"], 
    compute_correlation=True, tissue_expression=MIR_tissue_exp)

In [None]:
# Affinity edges among "LNC" nodes scored from genomic attributes plus
# tissue-expression correlation. Note the higher dissimilarity threshold (0.1)
# compared with the other affinity cells (0.01).
# NOTE(review): `weights` has six entries for five features; presumably the
# extra weight applies to the expression-correlation term — confirm.
# NOTE(review): LNC_tissue_exp is not defined in the visible part of this notebook.
affinities_LNC = network.add_edges_from_nodes_similarity(modality="LNC", node_list=network.nodes["LNC"], 
    similarity_threshold=0.90, dissimilarity_threshold=0.1,
    negative_sampling_ratio=5.0, 
    nanmean=False,
    features=["locus_type", "Transcript type", "tag", "Strand", "Chromosome"],
    weights=[1, 1, 0.5, 0.5, 1, 1],
    compute_correlation=True, tissue_expression=LNC_tissue_exp)

In [None]:
# Affinity edges among "LNC" nodes scored from GO terms, family and disease
# association, without expression correlation. Rebinds affinities_LNC.
# NOTE(review): uses the short modality key "LNC"; never executed in the saved
# notebook — confirm it matches the current network API.
affinities_LNC = network.add_edges_from_nodes_similarity(modality="LNC", node_list=network.nodes["LNC"], 
    similarity_threshold=0.70, dissimilarity_threshold=0.01,
    negative_sampling_ratio=5.0, nanmean=True,
    features=["GO Terms", "Family", "Disease association"], 
    compute_correlation=False)

In [None]:
# Heatmap of the dense adjacency matrix of edge type "u" among the "GE" nodes.
# NOTE(review): "u" presumably denotes the undirected affinity edges added by
# the cells above — confirm. .todense() materialises the full matrix in memory.
matrix_heatmap(network.get_adjacency_matrix(edge_types=["u"], node_list=network.nodes["GE"]).todense(),
              figsize=(7,7))

In [None]:
# Heatmap of the dense adjacency matrix of edge type "u" among the "MIR" nodes.
# NOTE(review): "u" presumably denotes the undirected affinity edges — confirm.
matrix_heatmap(network.get_adjacency_matrix(edge_types=["u"], node_list=network.nodes["MIR"]).todense(),
              figsize=(5,5))

In [None]:
# Heatmap of the dense adjacency matrix of edge type "u" among the "LNC" nodes.
# NOTE(review): "u" presumably denotes the undirected affinity edges — confirm.
matrix_heatmap(network.get_adjacency_matrix(edge_types=["u"], node_list=network.nodes["LNC"]).todense(),
              figsize=(7,7))

In [None]:
# # IMPORT Affinity Edgelist
# network.import_edgelist_file(
# #     file="moge/data/LMN_future_recall/TRAIN/Interactions_Affinity/lmn_n70_m70_l70-70_TissueExp_GO_Rfams_Disease_Family_GO_affinity.edgelist", 
#     file="moge/data/LMN_future_recall/TRAIN/Interactions_Only/GE/lmn_train.BioGRID.interactions.edgelist", 
#     is_directed=True)

# network.import_edgelist_file(
#     file="moge/data/LMN_future_recall/TRAIN/Interactions_Only/MIR/lmn_train.miRTarBase.interactions.edgelist", 
#     is_directed=True)

# network.import_edgelist_file(
#     file="moge/data/LMN_future_recall/TRAIN/Interactions_Only/LNC/lmn_train.lncBase.interactions.edgelist", 
#     is_directed=True)

# network.import_edgelist_file(
#     file="moge/data/LMN_future_recall/TRAIN/Interactions_Only/LNC/lmn_train.lncrna2target.interactions.edgelist", 
#     is_directed=True)

In [None]:
# WRITE Affinity Edgelist
# nx.write_edgelist(network.get_subgraph(["MIR", "GE", "LNC"], edge_type="u"), 
#                   "moge/data/LMN_future_recall/TRAIN/Interactions_Affinity/lmn_n70_m70_l70-70_TissueExp_GO_Rfams_Disease_Family_GO_affinity.edgelist", 
#                   data=True)

# Add Attribute Affinity NEGATIVE Edges Between Modalities

In [None]:
# Number of negative edges requested per modality pair; the GE-LNC pair
# requests twice as many as the other two pairs.
# NOTE(review): uses the short modality keys "GE"/"MIR"/"LNC"; never executed in
# the saved notebook — confirm against the current network API.
u_n_size = 10000
network.add_sampled_negative_edges(u_n_size, modalities=["GE", "MIR"])
network.add_sampled_negative_edges(u_n_size*2, modalities=["GE", "LNC"])
network.add_sampled_negative_edges(u_n_size, modalities=["LNC", "MIR"])

# miRTarBase

In [None]:
# Interactions returned with use_latest=True but not with use_latest=False
# (set difference), followed by their count.
# NOTE(review): `case_relabel` is not defined in the visible part of this
# notebook, and `luad_data.MIR` differs from the `luad_data.MicroRNA` accessor
# used in the cells that were executed — confirm before running.
mirtarbase_new = luad_data.MIR.get_miRTarBase_miRNA_target_interaction(use_latest=True, data=False, rename_dict=case_relabel)
mirtarbase_old = luad_data.MIR.get_miRTarBase_miRNA_target_interaction(use_latest=False, data=False, rename_dict=case_relabel)
mirtarbase_diff = list(set(mirtarbase_new) - set(mirtarbase_old))
len(mirtarbase_diff)

In [None]:
# Adds miRNA-target interaction network
network.add_directed_edges_from_edgelist(edgelist=luad_data.MIR.get_miRTarBase_miRNA_target_interaction(use_latest=False),
#                                                                                                        rename_dict=case_relabel), 
                                        modalities=["MIR", "GE"], correlation_weights=False, threshold=0.20,
                                        database="miRTarBase")

# StarBase mRNA-RNA

In [None]:
# Interactions returned with min_expNum=1 but not with min_expNum=2 (set
# difference), followed by their count. min_expNum is presumably the minimum
# number of supporting experiments — confirm.
# NOTE(review): `luad_data.GE` differs from the `luad_data.MessengerRNA`
# accessor used in the cells that were executed.
starbase_new = luad_data.GE.get_starBase_RNA_RNA_interactions(min_expNum=1, data=False)
starbase_old = luad_data.GE.get_starBase_RNA_RNA_interactions(min_expNum=2, data=False)
starbase_diff = list(set(starbase_new) - set(starbase_old))
len(starbase_diff)

In [None]:
# Adds Gene Regulatory Network edges
network.add_directed_edges_from_edgelist(edgelist=luad_data.LNC.get_starBase_lncRNA_RNA_interactions(),
                                        modalities=["LNC", "GE"], correlation_weights=False, threshold=0.2,
                                        database="starBase")

# BioGRID

In [None]:
# Interactions present in the BioGRID 3.5.169 file but not in the 3.4.162 file
# (set difference), followed by their count.
# NOTE(review): `case_relabel` is not defined in the visible part of this notebook.
biogrid_new = luad_data.GE.get_BioGRID_GRN_edgelist(data=False, rename_dict=case_relabel,
                biogrid_interactions_file_path='/data/datasets/Bioinformatics_ExternalData/BioGRID/BIOGRID-ALL-3.5.169.tab2.txt')
biogrid_old = luad_data.GE.get_BioGRID_GRN_edgelist(data=False, rename_dict=case_relabel,
                biogrid_interactions_file_path='/data/datasets/Bioinformatics_ExternalData/BioGRID/BIOGRID-ALL-3.4.162.tab2.txt')
biogrid_diff = list(set(biogrid_new) - set(biogrid_old))
len(biogrid_diff)

In [None]:
# Adds Gene Regulatory Network edges
network.add_directed_edges_from_edgelist(edgelist=luad_data.GE.get_BioGRID_GRN_edgelist(biogrid_interactions_file_path='/data/datasets/Bioinformatics_ExternalData/BioGRID/BIOGRID-ALL-3.4.162.tab2.txt'),
#                                                                                        rename_dict=case_relabel),
                                        modalities=["GE", "GE"], correlation_weights=False, threshold=0.2,
                                        database="BioGRID")

# NPInter

In [None]:
from moge.network.heterogeneous_network import get_rename_dict

# Map each NONCODE gene ID (text before the first ".") to its "Gene Name".
noncode_func_table = luad_data.LNC.noncode_func_df
noncode_ids_unversioned = noncode_func_table["NONCODE Gene ID"].str.split(".", expand=True)[0]
noncode_name_by_id = pd.Series(noncode_func_table["Gene Name"].values,
                               index=noncode_ids_unversioned).to_dict()
# Drop entries whose name is a plain float (i.e. missing names stored as NaN).
noncode_rename_dict = {
    gene_id: gene_name
    for gene_id, gene_name in noncode_name_by_id.items()
    if type(gene_name) != float
}

# Start from the "Gene ID" rename mapping and let the NONCODE names override it.
lncbase_rename_dict = get_rename_dict(luad_data.LNC.get_genes_info(), "Gene ID")
lncbase_rename_dict.update(noncode_rename_dict)

In [None]:
# lncbase_rename_dict.update(case_relabel)

In [None]:
# Fetch the NPInter ncRNA-RNA regulatory edge list with use_latest=True ("new") and
# use_latest=False ("old"), then keep the edges that only the "new" list contains.
npinter_new, npinter_old = (
    luad_data.LNC.get_NPInter_ncRNA_RNA_regulatory_interaction_edgelist(
        use_latest=use_latest_release,
        data=False,
        rename_dict=lncbase_rename_dict,
    )
    for use_latest_release in (True, False)
)
npinter_diff = [*(set(npinter_new) - set(npinter_old))]
len(npinter_diff)

In [None]:
# Add the NPInter edge list (use_latest=False) to the network as directed edges
# between the "LNC" and "GE" modalities, tagged with database="NPInter".
# rename_dict=lncbase_rename_dict is deliberately not passed here (it was left disabled).
npinter_lnc_edgelist = luad_data.LNC.get_NPInter_ncRNA_RNA_regulatory_interaction_edgelist(use_latest=False)
network.add_directed_edges_from_edgelist(
    edgelist=npinter_lnc_edgelist,
    modalities=["LNC", "GE"],
    correlation_weights=False,
    threshold=0.20,
    database="NPInter",
)

# lncBase

In [None]:
# Compare the lncBase *predicted* miRNA-lncRNA interactions ("new") against the
# non-predicted interaction list ("old") and print how many are only in the former.
lncbase_query_kwargs = dict(rename_dict=lncbase_rename_dict, data=False)
lncbase_new = luad_data.LNC.get_lncBase_miRNA_lncRNA_predicted_interactions_edgelist(**lncbase_query_kwargs)
lncbase_old = luad_data.LNC.get_lncBase_miRNA_lncRNA_interactions_edgelist(**lncbase_query_kwargs)
lncbase_diff = [*(set(lncbase_new) - set(lncbase_old))]
print(len(lncbase_diff))

In [None]:
# Adds miRNA-lncRNA interaction network.
# Disabled alternative sources are kept below for reference:
# network.add_directed_edges_from_edgelist(edgelist=luad_data.LNC.get_starBase_lncRNA_miRNA_interactions_edgelist(),
#                                         )
# network.add_directed_edges_from_edgelist(edgelist=luad_data.LNC.get_lncRNome_miRNA_binding_sites_edgelist(),
#                                          modalities=["LNC", "MIR"], correlation_weights=False, threshold=0.20,
#                                         database="lncRNome")

# Active source: lncBase miRNA-lncRNA interactions, renamed through lncbase_rename_dict
# and added as directed edges between the "MIR" and "LNC" modalities.
lncbase_mir_lnc_edgelist = luad_data.LNC.get_lncBase_miRNA_lncRNA_interactions_edgelist(
    rename_dict=lncbase_rename_dict)
network.add_directed_edges_from_edgelist(
    edgelist=lncbase_mir_lnc_edgelist,
    modalities=["MIR", "LNC"],
    correlation_weights=False,
    threshold=0.20,
    database="lncBase",
)


# network.add_directed_edges_from_edgelist(edgelist=luad_data.LNC.get_LncReg_lncRNA_RNA_regulatory_interactions(),
#                                          modalities=["LNC", "GE"], correlation_weights=False, threshold=0.20,
#                                         database="LncReg")

# lncrna2target

In [None]:
# Fetch the lncrna2target low- and high-throughput interaction lists with the same
# options, then keep the entries found only in the high-throughput list.
lnc2tar_query_kwargs = dict(data=False, rename_dict=lncbase_rename_dict)
lnc2tar_low = luad_data.LNC.get_lncrna2target_low_throughput_interactions(**lnc2tar_query_kwargs)
lnc2tar_high = luad_data.LNC.get_lncrna2target_high_throughput_interactions(**lnc2tar_query_kwargs)
lnc2tar_diff = [*(set(lnc2tar_high) - set(lnc2tar_low))]
len(lnc2tar_diff)

In [None]:
# Add the lncrna2target high-throughput interactions (renamed through
# lncbase_rename_dict) as directed edges between the "LNC" and "GE" modalities.
lnc2tar_high_edgelist = luad_data.LNC.get_lncrna2target_high_throughput_interactions(
    rename_dict=lncbase_rename_dict)
network.add_directed_edges_from_edgelist(
    edgelist=lnc2tar_high_edgelist,
    modalities=["LNC", "GE"],
    correlation_weights=False,
    threshold=0.20,
    database="lncrna2target",
)

# Disabled alternative sources are kept below for reference:
# network.add_directed_edges_from_edgelist(edgelist=luad_data.LNC.get_lncrna2target_low_throughput_interactions(rename_dict=lncbase_rename_dict),
#                                          modalities=["LNC", "GE"], correlation_weights=False, threshold=0.20,
#                                         database="lncrna2target")

# network.add_directed_edges_from_edgelist(edgelist=luad_data.LNC.get_lncRInter_interactions(),
#                                          modalities=["LNC", "GE"], correlation_weights=False, threshold=0.20,
#                                         database="lncRInter")

In [None]:
# Collect every (source, target) pair in the graph whose edge data is tagged
# database == "NPInter". The filter is evaluated once; `npinter_train` is an
# independent copy, since both names are built from the same graph and the same
# condition and therefore hold equal edge sets.
npinter_val = {(u, v) for u, v, d in network.G.edges(data=True) if d.get("database") == "NPInter"}
npinter_train = set(npinter_val)

# Only the last expression of a cell is displayed, so the two sizes are printed
# explicitly instead of being left as bare expressions that are discarded.
print(len(npinter_val))
print(len(npinter_train))

# NOTE(review): `lncbase_train` and `l2t_val` are not assigned anywhere in this
# section of the notebook — confirm they are defined before this cell is run.
len(lncbase_train & l2t_val)

# Filter the node_list

In [None]:
# Find ordered pairs of distinct graph nodes whose names differ only by letter case.
# Nodes are first grouped by their lowercased name, so only nodes that can actually
# match are compared; this avoids scanning every node against every other node.
nodes_by_lowercase_name = {}
for node in network.G.nodes:
    nodes_by_lowercase_name.setdefault(node.lower(), []).append(node)

# Both orderings (a, b) and (b, a) are kept for every matching pair.
node_pairs = {(node1, node2)
              for case_variants in nodes_by_lowercase_name.values()
              for node1 in case_variants
              for node2 in case_variants
              if node1 != node2}
len(node_pairs)

In [None]:
# Build a relabelling map for node names that differ only by case: for each pair,
# the name that is missing from network.genes_info.index is mapped to the one that
# is present (the second name of the pair is checked first).
annotated_names = network.genes_info.index
case_relabel = {}
for variant_a, variant_b in node_pairs:
    if variant_b in annotated_names:
        case_relabel[variant_a] = variant_b
        continue
    if variant_a in annotated_names:
        case_relabel[variant_b] = variant_a
len(case_relabel)

In [None]:
# Count the graph nodes that are missing from network.genes_info.index.
sum(1 for node in network.G.nodes if node not in network.genes_info.index)

In [None]:
# Materialise the graph's isolated nodes (nodes without any neighbour) and show how many there are.
isolates = [*nx.isolates(network.G)]
len(isolates)

In [None]:
# Remove the nodes collected in `isolates` from the graph; this modifies network.G itself.
network.G.remove_nodes_from(isolates)

In [None]:
# Keep, for each modality, only the nodes that are still present in the graph.
for modality_name in ("MessengerRNA", "LncRNA", "MicroRNA"):
    network.nodes[modality_name] = [
        node for node in network.nodes[modality_name] if node in network.G.nodes()
    ]

In [None]:
# Display the current graph size as a (node count, edge count) tuple.
network.G.number_of_nodes(), network.G.number_of_edges()

In [None]:
# Display the number of entries in network.node_list.
len(network.node_list)

In [None]:
# Write the full network object to disk with pickle.
# To read it back instead, open the same path in 'rb' mode and assign
# network = pickle.load(...) from that file handle.
network_pickle_path = 'moge/data/LMN_future_recall/TRAIN/Interactions_Only/LMN_lncbase_mirtarbase_biogrid_lncrna2target_openomics.train.pickle'
with open(network_pickle_path, 'wb') as network_pickle_file:
    pickle.dump(network, network_pickle_file)

# Train test split

In [None]:
# Run the network's train/test node split over network.node_list with verbose output enabled.
# NOTE(review): if this split is randomized, no seed is set in this cell — confirm it is reproducible.
network.split_train_test_nodes(node_list=network.node_list, verbose=True,)

In [None]:
# Obtain the training data generator from the network
# (presumably built on the train split produced by split_train_test_nodes — confirm).
train_generator = network.get_train_generator()

In [None]:
# Obtain the test data generator from the network
# (presumably built on the test split produced by split_train_test_nodes — confirm).
test_generator = network.get_test_generator()

## Network Info & Visualizations

In [None]:
# Print the node count and the edge count of the graph, one per line.
print(len(network.G.nodes()), len(network.G.edges()), sep="\n")

In [None]:
# Build one adjacency matrix per edge type ("u_n", "u", "d"), all over network.node_list.
csr_un, csr_u, csr_d = (
    network.get_adjacency_matrix(edge_types=[edge_type], node_list=network.node_list)
    for edge_type in ("u_n", "u", "d")
)

In [None]:
# Plot the "u_n" adjacency matrix as a heatmap.
# .toarray() is assumed to densify a scipy sparse matrix — this can be memory-heavy for large node lists.
matrix_heatmap(csr_un.toarray())

In [None]:
# Plot the "u" adjacency matrix as a heatmap.
# .toarray() is assumed to densify a scipy sparse matrix — this can be memory-heavy for large node lists.
matrix_heatmap(csr_u.toarray())

In [None]:
# Plot the "d" adjacency matrix as a heatmap.
# .toarray() is assumed to densify a scipy sparse matrix — this can be memory-heavy for large node lists.
matrix_heatmap(csr_d.toarray())