In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
from IPython.display import display
import requests, sys, os
import omnipath as op

# Functions

In [2]:
def check_dir(dir: str):
    if os.path.exists(dir) and os.path.isdir(dir):
        pass
    else:
        os.makedirs(dir)

In [3]:
def string_protein_search(string_ids: list, hgnc_table: pd.DataFrame):
    
    proteins = {prot: prot.split('.')[1] for prot in string_ids}
    server = "https://rest.ensembl.org"
    ext = "/lookup/id"
    headers={ "Content-Type" : "application/json", "Accept" : "application/json"}
    
    # The first query returned transcript symbols
    r = requests.post(server+ext, headers=headers, json={'ids': list(proteins.values())})
    if not r.ok:
        r.raise_for_status()
        sys.exit()
    
    decoded = r.json()
    
    transcripts = {}
    for k, v in proteins.items():
        if decoded[v] is not None:
            transcripts[k] = decoded[v]['Parent']
    
    # The second query will return gene symbols
    r = requests.post(server+ext, headers=headers, json={'ids': list(transcripts.values())})
    if not r.ok:
        r.raise_for_status()
        sys.exit()
    
    decoded = r.json()
    
    genes = []
    for k, v in transcripts.items():
        if decoded[v] is not None:
            genes.append((k, decoded[v]['Parent']))
    
    genes = pd.DataFrame(genes, columns=['string_ids', 'ensembl_gene_id'])
    genes = pd.merge(genes, hgnc_table)[['string_ids', 'symbol']]

    return genes

# Data

In [4]:
datadir = "data/"
check_dir(datadir)

In [5]:
hgnc = pd.read_table('hgnc_complete_set.txt', header=0, dtype='string')
hgnc.head()

Unnamed: 0,hgnc_id,symbol,name,locus_group,locus_type,status,location,location_sortable,alias_symbol,alias_name,...,cd,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_ids,lncipedia,gtrnadb,agr,mane_select,gencc
0,HGNC:5,A1BG,alpha-1-B glycoprotein,protein-coding gene,gene with protein product,Approved,19q13.43,19q13.43,,,...,,,,,,,,HGNC:5,ENST00000263100.8|NM_130786.4,
1,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,19q13.43,19q13.43,FLJ23569,,...,,,,,,A1BG-AS1,,HGNC:37133,,
2,HGNC:24086,A1CF,APOBEC1 complementation factor,protein-coding gene,gene with protein product,Approved,10q11.23,10q11.23,ACF|ASP|ACF64|ACF65|APOBEC1CF,,...,,,,,,,,HGNC:24086,ENST00000373997.8|NM_014576.4,
3,HGNC:7,A2M,alpha-2-macroglobulin,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,FWP007|S863-7|CPAMD5,,...,,,,,,,,HGNC:7,ENST00000318602.12|NM_000014.6,HGNC:7
4,HGNC:27057,A2M-AS1,A2M antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,12p13.31,12p13.31,,,...,,,,,,A2M-AS1,,HGNC:27057,,


In [6]:
hgnc.columns

Index(['hgnc_id', 'symbol', 'name', 'locus_group', 'locus_type', 'status',
       'location', 'location_sortable', 'alias_symbol', 'alias_name',
       'prev_symbol', 'prev_name', 'gene_group', 'gene_group_id',
       'date_approved_reserved', 'date_symbol_changed', 'date_name_changed',
       'date_modified', 'entrez_id', 'ensembl_gene_id', 'vega_id', 'ucsc_id',
       'ena', 'refseq_accession', 'ccds_id', 'uniprot_ids', 'pubmed_id',
       'mgd_id', 'rgd_id', 'lsdb', 'cosmic', 'omim_id', 'mirbase', 'homeodb',
       'snornabase', 'bioparadigms_slc', 'orphanet', 'pseudogene.org',
       'horde_id', 'merops', 'imgt', 'iuphar', 'kznf_gene_catalog',
       'mamit-trnadb', 'cd', 'lncrnadb', 'enzyme_id',
       'intermediate_filament_db', 'rna_central_ids', 'lncipedia', 'gtrnadb',
       'agr', 'mane_select', 'gencc'],
      dtype='object')

In [7]:
# Process ids to map
genesymbols = hgnc.symbol.drop_duplicates()
print(len(genesymbols))
display(genesymbols.head(1))

ensembl_ids = (hgnc
               .set_index("symbol").ensembl_gene_id.dropna()
               .drop_duplicates(keep=False).reset_index()
               .set_index('ensembl_gene_id')
              )
print(len(ensembl_ids))
display(ensembl_ids.head(1))

uniprot_ids = (hgnc
               .set_index("symbol").uniprot_ids.dropna().str.split("|")
               .explode().drop_duplicates(keep=False).reset_index()
               .set_index("uniprot_ids")
               )
print(len(uniprot_ids))
display(uniprot_ids.head(1))

prev_symbols = (hgnc
               .set_index("symbol").prev_symbol.dropna().str.split("|")
               .explode().drop_duplicates(keep=False).reset_index()
               .set_index("prev_symbol")
               )
print(len(prev_symbols))
prev_symbols.head(1)

43723


0    A1BG
Name: symbol, dtype: string

41042


Unnamed: 0_level_0,symbol
ensembl_gene_id,Unnamed: 1_level_1
ENSG00000121410,A1BG


20167


Unnamed: 0_level_0,symbol
uniprot_ids,Unnamed: 1_level_1
P04217,A1BG


15317


Unnamed: 0_level_0,symbol
prev_symbol,Unnamed: 1_level_1
NCRNA00181,A1BG-AS1


# HuRI

In [8]:
huri = pd.read_csv('HI-union.tsv', sep='\t', header=None, names=['Ensembl ID A', 'Ensembl ID B'])
huri.head()

Unnamed: 0,Ensembl ID A,Ensembl ID B
0,ENSG00000000005,ENSG00000061656
1,ENSG00000000005,ENSG00000099968
2,ENSG00000000005,ENSG00000104765
3,ENSG00000000005,ENSG00000105383
4,ENSG00000000005,ENSG00000114455


In [9]:
print(len(huri))
huri_graph = (huri
              .merge(ensembl_ids, left_on="Ensembl ID A", right_index=True)
              .merge(ensembl_ids, left_on="Ensembl ID B", right_index=True)
              [["symbol_x", "symbol_y"]].rename(columns={"symbol_x": "protein_A", "symbol_y": "protein_B"})
              .reset_index(drop=True)
             )
print(len(huri_graph))
huri_graph.head(2)

64006
63584


Unnamed: 0,protein_A,protein_B
0,TNMD,SPAG4
1,TNMD,BCL2L13


In [10]:
# Mostrar os resultados
genelist = set()
for col in huri_graph.columns:
    genes = huri_graph[col].unique().tolist()
    print(f"Proteins in column {col.split('_')[1]}: {len(genes)}")
    genelist |= set(genes)

print(f"Number of proteins (nodes): {len(genelist)}")
print(f"Number of interactions (edges): {len(huri_graph)}")

Proteins in column A: 6888
Proteins in column B: 7124
Number of proteins (nodes): 9050
Number of interactions (edges): 63584


In [11]:
huri_graph.to_csv(datadir+'huri_graph.csv', index=False)

# Apid

In [12]:
apid = pd.read_table("9606_noISI_Q2 (1).txt", header=0, dtype="str")
print(len(apid))
apid.head(2)

265216


Unnamed: 0,InteractionID,UniprotID_A,UniprotName_A,GeneName_A,UniprotID_B,UniprotName_B,GeneName_B,ExpEvidences,Methods,Publications,3DStructures,CurationEvents
0,1205000,Q14160,SCRIB_HUMAN,SCRIB,B7Z2Y1,B7Z2Y1_HUMAN,,1,1,1,0,3
1,1205001,Q14160,SCRIB_HUMAN,SCRIB,Q14155,ARHG7_HUMAN,ARHGEF7,11,8,8,0,20


In [13]:
ids = pd.concat([
    apid[[f"GeneName_{col}", f"UniprotID_{col}"]].rename(columns={f"GeneName_{col}": "GeneName", f"UniprotID_{col}": "UniprotID"})
    for col in ["A", "B"]
]).drop_duplicates(ignore_index=True)

print(len(ids))
ids.head(2)

18173


Unnamed: 0,GeneName,UniprotID
0,SCRIB,Q14160
1,KCNA5,P22460


In [14]:
hgnc_map = (ids
            .merge(genesymbols, how="left", left_on="GeneName", right_on="symbol")
            .merge(uniprot_ids, how="left", left_on="UniprotID", right_index=True, suffixes=["_1", "_2"])
            .merge(prev_symbols, how="left", left_on="GeneName", right_index=True)
           )

# select mappings in order
hgnc_map = pd.concat([
    hgnc_map[["GeneName", "UniprotID", "symbol_1"]].dropna(subset="symbol_1").rename(columns={"symbol_1": "symbol"}),
    hgnc_map.loc[hgnc_map.symbol_1.isna(), ["GeneName", "UniprotID", "symbol_2"]].dropna(subset="symbol_2").rename(columns={"symbol_2": "symbol"}),
    hgnc_map.loc[(hgnc_map.symbol_1.isna())&(hgnc_map.symbol_2.isna()), ["GeneName", "UniprotID", "symbol"]].dropna()
]).set_index("UniprotID").symbol

print(len(hgnc_map))
print(len(hgnc_map.index.drop_duplicates()))

17817
17817


In [15]:
# Uniprot IDs are unique and so can be used to merge symbols with graph
apid_graph = (apid[["UniprotID_A", "UniprotID_B"]]
              .merge(hgnc_map, left_on="UniprotID_A", right_index=True)
              .merge(hgnc_map, left_on="UniprotID_B", right_index=True, suffixes=["_A", "_B"])
              [["symbol_A", "symbol_B"]].rename(columns={"symbol_A": "protein_A", "symbol_B": "protein_B"})
             ).drop_duplicates()
print(len(apid_graph))
apid_graph.head(2)

262291


Unnamed: 0,protein_A,protein_B
1,SCRIB,ARHGEF7
2,SCRIB,NET1


In [16]:
# Mostrar os resultados
genelist = set()
for col in apid_graph.columns:
    genes = apid_graph[col].unique().tolist()
    print(f"Proteins in column {col.split('_')[1]}: {len(genes)}")
    genelist |= set(genes)

print(f"Number of proteins (nodes): {len(genelist)}")
print(f"Number of interactions (edges): {len(apid_graph)}")

Proteins in column A: 14354
Proteins in column B: 15915
Number of proteins (nodes): 16994
Number of interactions (edges): 262291


In [17]:
apid_graph.to_csv(datadir+"apid_graph.csv", index=False)

# Biogrid

In [18]:
biogrid = pd.read_table('BIOGRID-ORGANISM-Homo_sapiens-4.4.225.tab3.txt', dtype="str")
biogrid.head(2)

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types,Organism Name Interactor A,Organism Name Interactor B
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,Q59H94,NP_001120959|NP_001449,-,-,-,-,-,-,Homo sapiens,Homo sapiens
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,Q59FD9|F6THM6,NP_001094|NP_001265272|NP_001265273,-,-,-,-,-,-,Homo sapiens,Homo sapiens


In [19]:
# interactor ID columns
biogrid.loc[:, biogrid.columns.str.contains("Interactor A")].head(2)

Unnamed: 0,Entrez Gene Interactor A,BioGRID ID Interactor A,Systematic Name Interactor A,Official Symbol Interactor A,Synonyms Interactor A,Organism ID Interactor A,SWISS-PROT Accessions Interactor A,TREMBL Accessions Interactor A,REFSEQ Accessions Interactor A,Organism Name Interactor A
0,6416,112315,-,MAP2K4,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,9606,P45985,-,NP_003001|NP_001268364,Homo sapiens
1,84665,124185,-,MYPN,CMD1DD|CMH22|MYOP|RCM4,9606,Q86TC9,A0A087WX60,NP_001243197|NP_001243196|NP_115967,Homo sapiens


In [20]:
biogrid_cols = biogrid.loc[:,
    biogrid.columns.str.contains("Official Symbol Interactor")|
    biogrid.columns.str.contains("SWISS-PROT Accessions Interactor")
].rename(columns={
    "Official Symbol Interactor A": "genesymbol_A",
    "Official Symbol Interactor B": "genesymbol_B",
    "SWISS-PROT Accessions Interactor A": "uniprotid_A",
    "SWISS-PROT Accessions Interactor B": "uniprotid_B",
})
biogrid_cols.head()

Unnamed: 0,genesymbol_A,genesymbol_B,uniprotid_A,uniprotid_B
0,MAP2K4,FLNC,P45985,Q14315
1,MYPN,ACTN2,Q86TC9,P35609
2,ACVR1,FNTA,Q04771,P49354
3,GATA2,PML,P23769,P29590
4,RPA2,STAT3,P15927,P40763


In [21]:
ids = pd.concat([
    (biogrid_cols[[f"genesymbol_{col}", f"uniprotid_{col}"]]
     .rename(columns={f"genesymbol_{col}": "GeneName", f"uniprotid_{col}": "UniprotID"})
    ) for col in ["A", "B"]
]).drop_duplicates(ignore_index=True)

print(len(ids))
ids.head(2)

27965


Unnamed: 0,GeneName,UniprotID
0,MAP2K4,P45985
1,MYPN,Q86TC9


In [22]:
hgnc_map = (ids
            .merge(genesymbols, how="left", left_on="GeneName", right_on="symbol")
            .merge(uniprot_ids, how="left", left_on="UniprotID", right_index=True, suffixes=["_1", "_2"])
            .merge(prev_symbols, how="left", left_on="GeneName", right_index=True)
           )

# select mappings in order
hgnc_map = pd.concat([
    hgnc_map[["GeneName", "UniprotID", "symbol_1"]].dropna(subset="symbol_1").rename(columns={"symbol_1": "symbol"}),
    hgnc_map.loc[hgnc_map.symbol_1.isna(), ["GeneName", "UniprotID", "symbol_2"]].dropna(subset="symbol_2").rename(columns={"symbol_2": "symbol"}),
    hgnc_map.loc[(hgnc_map.symbol_1.isna())&(hgnc_map.symbol_2.isna()), ["GeneName", "UniprotID", "symbol"]].dropna()
]).set_index(["GeneName", "UniprotID"]).symbol

print(len(hgnc_map))
print(len(hgnc_map.index.drop_duplicates()))
hgnc_map.head(2)

20341
20341


GeneName  UniprotID
MAP2K4    P45985       MAP2K4
MYPN      Q86TC9         MYPN
Name: symbol, dtype: string

In [23]:
# Uniprot IDs are unique and so can be used to merge symbols with graph
biogrid_graph = (biogrid_cols
                 .merge(hgnc_map, left_on=["genesymbol_A", "uniprotid_A"], right_index=True)
                 .merge(hgnc_map, left_on=["genesymbol_B", "uniprotid_B"], right_index=True, suffixes=["_A", "_B"])
                 [["symbol_A", "symbol_B"]].rename(columns={"symbol_A": "protein_A", "symbol_B": "protein_B"})
                ).drop_duplicates()
print(len(biogrid_graph))
biogrid_graph.head(2)

868997


Unnamed: 0,protein_A,protein_B
0,MAP2K4,FLNC
1,MYPN,ACTN2


In [24]:
# Mostrar os resultados
genelist = set()
for col in biogrid_graph.columns:
    genes = biogrid_graph[col].unique().tolist()
    print(f"Proteins in column {col.split('_')[1]}: {len(genes)}")
    genelist |= set(genes)

print(f"Number of proteins (nodes): {len(genelist)}")
print(f"Number of interactions (edges): {len(biogrid_graph)}")

Proteins in column A: 16391
Proteins in column B: 19021
Number of proteins (nodes): 19694
Number of interactions (edges): 868997


In [25]:
biogrid_graph.to_csv(datadir+'biogrid_graph.csv', index=False)

# String

In [26]:
string_phys_ppi = pd.read_csv("9606.protein.physical.links.v12.0.txt", sep=" ")
string_phys_ppi = string_phys_ppi[string_phys_ppi['combined_score'] > 400]
print(len(string_phys_ppi))
string_phys_ppi.head()

414048


Unnamed: 0,protein1,protein2,combined_score
2,9606.ENSP00000000233,9606.ENSP00000434442,499
3,9606.ENSP00000000233,9606.ENSP00000262455,531
4,9606.ENSP00000000233,9606.ENSP00000303145,499
10,9606.ENSP00000000233,9606.ENSP00000249923,565
14,9606.ENSP00000000233,9606.ENSP00000264028,554


In [27]:
# STRING provides an ID mapping table between Ensembl protein IDs and HGNC gene symbols
string_aliases = pd.read_table("9606.protein.aliases.v12.0.txt")
string_aliases.head(2)

Unnamed: 0,#string_protein_id,alias,source
0,9606.ENSP00000000233,2B6H,Ensembl_PDB
1,9606.ENSP00000000233,2B6H,UniProt_DR_PDB


In [28]:
string_ids = pd.concat([string_phys_ppi.protein1, string_phys_ppi.protein2]).drop_duplicates(ignore_index=True).rename("string_ids")
print(len(string_ids))

15773


In [29]:
# we could use hgnc_ids, which are also available
#hgnc_ids = string_aliases.loc[string_aliases.source=="Ensembl_HGNC_hgnc_id", ["#string_protein_id", "alias"]]
#print(len(hgnc_ids))
#hgnc_ids.head(2)

In [30]:
ensembl_string_ids = string_aliases.loc[string_aliases.source=="Ensembl_HGNC_ensembl_gene_id", ["#string_protein_id", "alias"]]
print(len(ensembl_string_ids))
ensembl_string_ids.head(2)

19200


Unnamed: 0,#string_protein_id,alias
49,9606.ENSP00000000233,ENSG00000004059
154,9606.ENSP00000000412,ENSG00000003056


In [31]:
print(len(string_ids))
string_map = (ensembl_string_ids.set_index("alias")
              .merge(ensembl_ids, left_index=True, right_index=True)
              .reset_index(drop=True)
              .set_index("#string_protein_id").merge(string_ids, left_index=True, right_on="string_ids", how="right")
             )
print(len(string_map))
string_map[string_map.string_ids.duplicated(keep=False)]

15773
15774


Unnamed: 0,symbol,string_ids
14835,CCL3L3,9606.ENSP00000480558
14835,CCL3L1,9606.ENSP00000480558


In [32]:
# unmapped ids
unmapped = string_map.loc[string_map.symbol.isna(), "string_ids"].unique().tolist()
print(len(unmapped))
# duplicates
dups = string_map.loc[string_map.string_ids.duplicated(keep=False), "string_ids"].unique().tolist()
dups_index = string_map.index[string_map.string_ids.duplicated()]
print(len(dups))

231
1


In [33]:
found_dups = string_protein_search(dups, hgnc)
print(len(found_dups))
display(found_dups.head(2))

found_unmapped = string_protein_search(unmapped, hgnc)
print(len(found_unmapped))
display(found_unmapped.head(2))

1


Unnamed: 0,string_ids,symbol
0,9606.ENSP00000480558,CCL3L3


102


Unnamed: 0,string_ids,symbol
0,9606.ENSP00000040584,HOXC8
1,9606.ENSP00000216487,RIN3


In [34]:
string_map_full = pd.concat([string_map.dropna().drop(dups_index), found_dups, found_unmapped], ignore_index=True)
print(len(string_map_full))
string_map_full.head(2)

15644


Unnamed: 0,symbol,string_ids
0,ARF5,9606.ENSP00000000233
1,M6PR,9606.ENSP00000000412


In [35]:
print(len(string_phys_ppi))
string_graph = (string_phys_ppi
                .merge(string_map_full, left_on="protein1", right_on="string_ids")
                .merge(string_map_full, left_on="protein2", right_on="string_ids", suffixes=["_A", "_B"])
                .dropna()
                [["symbol_A", "symbol_B"]]
               )
print(len(string_graph))
string_graph.head(2)

414048
410424


Unnamed: 0,symbol_A,symbol_B
0,ARF5,ARFGAP2
1,ARF5,ERP44


In [36]:
# Mostrar os resultados
genelist = set()
for col in string_graph.columns:
    genes = string_graph[col].unique().tolist()
    print(f"Proteins in column {col.split('_')[1]}: {len(genes)}")
    genelist |= set(genes)

print(f"Number of proteins (nodes): {len(genelist)}")
print(f"Number of interactions (edges): {len(string_graph)}")

Proteins in column A: 15632
Proteins in column B: 15632
Number of proteins (nodes): 15632
Number of interactions (edges): 410424


In [37]:
string_graph.to_csv(datadir+'string_graph.csv', index=False)

# Omnipath 

In [38]:
omnipath = op.interactions.PostTranslational.get(
    directed=True,
    genesymbols=True,
    organism="human"
)
print(len(omnipath))
omnipath.head()

134282


Unnamed: 0,source,target,source_genesymbol,target_genesymbol,is_directed,is_stimulation,is_inhibition,consensus_direction,consensus_stimulation,consensus_inhibition,curation_effort,references,sources,n_sources,n_primary_sources,n_references,references_stripped
0,P0DP23,P48995,CALM1,TRPC1,True,False,True,True,False,True,3,TRIP:11290752;TRIP:11983166;TRIP:12601176,TRIP,1,1,3,11290752;11983166;12601176
1,P0DP25,P48995,CALM3,TRPC1,True,False,True,True,False,True,3,TRIP:11290752;TRIP:11983166;TRIP:12601176,TRIP,1,1,3,11290752;11983166;12601176
2,P0DP24,P48995,CALM2,TRPC1,True,False,True,True,False,True,3,TRIP:11290752;TRIP:11983166;TRIP:12601176,TRIP,1,1,3,11290752;11983166;12601176
3,Q03135,P48995,CAV1,TRPC1,True,True,False,True,True,False,13,DIP:19897728;HPRD:12732636;IntAct:19897728;Lit...,DIP;HPRD;IntAct;Lit-BM-17;TRIP,5,5,8,10980191;12732636;14551243;16822931;18430726;1...
4,P14416,P48995,DRD2,TRPC1,True,True,False,True,True,False,1,TRIP:18261457,TRIP,1,1,1,18261457


In [39]:
# verificar se existem complexos proteicos na rede
omnipath.loc[
    omnipath["source_genesymbol"].str.contains(r'[:_]')|
    omnipath["target_genesymbol"].str.contains(r'[:_]'),
    ['source_genesymbol', 'target_genesymbol']
].head(5)

Unnamed: 0,source_genesymbol,target_genesymbol
147,PPP2CA_PPP2R5A_PTPA,RBL1
182,PRKAA1_PRKAA2_PRKAB1_PRKAB2_PRKAG1_PRKAG2_PRKAG3,CRTC2
235,CSNK2A1_CSNK2B,H4C2
247,ATF2_JUN,SELE
252,BTRC_CUL1_SKP1,PER2


In [40]:
# some complexes have a genesymbol with an "_"
omnipath.loc[
(omnipath["source_genesymbol"].str.contains("GTF2H2C_2"))|
(omnipath["target_genesymbol"].str.contains("GTF2H2C_2")),
['source_genesymbol', 'target_genesymbol']
]

Unnamed: 0,source_genesymbol,target_genesymbol
11321,CCNH_CDK7_ERCC2_ERCC3_GTF2H1_GTF2H2_GTF2H2C_2_...,NR5A1
14980,CCNH_CDK7_ERCC2_ERCC3_GTF2H1_GTF2H2_GTF2H2C_2_...,POLR2A
50987,CCNH_CDK7_ERCC2_ERCC3_GTF2H1_GTF2H2_GTF2H2C_2_...,AR
51630,GTF2E1_GTF2E2,CCNH_CDK7_ERCC2_ERCC3_GTF2H1_GTF2H2_GTF2H2C_2_...
53736,CCNH_CDK7_ERCC2_ERCC3_GTF2H1_GTF2H2_GTF2H2C_2_...,E2F1
55039,CCNH_CDK7_ERCC2_ERCC3_GTF2H1_GTF2H2_GTF2H2C_2_...,ESR1


In [41]:
x = (omnipath
     [["source", "target", "source_genesymbol", "target_genesymbol"]]
     .assign(
         source = omnipath["source"].str.split("_"),
         target = omnipath["target"].str.split("_"),
         source_genesymbol = omnipath["source_genesymbol"].str.split("_"),
         target_genesymbol = omnipath["target_genesymbol"].str.split("_"),
     )
    )
x = x.loc[
(x.source_genesymbol.str.len()!=x.source.str.len())|
(x.target_genesymbol.str.len()!=x.target.str.len()), ["source_genesymbol", "target_genesymbol"]
]
for col in x.columns:
    for val in x[col].values:
        print(val)
    

['CCNH', 'CDK7', 'ERCC2', 'ERCC3', 'GTF2H1', 'GTF2H2', 'GTF2H2C', '2', 'GTF2H3', 'GTF2H4', 'GTF2H5', 'MNAT1']
['CCNH', 'CDK7', 'ERCC2', 'ERCC3', 'GTF2H1', 'GTF2H2', 'GTF2H2C', '2', 'GTF2H3', 'GTF2H4', 'GTF2H5', 'MNAT1']
['CCNH', 'CDK7', 'ERCC2', 'ERCC3', 'GTF2H1', 'GTF2H2', 'GTF2H2C', '2', 'GTF2H3', 'GTF2H4', 'GTF2H5', 'MNAT1']
['GTF2E1', 'GTF2E2']
['CCNH', 'CDK7', 'ERCC2', 'ERCC3', 'GTF2H1', 'GTF2H2', 'GTF2H2C', '2', 'GTF2H3', 'GTF2H4', 'GTF2H5', 'MNAT1']
['CCNH', 'CDK7', 'ERCC2', 'ERCC3', 'GTF2H1', 'GTF2H2', 'GTF2H2C', '2', 'GTF2H3', 'GTF2H4', 'GTF2H5', 'MNAT1']
['NR5A1']
['POLR2A']
['AR']
['CCNH', 'CDK7', 'ERCC2', 'ERCC3', 'GTF2H1', 'GTF2H2', 'GTF2H2C', '2', 'GTF2H3', 'GTF2H4', 'GTF2H5', 'MNAT1']
['E2F1']
['ESR1']


In [42]:
# Unfold complexes
omnipath_cols = (omnipath
                 [["source_genesymbol", "target_genesymbol"]]
                 .assign(
                     source_genesymbol = omnipath["source_genesymbol"].str.split("_"),
                     target_genesymbol = omnipath["target_genesymbol"].str.split("_"),
                 )
                 .explode("source_genesymbol").explode("target_genesymbol")
                 .drop_duplicates()
                )

print(len(omnipath_cols))
omnipath_cols.head()

108606


Unnamed: 0,source_genesymbol,target_genesymbol
0,CALM1,TRPC1
1,CALM3,TRPC1
2,CALM2,TRPC1
3,CAV1,TRPC1
4,DRD2,TRPC1


In [43]:
omnipath_cols.loc[omnipath_cols.source_genesymbol=="2", "source_genesymbol"] = "GTF2H2C_2"
omnipath_cols.loc[omnipath_cols.source_genesymbol=="GTF2H2C", "source_genesymbol"] = "GTF2H2C_2"
omnipath_cols.loc[omnipath_cols.target_genesymbol=="2", "source_genesymbol"] = "GTF2H2C_2"
omnipath_cols.loc[omnipath_cols.target_genesymbol=="GTF2H2C", "source_genesymbol"] = "GTF2H2C_2"
print(len(omnipath_cols))
omnipath_cols = omnipath_cols.drop_duplicates()
print(len(omnipath_cols))

108606
108599


In [44]:
omnipath_ids = pd.concat([omnipath_cols.source_genesymbol, omnipath_cols.target_genesymbol]
               ).drop_duplicates(ignore_index=True).rename("omnipathsymbol")

print(len(omnipath_ids))
omnipath_ids.head(2)

10485


0    CALM1
1    CALM3
Name: omnipathsymbol, dtype: object

In [45]:
# map genesymbols and prev_symbols
hgnc_map = (
    pd.merge(omnipath_ids, genesymbols, how="left", left_on="omnipathsymbol", right_on="symbol")
    .merge(prev_symbols, how="left", left_on="omnipathsymbol", right_index=True, suffixes=["_1", "_2"])
)

# select mappings in order
hgnc_map = pd.concat([
    hgnc_map[["omnipathsymbol", "symbol_1"]].dropna(subset="symbol_1").rename(columns={"symbol_1": "symbol"}),
    hgnc_map.loc[hgnc_map.symbol_1.isna(), ["omnipathsymbol", "symbol_2"]].dropna(subset="symbol_2").rename(columns={"symbol_2": "symbol"}),
]).set_index(["omnipathsymbol"]).symbol

print(len(hgnc_map))
print(len(hgnc_map.index.drop_duplicates()))
hgnc_map.head(2)

10428
10428


omnipathsymbol
CALM1    CALM1
CALM3    CALM3
Name: symbol, dtype: string

In [46]:
# Uniprot IDs are unique and so can be used to merge symbols with graph
omnipath_graph = (omnipath_cols
                 .merge(hgnc_map, left_on="source_genesymbol", right_index=True)
                 .merge(hgnc_map, left_on="target_genesymbol", right_index=True, suffixes=["_A", "_B"])
                 [["symbol_A", "symbol_B"]].rename(columns={"symbol_A": "protein_A", "symbol_B": "protein_B"})
                ).drop_duplicates()
print(len(omnipath_graph))
omnipath_graph.head(2)

106749


Unnamed: 0,protein_A,protein_B
0,CALM1,TRPC1
1,CALM3,TRPC1


In [47]:
# Mostrar os resultados
genelist = set()
for col in omnipath_graph.columns:
    genes = omnipath_graph[col].unique().tolist()
    print(f"Proteins in column {col.split('_')[1]}: {len(genes)}")
    genelist |= set(genes)

print(f"Number of proteins (nodes): {len(genelist)}")
print(f"Number of interactions (edges): {len(omnipath_graph)}")

Proteins in column A: 7011
Proteins in column B: 9030
Number of proteins (nodes): 10408
Number of interactions (edges): 106749


In [48]:
omnipath_graph.to_csv(datadir+'omnipath_graph.csv', index=False)