In [1]:
import pandas as pd

import cell2cell as c2c

import os, glob

from tqdm import tqdm

  import pandas.util.testing as pdt


# Load Data

## *C. elegans* RNA-seq

In [2]:
rnaseq_data = pd.read_excel('../Data/RNA-Seq/Celegans_RNASeqData_Cell.xlsx')

In [3]:
rnaseq_data.head()

Unnamed: 0,gene_id,symbol,Germline,Intestinal/rectal_muscle,Coelomocytes,Ciliated_sensory_neurons,Seam_cells,Non-seam_hypodermis,Pharyngeal_epithelia,Touch_receptor_neurons,...,Oxygen_sensory_neurons,Somatic_gonad_precursors,flp-1(+)_interneurons,Canal_associated_neurons,Pharyngeal_gland,Sex_myoblasts,Excretory_cells,Socket_cells,Rectum,Intestine
0,WBGene00000001,aap-1,62.497001,10.205136,23.522258,2.60164,17.684936,6.12507,19.126607,63.349737,...,34.100314,38.82104,34.322845,15.496668,2.756256,33.44604,11.550903,17.451509,0.0,25.809255
1,WBGene00000002,aat-1,0.0,0.0,73.422178,16.976302,4.458539,7.957491,190.043766,0.0,...,3.173948,87.773542,0.0,6.71949,255.23215,205.181741,0.0,0.0,40.750949,1.025635
2,WBGene00000003,aat-2,0.0,80.016157,26.607997,0.0,0.820582,8.642118,9.406249,11.976048,...,78.474267,74.523513,3.885366,0.0,20.001143,163.596066,0.0,8.374096,4.06296,0.0
3,WBGene00000004,aat-3,0.24865,52.852916,8.283492,27.344298,14.230394,16.588887,0.0,10.967077,...,65.493464,54.982436,58.661807,0.0,4.683885,79.587685,43.392868,76.614359,44.354308,5.056305
4,WBGene00000005,aat-4,0.118407,2.012866,2.159463,0.0,0.31694,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.171277


## STRING-db

In [4]:
stringdb = c2c.io.load_ppi(ppi_file='../Data/PPI-Networks/Celegans-STRING-db-mapped-highconfidence.txt.gz',
                           interaction_columns=['protein1', 'protein2'],
                           format='auto')

Opening PPI datasets from ../Data/PPI-Networks/Celegans-STRING-db-mapped-highconfidence.txt.gz
../Data/PPI-Networks/Celegans-STRING-db-mapped-highconfidence.txt.gz was correctly loaded
Removing bidirectionality of PPI network
Simplying PPI network


In [5]:
stringdb.head()

Unnamed: 0,A,B,score
0,WBGene00007064,WBGene00009059,1.0
1,WBGene00007064,WBGene00019544,1.0
2,WBGene00007064,WBGene00012532,1.0
3,WBGene00007067,WBGene00016874,1.0
4,WBGene00007067,WBGene00010045,1.0


In [6]:
stringdb.shape

(69254, 3)

## RSPGM

In [7]:
rspgm = c2c.io.load_ppi(ppi_file='../Data/PPI-Networks/RSPGM.xlsx',
                        interaction_columns=['WormBase_ID_a', 'WormBase_ID_b'],
                        format='auto')

Opening PPI datasets from ../Data/PPI-Networks/RSPGM.xlsx
../Data/PPI-Networks/RSPGM.xlsx was correctly loaded
Removing bidirectionality of PPI network
Simplying PPI network


In [8]:
rspgm.head()

Unnamed: 0,A,B,score
0,WBGene00003525,WBGene00009681,1.0
1,WBGene00004964,WBGene00000024,1.0
2,WBGene00000024,WBGene00004098,1.0
3,WBGene00000024,WBGene00000031,1.0
4,WBGene00000024,WBGene00004099,1.0


In [9]:
rspgm.shape

(12951, 3)

## Obtained from Literature

In [10]:
literature = c2c.io.load_ppi(ppi_file='../Data/PPI-Networks/Literature-Curated.xlsx',
                             interaction_columns=['Ligands_WB', 'Receptors_WB'],
                             strna='',
                             format='auto')

Opening PPI datasets from ../Data/PPI-Networks/Literature-Curated.xlsx
../Data/PPI-Networks/Literature-Curated.xlsx was correctly loaded
Removing bidirectionality of PPI network
Simplying PPI network


In [11]:
literature.head()

Unnamed: 0,A,B,score
0,,WBGene00009717,1.0
1,WBGene00001185,WBGene00001184,1.0
2,WBGene00002992,WBGene00002299,1.0
3,WBGene00002881,WBGene00001184,1.0
4,WBGene00006869,WBGene00006868,1.0


In [12]:
literature.shape

(138, 3)

## Human data to convert into *C. elegans*

**Ramilowski et al. (2015) - Draft of Ligand-Receptor pairs**

https://www.nature.com/articles/ncomms8866

In [13]:
LR_pairs = pd.read_csv('../Data/PPI-Networks/Human-2015-Ramilowski-LR-pairs.txt', sep='\t')

In [14]:
LR_pairs.head()

Unnamed: 0,Pair.Name,Ligand.ApprovedSymbol,Ligand.Name,Receptor.ApprovedSymbol,Receptor.Name,DLRP,HPMR,IUPHAR,HPRD,STRING.binding,STRING.experiment,HPMR.Ligand,HPMR.Receptor,PMID.Manual,Pair.Source,Pair.Evidence
0,A2M_LRP1,A2M,alpha-2-macroglobulin,LRP1,low density lipoprotein receptor-related prote...,,HPMR,,HPRD,STRING.binding,STRING.experiment,A2M,LRP1,,known,literature supported
1,AANAT_MTNR1A,AANAT,aralkylamine N-acetyltransferase,MTNR1A,melatonin receptor 1A,,HPMR,,,,,AANAT,MTNR1A,,known,literature supported
2,AANAT_MTNR1B,AANAT,aralkylamine N-acetyltransferase,MTNR1B,melatonin receptor 1B,,HPMR,,,,,AANAT,MTNR1B,,known,literature supported
3,ACE_AGTR2,ACE,angiotensin I converting enzyme,AGTR2,"angiotensin II receptor, type 2",,,,HPRD,,,ACE,AGTR2,,novel,literature supported
4,ACE_BDKRB2,ACE,angiotensin I converting enzyme,BDKRB2,bradykinin receptor B2,,,,HPRD,,,ACE,BDKRB2,,novel,literature supported


In [15]:
LR_pairs.shape

(2557, 16)

## OrthoDB data

In [16]:
orthodb = pd.read_excel('../Data/Orthologs/OrthoDB-Celegans-Human.xlsx')

In [17]:
orthodb.head()

Unnamed: 0,Protein,og_name,pub_gene_id,description,WB
0,SLIT3,slit homolog 1 protein,slt-1,Slit homolog 1 protein,WBGene00004854
1,SERPINE2,Serpin family,srp-2,SeRPin,WBGene00005643
2,SERPINE2,Serpin family,srp-8,SeRPin,WBGene00005649
3,SERPINE2,Serpin family,srp-1,SeRPin,WBGene00005642
4,SERPINE2,Serpin family,srp-7,SeRPin,WBGene00005648


## OrthoList 2.0 - Database to map human genes into *C. elegans* genes

https://doi.org/10.1534/genetics.118.301307

In [18]:
ortholist = pd.read_excel('../Data/Orthologs/OrthoList2-Celegans-Human.xlsx')

In [19]:
ortholist.head()

Unnamed: 0,CE_WB_CURRENT,HS_ENSG,Databases,Score,COMMON_NAME,LOCUS_ID,AHRINGER_LOC,INTERPRO_DOM,SMART,GO,HGNC,OMIM_GENES,OMIM_PHENOTYPES
0,WBGene00000001,ENSG00000105647,Ensembl Compara 87-89|InParanoid|OrthoInspecto...,4,aap-1,Y110A7A.10,,"INTERPRO:IPR000980 ""SH2 domain""|INTERPRO:IPR00...",SM00252|SM00324|SM00326,1-phosphatidylinositol-3-kinase activity|1-pho...,PIK3R2,603157.0,Megalencephaly-polymicrogyria-polydactyly-hydr...
1,WBGene00000001,ENSG00000117461,Ensembl Compara 87-89|InParanoid|OrthoInspecto...,4,aap-1,Y110A7A.10,,"INTERPRO:IPR000980 ""SH2 domain""|INTERPRO:IPR00...",SM00252,1-phosphatidylinositol-3-kinase activity|1-pho...,PIK3R3,,
2,WBGene00000001,ENSG00000145675,Ensembl Compara 87-89|Homologene|InParanoid|Or...,5,aap-1,Y110A7A.10,,"INTERPRO:IPR000980 ""SH2 domain""|INTERPRO:IPR00...",SM00252|SM00324|SM00326,1-phosphatidylinositol-3-kinase activity|1-pho...,PIK3R1,171833.0,"Agammaglobulinemia 7, autosomal recessive, 615..."
3,WBGene00000001,ENSG00000268173,Ensembl Compara 87-89,1,aap-1,Y110A7A.10,,"INTERPRO:IPR000980 ""SH2 domain""|INTERPRO:IPR00...",SM00252|SM00324|SM00326,,,,
4,WBGene00000001,ENSG00000278139,Ensembl Compara 87-89,1,aap-1,Y110A7A.10,,"INTERPRO:IPR000980 ""SH2 domain""|INTERPRO:IPR00...",SM00252,1-phosphatidylinositol-3-kinase regulator acti...,,,


## gProfiler - a web-based toolset for functional profiling of gene lists from large-scale experiments

https://doi.org/10.1093/nar/gkm226

In [20]:
gprofiler_ligands = pd.read_csv('../Data/Orthologs/gProfiler-Human-Celegans-Ligand-Orthologs.csv')

In [21]:
gprofiler_ligands.head()

Unnamed: 0,g#,initial_alias,initial_ensg,o#,ortholog_name,ortholog_ensg,description
0,1,A2M,ENSG00000175899,1.1.1,tep-1,WBGene00013969,TEP (ThiolEster containing Protein) [Source:U...
1,2,AANAT,ENSG00000129673,2.1.1,,,
2,3,ACE,ENSG00000159640,3.1.1,,,
3,4,ADAM10,ENSG00000137845,4.1.1,,,
4,5,ADAM12,ENSG00000148848,5.1.1,,,


In [22]:
gprofiler_receptors = pd.read_csv('../Data/Orthologs/gProfiler-Human-Celegans-Receptor-Orthologs.csv')

In [23]:
gprofiler_receptors.head()

Unnamed: 0,g#,initial_alias,initial_ensg,o#,ortholog_name,ortholog_ensg,description
0,1,LRP1,ENSG00000123384,1.1.1,,,
1,2,MTNR1A,ENSG00000168412,2.1.1,,,
2,3,MTNR1B,ENSG00000134640,3.1.1,,,
3,4,AGTR2,ENSG00000180772,4.1.1,,,
4,5,BDKRB2,ENSG00000168398,5.1.1,,,


In [24]:
gprofiler_ligands = gprofiler_ligands.loc[~gprofiler_ligands['ortholog_ensg'].isnull()].reset_index(drop=True)

In [25]:
gprofiler_receptors = gprofiler_receptors.loc[~gprofiler_receptors['ortholog_ensg'].isnull()].reset_index(drop=True)

# Extract list of Ligands and Receptors

In [26]:
ligands = set()
receptors = set()

for pair in tqdm(LR_pairs['Pair.Name'].values):
    ligand, receptor = pair.split('_')
    
    new_ligands = ortholist[ortholist.HGNC == ligand]['CE_WB_CURRENT'].values.tolist() \
                  + orthodb[orthodb.Protein == ligand]['WB'].values.tolist() \
                  + gprofiler_ligands[gprofiler_ligands.initial_alias == ligand]['ortholog_ensg'].values.tolist()
    
    new_receptors = ortholist[ortholist.HGNC == receptor]['CE_WB_CURRENT'].values.tolist() \
                    + orthodb[orthodb.Protein == receptor]['WB'].values.tolist() \
                    + gprofiler_receptors[gprofiler_receptors.initial_alias == receptor]['ortholog_ensg'].values.tolist()
    
    for l in new_ligands:
        ligands.add(l)
    for r in new_receptors:
        receptors.add(r)

        
        
ligands = ligands.union(set(literature['A'].values.tolist()))
ligands = list(ligands)
receptors = receptors.union(set(literature['B'].values.tolist()))
receptors = list(receptors)

100%|██████████| 2557/2557 [00:20<00:00, 127.49it/s]


In [27]:
#ligands.remove('')
#receptors.remove('')

In [28]:
print('There are {} ligands and {} receptors'.format(len(ligands), len(receptors)))

There are 619 ligands and 698 receptors


**Number of shared ligand-receptors -> May need manual curation**

In [29]:
len(set(ligands).intersection(set(receptors)))

108

In [30]:
set(ligands).intersection(set(receptors))

{'',
 'WBGene00000095',
 'WBGene00000149',
 'WBGene00000168',
 'WBGene00000395',
 'WBGene00000396',
 'WBGene00000674',
 'WBGene00000708',
 'WBGene00000792',
 'WBGene00001106',
 'WBGene00001403',
 'WBGene00001609',
 'WBGene00001612',
 'WBGene00001613',
 'WBGene00001614',
 'WBGene00001615',
 'WBGene00001616',
 'WBGene00001617',
 'WBGene00001618',
 'WBGene00001687',
 'WBGene00001980',
 'WBGene00002081',
 'WBGene00002243',
 'WBGene00002282',
 'WBGene00002976',
 'WBGene00003001',
 'WBGene00003071',
 'WBGene00003072',
 'WBGene00003169',
 'WBGene00003482',
 'WBGene00003497',
 'WBGene00003774',
 'WBGene00003775',
 'WBGene00003929',
 'WBGene00004211',
 'WBGene00004371',
 'WBGene00004374',
 'WBGene00004732',
 'WBGene00006619',
 'WBGene00006623',
 'WBGene00006625',
 'WBGene00006780',
 'WBGene00006987',
 'WBGene00007565',
 'WBGene00008314',
 'WBGene00008449',
 'WBGene00008477',
 'WBGene00008779',
 'WBGene00009114',
 'WBGene00009386',
 'WBGene00009854',
 'WBGene00009856',
 'WBGene00009857',
 'WBGen

# Look for LR pairs in PPI networks

In [31]:
LR_in_stringdb = c2c.preprocessing.filter_ppi_network(stringdb,
                                                      contact_proteins=ligands,
                                                      mediator_proteins=receptors,
                                                      interaction_type='mediated',
                                                      bidirectional=False,
                                                     )
LR_in_stringdb.columns = ['Ligand', 'Receptor', 'score']

Filtering PPI interactions by using a list of genes for mediated interactions


In [32]:
LR_in_rspgm = c2c.preprocessing.filter_ppi_network(rspgm,
                                                   contact_proteins=ligands,
                                                   mediator_proteins=receptors,
                                                   interaction_type='mediated',
                                                   bidirectional=False,
                                                   )
LR_in_rspgm.columns = ['Ligand', 'Receptor', 'score']

Filtering PPI interactions by using a list of genes for mediated interactions


In [33]:
LR_in_literature = c2c.preprocessing.filter_ppi_network(literature,
                                                        contact_proteins=ligands,
                                                        mediator_proteins=receptors,
                                                        interaction_type='mediated',
                                                        bidirectional=False,
                                                        )
LR_in_literature.columns = ['Ligand', 'Receptor', 'score']

Filtering PPI interactions by using a list of genes for mediated interactions


In [34]:
print('LR pairs in stringdb: {}'.format(LR_in_stringdb.shape[0]))
print('LR pairs in rspgm: {}'.format(LR_in_rspgm.shape[0]))
print('LR pairs in literature: {}'.format(LR_in_literature.shape[0]))

LR pairs in stringdb: 550
LR pairs in rspgm: 44
LR pairs in literature: 144


In [35]:
celegans_LR_pairs = pd.concat([LR_in_stringdb, LR_in_rspgm, LR_in_literature]).reset_index(drop=True)

In [36]:
celegans_LR_pairs

Unnamed: 0,Ligand,Receptor,score
0,WBGene00001613,WBGene00003774,1.0
1,WBGene00001613,WBGene00006779,1.0
2,WBGene00001613,WBGene00003775,1.0
3,WBGene00000182,WBGene00004040,1.0
4,WBGene00000182,WBGene00006780,1.0
...,...,...,...
733,WBGene00003001,WBGene00000168,1.0
734,WBGene00003001,WBGene00001106,1.0
735,WBGene00001609,WBGene00000168,1.0
736,WBGene00001609,WBGene00001106,1.0


In [37]:
celegans_LR_pairs = celegans_LR_pairs.drop_duplicates().reset_index(drop=True)

In [38]:
len(celegans_LR_pairs)

694

In [39]:
celegans_LR_pairs.to_csv('../Data/PPI-Networks/Celegans-Ortholog-LR-pairs.csv', index=False)

In [40]:
print('Number of Ligands: {}'.format(celegans_LR_pairs.Ligand.unique().shape[0]))
print('Number of Receptors: {}'.format(celegans_LR_pairs.Receptor.unique().shape[0]))

Number of Ligands: 242
Number of Receptors: 174
