## STITCH Targets Drugsetlibrary
### Each chemical-protein interaction from STITCH is scored on a scale from 150 to 1000 based on the confidence of the interaction. Drug-protein interaction coverage across various cut-offs will be assessed.

#### ALL DATABASES ACCESSED 10/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import pandas as pd
import requests
import time
import numpy as np
import os
from collections import defaultdict

In [2]:
# Temporarily switch to the shared scripts directory so that export_script can be
# imported, then return to the directory the notebook was started from.
# library_counts and gmt_formatter used further down are not defined in this
# notebook; presumably they come from this star import.
notebook_dir = os.getcwd()
os.chdir('../../scripts')
try:
    from export_script import *
finally:
    # Restore the real starting directory (rather than assuming it is named
    # notebooks/STITCH), and do so even when the import fails, so that the
    # relative data paths below keep working and the cell can be re-run.
    os.chdir(notebook_dir)

### Importing STITCH chemical-protein interactions and the lookup table for converting STITCH chemical IDs to DrugBank IDs

In [3]:
# Load the STITCH chemical-protein links for taxon 9606 (tab-separated).
# Columns shown in the preview below: chemical, protein, combined_score.
interactions = pd.read_csv('input/9606.protein_chemical.links.v5.0.tsv', sep='\t')

In [4]:
interactions.head(3)

Unnamed: 0,chemical,protein,combined_score
0,CIDm91758680,9606.ENSP00000257254,279
1,CIDm91758680,9606.ENSP00000302120,154
2,CIDm91758408,9606.ENSP00000006777,225


In [5]:
# Derive the numeric PubChem ID from each STITCH compound identifier by dropping
# its 4-character prefix (e.g. "CIDm") and parsing the remaining digits.
interactions['pubchem_id'] = [int(stitch_id[4:]) for stitch_id in interactions['chemical']]

In [6]:
interactions.head(3)

Unnamed: 0,chemical,protein,combined_score,pubchem_id
0,CIDm91758680,9606.ENSP00000257254,279,91758680
1,CIDm91758680,9606.ENSP00000302120,154,91758680
2,CIDm91758408,9606.ENSP00000006777,225,91758408


In [7]:
# Import the DrugBank mapping file (tab-separated).
# Columns shown in the preview below: drugbank_id, pubchem_id, inchi_key.
drugbank_mapping = pd.read_csv('../../metadata/mapping_files/pubchem.tsv', sep = '\t')

In [8]:
drugbank_mapping.head(3)

Unnamed: 0,drugbank_id,pubchem_id,inchi_key
0,DB00006,101041682,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00006,126480209,OIRCOABEOLEUMC-GEJPAHFPSA-N
2,DB00006,132229728,OIRCOABEOLEUMC-GEJPAHFPSA-N


In [9]:
# Attach DrugBank ids and InChIKeys to every interaction via the shared PubChem id.
# The join key is named explicitly so that an unrelated column shared by both
# tables can never silently become part of the key. This is an inner join:
# interactions whose PubChem id has no DrugBank mapping are dropped, and an
# interaction is repeated once per DrugBank entry mapped to its PubChem id.
interactions_drugbank = interactions.merge(drugbank_mapping, on='pubchem_id', how='inner')

In [10]:
interactions_drugbank.head()

Unnamed: 0,chemical,protein,combined_score,pubchem_id,drugbank_id,inchi_key
0,CIDm91758271,9606.ENSP00000012134,194,91758271,DB00683,DDLIGBOFAVUZHB-UHFFFAOYSA-N
1,CIDm91758271,9606.ENSP00000012134,194,91758271,DB00813,PJMPHNIQZUBGLI-UHFFFAOYSA-N
2,CIDm91758271,9606.ENSP00000181383,162,91758271,DB00683,DDLIGBOFAVUZHB-UHFFFAOYSA-N
3,CIDm91758271,9606.ENSP00000181383,162,91758271,DB00813,PJMPHNIQZUBGLI-UHFFFAOYSA-N
4,CIDm91758271,9606.ENSP00000200652,168,91758271,DB00683,DDLIGBOFAVUZHB-UHFFFAOYSA-N


### Importing the lookup table for mapping STRING protein IDs to Entrez gene symbols

In [11]:
# Load the protein-id -> gene-symbol lookup table (comma-separated).
# Later cells rely on its `protein`, `gene symbol` and `approved symbol` columns.
entrez_lookup = pd.read_csv('input/STRING_to_entrez.csv')

### Importing library specific functions for parsing data

In [12]:
def get_interactionlist(score_cutoff, interactions, entrez_lookup):
    '''
    Filter chemical-protein interactions by confidence and annotate them with gene symbols.

    Rows scoring below `score_cutoff` are dropped, the remainder is inner-joined to
    `entrez_lookup` on the `protein` column (so proteins absent from the lookup table are
    discarded), rows with a missing protein or DrugBank id are removed, and only the first
    row of every (protein, drugbank_id) pair is kept. A short summary of the result is
    printed as a side effect.

    Parameters:
    score_cutoff (int): Minimum `combined_score` (inclusive) an interaction needs to be kept
    interactions (dataframe): Chemical-protein interactions; must contain the columns
        `protein`, `combined_score` and `drugbank_id`
    entrez_lookup (dataframe): Lookup table keyed by `protein`; must contain a `gene symbol` column

    Returns:
    dataframe: The lookup-table columns followed by the remaining interaction columns, with one
        row per unique (protein, drugbank_id) pair. The input frames are not modified and
        the index is not renumbered after filtering.
    '''

    print('removing low-confidence interactions')
    confident = interactions.loc[interactions['combined_score'] >= score_cutoff]

    print('converting STRING protein ids to gene symbols')
    # Inner join on the protein identifier: interactions whose protein has no entry in the
    # lookup table are removed. The key is named explicitly so that any other column the two
    # tables might share cannot silently become part of the join key.
    annotated = entrez_lookup.merge(confident, on='protein', how='inner')

    print('returning output')
    # pandas matches missing join keys with each other, so null proteins can survive the merge
    annotated = annotated.dropna(subset=['protein', 'drugbank_id'])
    annotated = annotated.drop_duplicates(subset=['protein', 'drugbank_id'])

    # Summary table (count / unique / top / freq) for drugs and gene symbols
    count = annotated[['drugbank_id', 'gene symbol']].describe()
    print('')
    print(count)
    print('')

    # Average number of retained interactions per distinct drug. Computed directly instead of
    # by row position in the describe() table, and guarded so that an empty result
    # (e.g. a cutoff above every score) reports nan rather than dividing by zero.
    n_interactions = len(annotated)
    n_drugs = annotated['drugbank_id'].nunique()
    average = n_interactions / n_drugs if n_drugs else float('nan')
    print('Average chemical-protein interactions: ' + str(average))

    return annotated

def drugsetlibrary_converter(interactions, cutoff = 5):
    '''
    Build a drug-set library: a dict mapping each gene symbol to the drugs paired with it.

    Parameters:
    interactions (dataframe): Table with an `approved symbol` column (used as the set name)
        and an `inchi_key` column (used as the drug identifier)
    cutoff (int): Minimum number of distinct drugs a set must contain to be kept (default = 5)

    Returns:
    dict: {approved symbol: list of distinct InChIKeys}. Sets appear in order of first
        occurrence in `interactions`; the drugs inside each set are sorted so that the
        library (and any file written from it) is identical from run to run.
    '''
    symbols = interactions['approved symbol'].tolist()
    inchi_keys = interactions['inchi_key'].tolist()

    # Collect the distinct drugs seen for every gene symbol
    drugsets = defaultdict(set)
    for symbol, inchi_key in zip(symbols, inchi_keys):
        drugsets[symbol].add(inchi_key)

    # Keep only sets with at least `cutoff` distinct drugs. Plain set iteration order for
    # strings changes between interpreter runs, so the members are sorted; key=str keeps the
    # sort from failing should a missing (non-string) value be present among the keys.
    return {
        symbol: sorted(drugs, key=str)
        for symbol, drugs in drugsets.items()
        if len(drugs) >= cutoff
    }

### Each chemical-protein interaction is scored on a scale from 150 to 1000 based on the confidence of the interaction
### Filtering the interaction list by combined_score cutoff ranging from 500-900
#### Input Files: STRING_to_entrez.csv (generated from STRING_to_Entrez.ipynb) | 9606.protein_chemical.links.v5.0.tsv (http://stitch.embl.de/cgi/download.pl?UserId=RZH952LT3dmF&sessionId=8ndHqEEYWEuz&species_text=Homo+sapiens) | pubchem.tsv (DrugBank-to-PubChem mapping file in ../../metadata/mapping_files)

## 500

In [13]:
# Keep interactions with combined_score >= 500; the call also prints a summary table.
interactions_500 = get_interactionlist(500, interactions_drugbank, entrez_lookup)

removing low-confidence interactions
converting STRING protein ids to gene symbols
returning output

       drugbank_id gene symbol
count       813345      813345
unique        7357       11467
top        DB14497        NOS3
freq          5825         609

Average chemical-protein interactions: 110.55389425037379


In [14]:
interactions_500.head()

Unnamed: 0,protein,gene symbol,approved symbol,chemical,combined_score,pubchem_id,drugbank_id,inchi_key
0,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB00653,CSNNHWWHGAXBCP-UHFFFAOYSA-L
1,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB06778,ARUVKPQLZAKDPS-UHFFFAOYSA-L
2,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB08938,SPPNVMTVMQOKSC-UHFFFAOYSA-A
3,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB09087,GRLPQNLYRHEGIJ-UHFFFAOYSA-J
4,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB09322,NWONKYPBYAMBJT-UHFFFAOYSA-L


In [15]:
# Build the approved-symbol -> InChIKey library from the 500-cutoff table,
# using the converter's default minimum set size (cutoff = 5).
drugsetlibrary_500 = drugsetlibrary_converter(interactions_500)

In [16]:
# Print summary counts for the library.
# library_counts is not defined in this notebook; presumably it comes from export_script.
library_counts(drugsetlibrary_500)

7303 unique drugs
9063 unique association terms
807127 unique associations
89.05737614476443 average drugs per term


In [17]:
# Export the 500-cutoff library to a GMT file.
# gmt_formatter is not defined in this notebook; presumably it comes from export_script.
# NOTE(review): the output filename does not record the score cutoff, and the cells shown
# here build no library for the 600-900 tables — confirm that exporting only 500 is intended.
gmt_formatter(drugsetlibrary_500, '../../data/STITCH/STITCH_target_drugsetlibrary.gmt')

## 600

In [18]:
# Keep interactions with combined_score >= 600; the call also prints a summary table.
interactions_600 = get_interactionlist(600, interactions_drugbank, entrez_lookup)

removing low-confidence interactions
converting STRING protein ids to gene symbols
returning output

       drugbank_id gene symbol
count       663620      663620
unique        7059       10453
top        DB14497        NOS3
freq          5213         570

Average chemical-protein interactions: 94.01048307125656


In [19]:
interactions_600.head()

Unnamed: 0,protein,gene symbol,approved symbol,chemical,combined_score,pubchem_id,drugbank_id,inchi_key
0,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB00653,CSNNHWWHGAXBCP-UHFFFAOYSA-L
1,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB06778,ARUVKPQLZAKDPS-UHFFFAOYSA-L
2,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB08938,SPPNVMTVMQOKSC-UHFFFAOYSA-A
3,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB09087,GRLPQNLYRHEGIJ-UHFFFAOYSA-J
4,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB09322,NWONKYPBYAMBJT-UHFFFAOYSA-L


## 700

In [20]:
# Keep interactions with combined_score >= 700; the call also prints a summary table.
interactions_700 = get_interactionlist(700, interactions_drugbank, entrez_lookup)

removing low-confidence interactions
converting STRING protein ids to gene symbols
returning output

       drugbank_id gene symbol
count       524417      524417
unique        6708        9418
top        DB14497       CASP3
freq          4453         525

Average chemical-protein interactions: 78.177847346452


In [21]:
interactions_700.head()

Unnamed: 0,protein,gene symbol,approved symbol,chemical,combined_score,pubchem_id,drugbank_id,inchi_key
0,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB00653,CSNNHWWHGAXBCP-UHFFFAOYSA-L
1,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB06778,ARUVKPQLZAKDPS-UHFFFAOYSA-L
2,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB08938,SPPNVMTVMQOKSC-UHFFFAOYSA-A
3,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB09087,GRLPQNLYRHEGIJ-UHFFFAOYSA-J
4,9606.ENSP00000376139,WDR45L,WDR45B,CIDm00001117,704,1117,DB09322,NWONKYPBYAMBJT-UHFFFAOYSA-L


## 800

In [22]:
# Keep interactions with combined_score >= 800; the call also prints a summary table.
interactions_800 = get_interactionlist(800, interactions_drugbank, entrez_lookup)

removing low-confidence interactions
converting STRING protein ids to gene symbols
returning output

       drugbank_id gene symbol
count       430182      430182
unique        6306        8382
top        DB14497        NOS3
freq          3837         440

Average chemical-protein interactions: 68.21788772597526


In [23]:
interactions_800.head()

Unnamed: 0,protein,gene symbol,approved symbol,chemical,combined_score,pubchem_id,drugbank_id,inchi_key
0,9606.ENSP00000264935,CEP72,CEP72,CIDm00000238,900,238,DB00171,ZKHQWZAMYRWXGA-KQYNXXCUSA-N
2,9606.ENSP00000218316,GPR50,GPR50,CIDm00082148,828,82148,DB06594,YJYPHIXNFHFHND-UHFFFAOYSA-N
4,9606.ENSP00000218316,GPR50,GPR50,CIDm00000896,992,896,DB01065,DRLFMBDRBRZALE-UHFFFAOYSA-N
6,9606.ENSP00000420270,UHMK1,UHMK1,CIDm00000238,819,238,DB00171,ZKHQWZAMYRWXGA-KQYNXXCUSA-N
8,9606.ENSP00000342952,ADCY2,ADCY2,CIDm03052762,800,3052762,DB06480,ZPMNHBXQOOVQJL-UHFFFAOYSA-N


## 900

In [24]:
# Keep interactions with combined_score >= 900; the call also prints a summary table.
interactions_900 = get_interactionlist(900, interactions_drugbank, entrez_lookup)

removing low-confidence interactions
converting STRING protein ids to gene symbols
returning output

       drugbank_id gene symbol
count       259756      259756
unique        4736        5955
top        DB14481        NOS3
freq          2888         336

Average chemical-protein interactions: 54.84712837837838


In [25]:
interactions_900.head()

Unnamed: 0,protein,gene symbol,approved symbol,chemical,combined_score,pubchem_id,drugbank_id,inchi_key
0,9606.ENSP00000264935,CEP72,CEP72,CIDm00000238,900,238,DB00171,ZKHQWZAMYRWXGA-KQYNXXCUSA-N
2,9606.ENSP00000218316,GPR50,GPR50,CIDm00000896,992,896,DB01065,DRLFMBDRBRZALE-UHFFFAOYSA-N
4,9606.ENSP00000342952,ADCY2,ADCY2,CIDm00065153,900,65153,DB05767,BOJKULTULYSRAS-OTESTREVSA-N
5,9606.ENSP00000342952,ADCY2,ADCY2,CIDm00016741,900,16741,DB12695,IZJDOKYDEWTZSO-UHFFFAOYSA-N
7,9606.ENSP00000342952,ADCY2,ADCY2,CIDm00014989,900,14989,DB15477,AFHJQYHRLPMKHU-CGISPIQUSA-N
