## SIDER Indications Drug-Set Library
### Drug-set labels: Indications
#### ALL DATABASES ACCESSED 01/2020
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import json
import pandas as pd
import requests
import time
from collections import defaultdict
import os

In [2]:
# export_script lives in the shared scripts directory, so step into it for the
# duration of the import. The try/finally guarantees the working directory is
# restored even when the import fails; otherwise a failed import would leave the
# kernel inside ../../scripts and break every relative path used further down.
os.chdir('../../scripts')
try:
    # Star import kept for compatibility. This notebook later calls
    # library_counts and gmt_formatter, which are not defined here --
    # presumably they are provided by export_script (TODO confirm).
    from export_script import *
finally:
    os.chdir('../notebooks/SIDER')

### Importing SIDER Drugs and Indications
#### Input Files : meddra_all_indications.tsv (http://sideeffects.embl.de/download/)

In [3]:
# Load the SIDER table of STITCH compound IDs and their matched indications.
# Column labels are supplied explicitly through `names`.
indication_columns = [
    'STITCH_ID_FLAT',
    'UMLS_ID_Label',
    'Detection_Method',
    'MedDRA_Concept_Name',
    'MedDRA_Concept_Type',
    'UMLS_ID_MedDRA',
    'Indication',
]
df_data = pd.read_csv('input/meddra_all_indications.tsv', sep='\t', names=indication_columns)

# Lower-case the indication names, then discard every row that has a missing value
df_data = df_data.assign(Indication=df_data['Indication'].str.lower()).dropna()

In [4]:
# Preview the first five rows of the loaded indications table
df_data.head(n=5)

Unnamed: 0,STITCH_ID_FLAT,UMLS_ID_Label,Detection_Method,MedDRA_Concept_Name,MedDRA_Concept_Type,UMLS_ID_MedDRA,Indication
0,CID100000085,C0015544,text_mention,Failure to Thrive,LLT,C0015544,failure to thrive
1,CID100000085,C0015544,text_mention,Failure to Thrive,PT,C0015544,failure to thrive
2,CID100000085,C0020615,text_mention,Hypoglycemia,LLT,C0020615,hypoglycaemia
3,CID100000085,C0020615,text_mention,Hypoglycemia,PT,C0020615,hypoglycaemia
4,CID100000085,C0022661,NLP_indication,"Kidney Failure, Chronic",LLT,C0022661,renal failure chronic


In [5]:
# Each drug-indication pair is essentially duplicated: one entry is the MedDRA
# "Lowest Level Term" (LLT) and the other is the "Preferred Term" (PT).
# Drop the LLT rows so drugs are only associated with their Preferred Term.
is_lowest_level_term = df_data['MedDRA_Concept_Type'].str.contains("LLT")

# .copy() makes the filtered result an independent frame, so assigning columns
# on df_data afterwards does not raise pandas' SettingWithCopyWarning.
df_data = df_data[~is_lowest_level_term].copy()

### Mapping STITCH IDs to DrugBank IDs

In [6]:
# Converting STITCH IDs to PubChem IDs: strip the first four characters
# (e.g. "CID1" in "CID100000085") and parse the remaining digits as an integer.
# assign() builds a new frame instead of writing into the filtered one, and the
# .str accessor does the slicing for the whole column at once.
df_data = (
    df_data
    .assign(STITCH_ID_FLAT=df_data['STITCH_ID_FLAT'].str[4:].astype('int64'))
    .rename(columns={'STITCH_ID_FLAT': 'pubchem_id'})
)

In [8]:
# Load the tab-separated lookup table that pairs DrugBank IDs with PubChem IDs,
# then preview its first five rows
pubchem_lexicon_path = '../../drugbank_lexicon/pubchem.tsv'
drugbank_mapping = pd.read_csv(pubchem_lexicon_path, delimiter='\t')
drugbank_mapping.head(n=5)

Unnamed: 0,drugbank_id,pubchem_id
0,DB00006,101041682
1,DB00006,126480209
2,DB00006,132229728
3,DB00006,134827539
4,DB00006,137167482


In [9]:
# Attach a DrugBank ID to every indication row by inner-joining the two frames
# on pubchem_id, the only column they have in common
df_data = pd.merge(drugbank_mapping, df_data, how='inner', on='pubchem_id')

In [10]:
# Preview the first five rows after DrugBank IDs have been attached
df_data.head(n=5)

Unnamed: 0,drugbank_id,pubchem_id,UMLS_ID_Label,Detection_Method,MedDRA_Concept_Name,MedDRA_Concept_Type,UMLS_ID_MedDRA,Indication
0,DB00006,16129704,C0002965,NLP_indication,"Angina, Unstable",PT,C0002965,angina unstable
1,DB00006,16129704,C0002965,NLP_indication,"Angina, Unstable",PT,C0002962,angina pectoris
2,DB00006,16129704,C0019080,NLP_precondition,Hemorrhage,PT,C0019080,haemorrhage
3,DB00006,16129704,C0019080,NLP_precondition,Hemorrhage,PT,C0518015,haemoglobin
4,DB00006,16129704,C0027051,NLP_precondition,Myocardial Infarction,PT,C0027051,myocardial infarction


### Matching drugs to indications

In [13]:
# Pair every indication with the DrugBank ID found on the same row.
# Note: despite its name, id_dict is a tuple of (indication, drugbank_id) pairs.
indication_drug_pairs = df_data[['Indication', 'drugbank_id']]
id_dict = tuple(indication_drug_pairs.itertuples(index=False, name=None))

In [14]:
# Build the drug-set library: every indication maps to the list of DrugBank IDs
# it was paired with (repeated IDs are kept at this stage)
drugsetlibrary = defaultdict(list)
for indication, drugbank_id in id_dict:
    drugsetlibrary[indication].append(drugbank_id)

In [15]:
# Removing terms with fewer than MIN_DRUGS_PER_TERM distinct drug associations.
MIN_DRUGS_PER_TERM = 5

# Deduplicate each term's drugs once (the set is reused for both the size check
# and the output).
unique_drugs_by_term = ((term, set(drugs)) for term, drugs in drugsetlibrary.items())

# sorted() gives every drug list a stable order; list(set(...)) order can change
# between kernel sessions, which would make the exported library differ run to run.
drugsetlibrary = {
    term: sorted(unique_drugs)
    for term, unique_drugs in unique_drugs_by_term
    if len(unique_drugs) >= MIN_DRUGS_PER_TERM
}

### Library counts

In [16]:
# Report summary statistics for the filtered library (unique drugs, unique terms,
# unique associations, average drugs per term -- see the cell output).
# library_counts is not defined in this notebook; presumably it comes from export_script.
library_counts(drugsetlibrary)

1547 unique drugs
868 unique association terms
18805 unique associations
21.6647465437788 average drugs per term


### Exporting the drug-set library in GMT format

In [17]:
# Switch to the output data directory. The path is relative to the current
# working directory, so re-running this cell moves the kernel again.
os.chdir('../../data/SIDER')

In [18]:
# Export the drug-set library under the given GMT file name.
# gmt_formatter is not defined in this notebook (presumably from export_script);
# the file is presumably written relative to the current working directory -- TODO confirm.
gmt_formatter(drugsetlibrary, 'SIDER_indications_drugsetlibrary.gmt')