## SIDER Side-Effect Drug-Set Library
### Drug-set labels: Side Effects
#### ALL DATABASES ACCESSED 05/01/19
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import json
import pandas as pd
import requests
import time
from collections import defaultdict
import os

In [2]:
# Step into the shared scripts directory so that export_script can be imported by its bare name.
os.chdir('../../scripts')
# Star-import: every public name of export_script lands in the notebook namespace
# (presumably including library_counts and gmt_formatter, which later cells call — confirm in export_script).
from export_script import *
# Move to the SIDER notebook folder; the relative 'input/...' paths used by later cells
# are resolved against this working directory.
os.chdir('../notebooks/SIDER')

### Importing SIDER data
#### Database Accessed : http://sideeffects.embl.de/download/
#### Input Files : meddra_all_se.tsv 

In [3]:
# Load the SIDER side-effect table. The file is read with explicit column labels
# (it is parsed as header-less), side-effect names are lower-cased, and any row
# containing a missing value is discarded.
sider_columns = [
    'STITCH_ID_FLAT',
    'STITCH_ID_STEREO',
    'UMLS_ID_Label',
    'MedDRA_Concept_Type',
    'UMLS_ID_MedDRA',
    'Side_Effect',
]
df_data = (
    pd.read_csv('input/meddra_all_se.tsv', sep = '\t', names = sider_columns)
    .assign(Side_Effect = lambda frame: frame['Side_Effect'].str.lower())
    .dropna()
)

In [4]:
# Preview the first rows of the loaded table (side-effect names lower-cased, incomplete rows dropped).
df_data.head()

Unnamed: 0,STITCH_ID_FLAT,STITCH_ID_STEREO,UMLS_ID_Label,MedDRA_Concept_Type,UMLS_ID_MedDRA,Side_Effect
0,CID100000085,CID000010917,C0000729,LLT,C0000729,abdominal cramps
1,CID100000085,CID000010917,C0000729,PT,C0000737,abdominal pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,abdominal pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,gastrointestinal pain
4,CID100000085,CID000010917,C0000737,PT,C0000737,abdominal pain


In [5]:
# Per the data layout, a drug/side-effect pair shows up once as a "Lowest Level Term" (LLT)
# and once as a "Preferred Term" (PT). Only the Preferred Term rows are wanted, so every
# row whose concept type contains "LLT" is filtered out.
is_lowest_level_term = df_data['MedDRA_Concept_Type'].str.contains("LLT")
df_data = df_data[~is_lowest_level_term]

In [6]:
# Number of rows left once the LLT rows have been removed.
len(df_data)

163206

In [7]:
# Exporting edited table with column labels:
# tab-separated, header row included, row index omitted. At this point the table
# holds only the rows that survived the LLT filter.
df_data.to_csv('input/SIDER_meddra_metadata.tsv', sep = '\t', index = False)

### Mapping STITCH IDs to DrugBank IDs

In [8]:
# Converting STITCH IDs to PubChem IDs.
# The stereo identifier is a four-character prefix followed by digits
# (e.g. "CID000010917" -> 10917), so dropping the first four characters and casting
# to an integer yields the numeric ID; the column is then renamed to 'pubchem_id'
# so it lines up with the mapping table's key.
# .assign()/.rename() build a new frame rather than writing a column into the filtered
# slice produced by the LLT step, which avoids pandas' SettingWithCopyWarning, and the
# vectorised .str slice replaces the per-row lambda. The explicit int64 cast keeps the
# key dtype independent of platform defaults.
df_data = (
    df_data
    .assign(STITCH_ID_STEREO = lambda frame: frame['STITCH_ID_STEREO'].str[4:].astype('int64'))
    .rename(columns = {'STITCH_ID_STEREO': 'pubchem_id'})
)

In [9]:
# Load the tab-separated mapping table (its preview shows drugbank_id, pubchem_id and
# inchi_key columns) and display the first rows.
drugbank_mapping = pd.read_csv('../../metadata/mapping_files/pubchem.tsv', sep = '\t')
drugbank_mapping.head()

Unnamed: 0,drugbank_id,pubchem_id,inchi_key
0,DB00006,101041682,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00006,126480209,OIRCOABEOLEUMC-GEJPAHFPSA-N
2,DB00006,132229728,OIRCOABEOLEUMC-GEJPAHFPSA-N
3,DB00006,134827539,OIRCOABEOLEUMC-GEJPAHFPSA-N
4,DB00006,137167482,OIRCOABEOLEUMC-GEJPAHFPSA-N


In [10]:
# Merge the two dataframes so that a DrugBank ID is matched to each STITCH ID.
# The join key and join type are spelled out: with no `on`, merge() joins on every
# column name the two frames happen to share, so an extra shared column in either
# input would silently change the result. Inner join: side-effect rows whose
# PubChem ID has no entry in the mapping table are dropped.
df_data = drugbank_mapping.merge(df_data, on = 'pubchem_id', how = 'inner')

In [11]:
# Preview the merged table (mapping columns followed by the SIDER columns).
df_data.head()

Unnamed: 0,drugbank_id,pubchem_id,inchi_key,STITCH_ID_FLAT,UMLS_ID_Label,MedDRA_Concept_Type,UMLS_ID_MedDRA,Side_Effect
0,DB00006,16129704,OIRCOABEOLEUMC-GEJPAHFPSA-N,CID116129704,C0000737,PT,C0687713,gastrointestinal pain
1,DB00006,16129704,OIRCOABEOLEUMC-GEJPAHFPSA-N,CID116129704,C0000737,PT,C0000737,abdominal pain
2,DB00006,16129704,OIRCOABEOLEUMC-GEJPAHFPSA-N,CID116129704,C0002792,PT,C0002792,anaphylactic shock
3,DB00006,16129704,OIRCOABEOLEUMC-GEJPAHFPSA-N,CID116129704,C0002871,PT,C0002871,anaemia
4,DB00006,16129704,OIRCOABEOLEUMC-GEJPAHFPSA-N,CID116129704,C0002962,PT,C0002962,angina pectoris


### Matching Side Effects with DrugBank IDs

In [12]:
# Pair every row's side effect with its DrugBank ID as a (side_effect, drugbank_id) tuple.
# Despite the name, id_dict is a tuple of pairs, not a dict. Repeated pairs are kept here;
# duplicates are only collapsed later, when the drug-set library is filtered.
id_dict = tuple(
    df_data[['Side_Effect', 'drugbank_id']].itertuples(index = False, name = None)
)

In [13]:
# Build the drug-set library: group the (side effect, DrugBank ID) pairs by side effect,
# so each term maps to the list of every drug paired with it (repeats are kept at this stage).
drugsetlibrary = defaultdict(list)
for pair in id_dict:
    side_effect, drugbank_id = pair
    drugsetlibrary[side_effect].append(drugbank_id)

In [14]:
# Number of distinct side-effect terms before the minimum-size filter is applied.
len(drugsetlibrary)

4134

In [15]:
# Removing terms with fewer than 5 drug associations.
# Duplicate IDs are collapsed first, so the threshold counts distinct DrugBank IDs.
# The surviving IDs are sorted because iterating a set of strings has no stable order
# across interpreter runs (string hashing is randomised by default), which could
# otherwise reorder the drug lists in the exported library from one run to the next.
MIN_DRUGS_PER_TERM = 5
unique_drugs_by_term = {term: set(drugs) for term, drugs in drugsetlibrary.items()}
drugsetlibrary = {
    term: sorted(unique_drugs)
    for term, unique_drugs in unique_drugs_by_term.items()
    if len(unique_drugs) >= MIN_DRUGS_PER_TERM
}

### Library counts

In [16]:
# Report summary counts for the filtered library (unique drugs, terms, associations,
# average drugs per term). library_counts is presumably provided by export_script — confirm.
library_counts(drugsetlibrary)

1636 unique drugs
2078 unique association terms
155116 unique associations
74.64677574590952 average drugs per term


### Exporting the drug-set library in GMT format

In [17]:
# Switch the working directory to the SIDER data folder; the export below is given a bare
# filename, so it is presumably written relative to this directory — confirm in gmt_formatter.
os.chdir('../../data/SIDER')

In [18]:
# Export the filtered drug-set library under the given filename.
# NOTE(review): gmt_formatter is presumably provided by export_script and presumably writes
# one line per term in GMT layout — confirm against its definition.
gmt_formatter(drugsetlibrary, 'SIDER_side_effects_drugsetlibrary.gmt')