## Import Libraries

In [1]:
import datetime
import os
import sys
from urllib.parse import quote

import chardet
import numpy as np
import pandas as pd
import requests
from scipy.spatial.distance import squareform, pdist, jaccard

## Load Data

#### Load SIDER Side Effects Data

In [2]:
# Column labels are supplied explicitly, so the first line of the file is read as data.
sider_columns = [
    'STITCH ID flat',
    'STITCH ID stereo',
    'UMLS Concept Type',
    'Concept Type',
    'UMLS Concept ID',
    'Side Effect',
]
df_with_concepts = pd.read_csv('Input/meddra_all_se.tsv', sep='\t', names=sider_columns)

In [3]:
df_with_concepts.head()

Unnamed: 0,STITCH ID flat,STITCH ID stereo,UMLS Concept Type,Concept Type,UMLS Concept ID,Side Effect
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain


#### Load Stitch to PubChem ID table

In [None]:
# stitch_PCID = pd.read_table('Input/Stitch_chemname_ID.tsv')

In [None]:
# stitch_PCID.head()

#### Load LINCS Small Molecules

In [56]:
# Read the LINCS small-molecule table, decoding the file as ISO-8859-1.
lincs_path = 'Input/LINCS_SmallMolecules.csv'
lincs = pd.read_csv(lincs_path, encoding='ISO-8859-1')

In [57]:
lincs.head()

Unnamed: 0,SM_Name,SM_LINCS_ID,SM_Alternative_Name,SM_PubChem_CID,SM_SMILES_Parent,SM_SMILES_Batch,SM_InChi_Parent,SM_Molecular_Mass,MOLECULAR_FORMULA,SM_ChEBI_ID
0,Dichlobenil,LSM-19017,,3031.0,Clc1cccc(Cl)c1C#N,,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,172.01,C7H3Cl2N,943.0
1,AC1NWAJC,LSM-43967,Vulpinic acid,5701993.0,COC(=O)\C(=C\1/OC(=O)C(C1=O)c2ccccc2)\c3ccccc3,,InChI=1S/C19H14O5/c1-23-18(21)15(13-10-6-3-7-1...,322.31,C19H14O5,
2,Sinapic Acid Methyl Ether,LSM-44124,,735755.0,COc1cc(\C=C\C(=O)O)cc(OC)c1OC,,InChI=1S/C12H14O5/c1-15-9-6-8(4-5-11(13)14)7-1...,238.24,C12H14O5,
3,Ferulic acid,LSM-44126,Ferulic acid,445858.0,COc1cc(\C=C\C(=O)O)ccc1O,,InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-1...,194.18,C10H10O4,17620.0
4,Pinosylvin Methyl Ether,LSM-43902,,5281719.0,COc1cc(O)cc(\C=C\c2ccccc2)c1,,InChI=1S/C15H14O2/c1-17-15-10-13(9-14(16)11-15...,226.27,C15H14O2,8227.0


## Drop Concept-Type Columns and Duplicate Rows

In [4]:
# Remove the two concept-type columns; the remaining columns are kept unchanged.
concept_type_columns = ['UMLS Concept Type', 'Concept Type']
df = df_with_concepts.drop(concept_type_columns, axis=1)

In [5]:
# Collapse rows that are identical across every column, keeping the first occurrence.
df = df.drop_duplicates(keep='first')

In [6]:
df.head()

Unnamed: 0,STITCH ID flat,STITCH ID stereo,UMLS Concept ID,Side Effect
0,CID100000085,CID000010917,C0000729,Abdominal cramps
1,CID100000085,CID000010917,C0000737,Abdominal pain
3,CID100000085,CID000010917,C0687713,Gastrointestinal pain
5,CID100000085,CID000010917,C0002418,Amblyopia
7,CID100000085,CID000010917,C0002871,Anaemia


In [7]:
df.shape

(171382, 4)

In [8]:
# Number of distinct flat STITCH IDs (missing values, if any, count as one value).
df['STITCH ID flat'].nunique(dropna=False)

1430

## Get Drug Names from the STITCH API and Build a Dictionary

In [11]:
# One entry per distinct flat STITCH ID, in order of first appearance.
stitch_ID_df = df['STITCH ID flat'].drop_duplicates()
stitch_ID_df.shape

(1430,)

In [None]:
# Look up each unique STITCH ID through the STITCH `resolve` endpoint and store
# the `annotation` field of the first match in `drug_name_dict`
# (STITCH ID -> annotation string).
#
# Every ID that does not yield an annotation is counted in `failed_to_get_name`,
# including network errors and non-JSON responses.
STITCH_RESOLVE_URL = 'http://stitch.embl.de/api/json/resolve'
STITCH_TIMEOUT = 30  # seconds; stops one hung request from stalling the whole loop

stitch_ID_list = stitch_ID_df.tolist()
failed_to_get_name = 0
drug_name_dict = {}

for position, stitch_ID in enumerate(stitch_ID_list, start=1):
    try:
        response = requests.get(
            STITCH_RESOLVE_URL,
            params={'identifier': stitch_ID},
            timeout=STITCH_TIMEOUT,
        )
        # Parse the body once and reuse the result.
        matches = response.json()
    except (requests.RequestException, ValueError):
        # Connection problem, timeout, or a body that is not valid JSON.
        matches = None

    first_match = matches[0] if isinstance(matches, list) and matches else None
    if isinstance(first_match, dict) and 'annotation' in first_match:
        drug_name_dict[stitch_ID] = first_match['annotation']
    else:
        failed_to_get_name += 1

    # Progress is tied to the number of IDs processed, so it prints exactly
    # once every 50 IDs regardless of how many lookups succeeded.
    if position % 50 == 0:
        print(position)

print(failed_to_get_name)
print(len(drug_name_dict))
print(len(stitch_ID_list))


In [None]:
# drug_names = []
# for index, row in df.iterrows():
#     stitch_ID = row.loc['STITCH ID flat']
#     if stitch_ID not in drug_name_dict:
#         df.drop(index, inplace = True)
#     else: 
#         drug_names.append(int(drug_name_dict[drugname]))

# df.loc[:,'Drug Names'] = pd.Series(np.array(drug_names), index=df.index)

## Get PubChem ID and Map to Drug Name

In [None]:
# namesdf = df['Drug Name']
# namesdf = namesdf.drop_duplicates()
# namesdf.shape

In [None]:
# nameslist = namesdf.tolist()



# Resolve every drug name to a PubChem compound ID (CID) through the PubChem
# PUG REST name lookup and store the first CID returned in `CID_dict`
# (STITCH ID -> CID).
#
# Every name that does not yield a CID is counted in `failed_to_get_CID`,
# including network errors and non-JSON responses.
PUBCHEM_NAME_URL = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/cids/JSON'
PUBCHEM_TIMEOUT = 30  # seconds; stops one hung request from stalling the whole loop

failed_to_get_CID = 0
CID_dict = {}
count = 0

for ID, name in drug_name_dict.items():
    count += 1
    # Percent-encode the whole name: replacing only spaces leaves characters
    # such as '#', '?' or '%' free to corrupt the URL.
    url = PUBCHEM_NAME_URL.format(quote(name, safe=''))
    try:
        response = requests.get(url, timeout=PUBCHEM_TIMEOUT)
        # Parse the body once and reuse the result.
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Connection problem, timeout, or a body that is not valid JSON.
        payload = None

    identifier_list = payload.get('IdentifierList') if isinstance(payload, dict) else None
    cids = identifier_list.get('CID') if isinstance(identifier_list, dict) else None
    if cids:
        CID_dict[ID] = cids[0]
    else:
        failed_to_get_CID += 1

    # Progress is tied to the number of names processed, so it prints exactly
    # once every 50 names regardless of how many lookups succeeded.
    if count % 50 == 0:
        print(count)


print(failed_to_get_CID)
print(len(CID_dict))
print(len(drug_name_dict))

In [None]:
# Keep only the rows whose STITCH ID was resolved to a PubChem CID, then attach
# that CID as an integer `CIDs` column.
#
# A boolean mask plus `map` replaces the row-by-row `df.drop(..., inplace=True)`
# loop, which rebuilt the frame once per dropped row and mutated `df` while it
# was being iterated.
has_cid = df['STITCH ID flat'].isin(list(CID_dict))
df = df.loc[has_cid].copy()
df['CIDs'] = df['STITCH ID flat'].map(CID_dict).astype(int)

In [61]:
df.shape

(102274, 5)

## Filter by LINCS Approved Small Molecules

In [59]:
# Drop every row whose CID does not appear in the LINCS small-molecule table and
# report how many rows were removed.
#
# The LINCS CIDs are collected once into a set of integers (missing values
# excluded), so membership is tested in a single vectorised pass instead of
# scanning the LINCS column for every row and dropping rows one at a time.
lincs_cids = set(lincs['SM_PubChem_CID'].dropna().astype('int64'))
in_lincs = df['CIDs'].isin(lincs_cids)
count = int((~in_lincs).sum())
df = df.loc[in_lincs].copy()

print(count)

48904


## Make Binary Matrix

In [63]:
# For each UMLS concept, collect the CIDs of its rows into one comma-separated string.
cids_by_concept = df.groupby(['UMLS Concept ID'])['CIDs']
grouped_df = cids_by_concept.apply(lambda cids: ','.join(cids.astype(str))).reset_index()

In [64]:
# Index the grouped table by concept ID and order it by that index, then show
# how many distinct concept IDs the filtered frame contains.
grouped_df = grouped_df.set_index('UMLS Concept ID').sort_index()
df['UMLS Concept ID'].nunique(dropna=False)

5078

In [65]:
# Expand the comma-separated CID strings into one 0/1 indicator column per CID.
cid_strings = grouped_df.iloc[:, 0]
grouped_matrix = cid_strings.str.get_dummies(sep=',')

In [66]:
grouped_matrix.head()

Unnamed: 0_level_0,10096344,10182969,10184653,1046,104741,104758,104850,104865,10531,1054,...,9853053,9869929,9878,9880,9904,9924495,9926791,9930049,9941444,9966051
UMLS Concept ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0000727,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000729,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000731,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
C0000735,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000737,0,0,0,0,1,1,1,1,1,0,...,1,0,0,0,0,0,1,0,1,0


In [67]:
grouped_matrix.shape

(5077, 734)

## Save Binary Matrix

In [68]:
# Write the concept-by-CID binary matrix as a tab-separated file inside a zip
# archive, tagged with the current year and month (YYYY_MM).
#
# The compression now matches the '.zip' extension; previously a gzip stream was
# written under a '.zip' name, which zip tools and extension-based readers
# cannot open.
os.makedirs('Output', exist_ok=True)  # make sure the target directory exists
filename = 'Output/Sider_SE_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
grouped_matrix.to_csv(filename, sep='\t', compression='zip')

## Convert Binary Matrix to GMT and Save as GMT

In [69]:
# Convert the binary matrix into GMT-style rows: [concept ID, '', CID, CID, ...].
# Concepts with fewer than MIN_DRUGS_PER_SET drugs are left out (the original
# threshold of 7 fields was the 2 leading fields plus 5 drugs).
MIN_DRUGS_PER_SET = 5

drugset_library = []
for concept_id, row in grouped_matrix.iterrows():
    # Column labels where this concept's indicator is 1, in column order.
    members = row.index[(row == 1).values].tolist()
    if len(members) >= MIN_DRUGS_PER_SET:
        drugset_library.append([concept_id, ''] + members)

# The rows have different lengths. Recent NumPy releases refuse to build an
# array from ragged lists implicitly, so fill a 1-D object array element by
# element; each element stays a plain Python list.
dsl = np.empty(len(drugset_library), dtype=object)
for position, drugset in enumerate(drugset_library):
    dsl[position] = drugset

In [70]:
# Length of the longest row in the library (including its two leading fields).
max(len(drugset) for drugset in dsl)

645

In [71]:
# Write the drug-set library as a GMT file: one line per set, fields separated
# by tabs, with the current year and month (YYYY_MM) in the file name.
year_month = str(datetime.date.today())[0:7].replace('-', '_')
filename = 'Output/Sider_SE_DrugSetLibrary_%s.gmt' % year_month
with open(filename, 'w', encoding='utf-8') as gmt_file:
    for drugset in dsl:
        gmt_file.write('\t'.join(str(field) for field in drugset) + '\n')