## Import Libraries

In [60]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

#### Load ChEMBL Data

In [136]:
df_raw = pd.read_csv('Input/chembl_24.0_moa.csv')

In [137]:
df_raw.head(10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,0
0,chembl_moa:CHEMBL_MEC_1664 a cco:Mechanism ;,,,,,,,,,
1,"\trdfs:label ""CHEMBL_MEC_1664"" ;",,,,,,,,,
2,"\tcco:chemblId ""CHEMBL_MEC_1664"" ;",,,,,,,,,
3,\tcco:hasMolecule chembl_molecule:CHEMBL22 .,,,,,,,,,
4,chembl_molecule:CHEMBL22 cco:hasMechanism chem...,,,,,,,,,
5,chembl_moa:CHEMBL_MEC_1664 cco:hasTarget chemb...,,,,,,,,,
6,chembl_target:CHEMBL2364669 cco:isTargetForMec...,,,,,,,,,
7,chembl_moa:CHEMBL_MEC_1664 cco:mechanismDescri...,,,,,,,,,
8,"\tcco:mechanismActionType ""INHIBITOR"" .",,,,,,,,,
9,chembl_moa:CHEMBL_MEC_340 a cco:Mechanism ;,,,,,,,,,


In [138]:
df_raw.shape

(44595, 10)

#### Load LINCS Approved Small Molecules

In [139]:
lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv', encoding = 'ISO-8859-1')

## Make Data useable

In [140]:
type(df_raw['1'][102])

str

In [141]:
df_edit1 = df_raw[df_raw['1'].str.contains("\t")]

In [142]:
df_edit1.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,0
1,"\trdfs:label ""CHEMBL_MEC_1664"" ;",,,,,,,,,
2,"\tcco:chemblId ""CHEMBL_MEC_1664"" ;",,,,,,,,,
3,\tcco:hasMolecule chembl_molecule:CHEMBL22 .,,,,,,,,,
8,"\tcco:mechanismActionType ""INHIBITOR"" .",,,,,,,,,
10,"\trdfs:label ""CHEMBL_MEC_340"" ;",,,,,,,,,


In [143]:
df_label = df_edit1[df_edit1['1'].str.contains('dfs:label')]['1']
df_label.shape

(4992,)

In [144]:
df_label = df_label.apply(lambda L:L.strip('\trdfs:label "CHEMBL_MEC_'))

In [145]:
df_label = df_label.apply(lambda L:L.strip(';'))

In [146]:
df_label = df_label.apply(lambda L:L.replace('"',''))
df_label = df_label.apply(lambda L:L.replace(' ',''))

In [147]:
df_label.head()

1     1664
10     340
19    1905
28    4555
37     902
Name: 1, dtype: object

In [148]:
df_chemblid = df_edit1[df_edit1['1'].str.contains('cco:chemblId')]['1']
df_chemblid.shape

(4992,)

In [149]:
df_chemblid = df_chemblid.apply(lambda L:L.strip('\tcco:chemblId "CHEMBL_MEC_'))
df_chemblid = df_chemblid.apply(lambda L:L.strip('.'))
df_chemblid = df_chemblid.apply(lambda L:L.replace('"',''))

In [150]:
df_chemblid.head()

2     1664 ;
11     340 ;
20    1905 ;
29    4555 ;
38     902 ;
Name: 1, dtype: object

In [151]:
df_molecule = df_edit1[df_edit1['1'].str.contains('cco:hasMolecule chembl_molecule')]['1']
df_molecule.shape

(4992,)

In [152]:
df_molecule = df_molecule.apply(lambda L:L.strip('\tcco:hasMolecule chembl_molecule:'))
df_molecule = df_molecule.apply(lambda L:L.strip(' .'))
df_molecule = df_molecule.apply(lambda L:L.replace('"',''))

In [153]:
df_molecule.head()

3       CHEMBL22
12      CHEMBL35
21       CHEMBL5
30    CHEMBL6318
39     CHEMBL405
Name: 1, dtype: object

In [154]:
df_mechtype = df_edit1[df_edit1['1'].str.contains('cco:mechanismActionType')]
df_mechtype.shape

(4769, 10)

In [155]:
df = pd.DataFrame({'Label':df_label})

In [156]:
df.loc[:,'ChEMBL ID'] = pd.Series(np.array(df_chemblid), index=df.index)

In [157]:
df.loc[:,'Molecule ID'] = pd.Series(np.array(df_molecule), index=df.index)

In [158]:
df.head()

Unnamed: 0,Label,ChEMBL ID,Molecule ID
1,1664,1664 ;,CHEMBL22
10,340,340 ;,CHEMBL35
19,1905,1905 ;,CHEMBL5
28,4555,4555 ;,CHEMBL6318
37,902,902 ;,CHEMBL405


In [159]:
df.shape

(4992, 3)

## Get MOA from Label

In [160]:
# len(df['Molecule ID'].unique())

In [161]:
# df_moa = df_raw[df_raw['1'].str.contains("cco:mechanismDescription")]['1']

In [162]:
# df_moa.head()

In [163]:
# df_moa = df_moa.apply(lambda L:L.strip('chembl_moa:CHEMBL_MEC_1658 cco:mechanismDescription "'))
# df_moa = df_moa.apply(lambda L:L.strip(';'))
# df_moa = df_moa.apply(lambda L:L.replace('"',''))

In [164]:
label_df = df['Label']
label_df = label_df.drop_duplicates()
label_df.shape

(4992,)

In [31]:
label_list = label_df.tolist()
failed_to_get_name = 0
moa_dict = {}
# count = 0

for label in label_list:
#     count += 1
    url = 'https://www.ebi.ac.uk/chembl/api/data/mechanism/' + label + '.json'
    response = requests.get(url)
    try:
        response.json()
    except ValueError:
        continue
    if "mechanism_of_action" in response.json().keys():
        moa = response.json()["mechanism_of_action"]
        moa_dict[label] = moa
    else:
        failed_to_get_name +=1
    
    if len(moa_dict) % 100 == 0:
        print(label_list.index(label))
#     if count > 10:
#         break
print(failed_to_get_name)
print(len(moa_dict))
print(len(label_list))

99
199
299
399
499
599
699
799
899
999
1099
1199
1299
1399
1499
1599
1699
1799
1899
1999
2099
2199
2299
2399
2499
2599
2699
2799
2899
2999
3099
3199
3299
3399
3499
3599
3699
3799
3899
3999
4099
4199
4299
4399
4499
4599
4699
4799
4899
0
4992
4992


In [199]:
# url

'https://www.ebi.ac.uk/chembl/api/data/mechanism/1987.json'

In [200]:
# moa_dict

{'1664': 'Bacterial dihydrofolate reductase inhibitor',
 '340': 'Sodium-(potassium)-chloride cotransporter 2 inhibitor',
 '1905': 'Bacterial DNA gyrase inhibitor',
 '4555': 'Estrogen receptor modulator',
 '902': 'Norepinephrine transporter releasing agent',
 '687': 'Dopamine transporter releasing agent',
 '1588': 'GABA-A receptor; anion channel allosteric antagonist',
 '1222': 'Soluble guanylate cyclase activator',
 '41': 'Muscarinic acetylcholine receptor M3 agonist',
 '1320': 'Androgen Receptor antagonist',
 '1987': 'Sodium channel alpha subunit blocker'}

In [150]:
# label

'606 '

In [165]:
MOAs = []
for index, row in df.iterrows():
    label = row.loc['Label']
    if label not in moa_dict:
        df.drop(index, inplace = True)
    else: 
        MOAs.append(moa_dict[label])

df.loc[:,'MOAs'] = pd.Series(np.array(MOAs), index=df.index)

In [166]:
df.head(15)

Unnamed: 0,Label,ChEMBL ID,Molecule ID,MOAs
1,1664,1664 ;,CHEMBL22,Bacterial dihydrofolate reductase inhibitor
10,340,340 ;,CHEMBL35,Sodium-(potassium)-chloride cotransporter 2 in...
19,1905,1905 ;,CHEMBL5,Bacterial DNA gyrase inhibitor
28,4555,4555 ;,CHEMBL6318,Estrogen receptor modulator
37,902,902 ;,CHEMBL405,Norepinephrine transporter releasing agent
46,687,687 ;,CHEMBL405,Dopamine transporter releasing agent
55,1588,1588 ;,CHEMBL407,GABA-A receptor; anion channel allosteric anta...
66,1222,1222 ;,CHEMBL6622,Soluble guanylate cyclase activator
77,41,41 ;,CHEMBL14,Muscarinic acetylcholine receptor M3 agonist
86,1320,1320 ;,CHEMBL409,Androgen Receptor antagonist


In [130]:
df.shape

(4992, 4)

## Get Molecule Name and from ChEMBL Molecule ID

In [34]:
# moleID_df = df['Molecule ID']
# moleID_df = moleID_df.drop_duplicates()
# moleID_df.shape

(3907,)

In [42]:
# moleID_list = moleID_df.tolist()
# failed_to_get_name = 0
# drug_name_dict = {}
# # count = 0

# for moleID in moleID_list:
# #     count += 1
#     url = 'https://www.ebi.ac.uk/chembl/api/data/molecule/' + moleID + '.json'
#     response = requests.get(url)
#     try:
#         response.json()
#     except ValueError:
#         continue
#     if "molecule_synonyms" in response.json().keys():
#         drug_name = response.json()["molecule_synonyms"][0]['molecule_synonym']
#         drug_name_dict[moleID] = drug_name
#     else:
#         failed_to_get_name +=1
    
#     if len(drug_name_dict) % 100 == 0:
#         print(moleID_list.index(moleID))
# #     if count > 10:
# #         break
# print(failed_to_get_name)
# print(len(drug_name_dict))
# print(len(moleID_list))

99
199
299
399
499
599
699
799
899
999
1099
1199
1299
1399
1499
1599
1699
1799
1899
1999
2099
2199
2299
2399
2499
2599
2699
2799
2899
2999
3099
3199
3299
3399
3499
3599
3699
3799
3899
0
3907
3907


In [40]:
# len(drug_name_dict)

1138

In [43]:
# drug_names = []
# for index, row in df.iterrows():
#     mole_id = row.loc['Molecule ID']
#     if mole_id not in drug_name_dict:
#         df.drop(index, inplace = True)
#     else: 
#         drug_names.append(drug_name_dict[mole_id])

# df.loc[:,'Drug_Name'] = pd.Series(np.array(drug_names), index=df.index)

## Get Pubchem ID from DrugName

In [168]:
namesdf = df['Molecule ID']
namesdf = namesdf.drop_duplicates()
namesdf.head(10)

1       CHEMBL22
10      CHEMBL35
19       CHEMBL5
28    CHEMBL6318
37     CHEMBL405
55     CHEMBL407
66    CHEMBL6622
77      CHEMBL14
86     CHEMBL409
95      CHEMBL16
Name: Molecule ID, dtype: object

In [171]:
nameslist = namesdf.tolist()
failed_to_get_CID = 0
CID_dict = {}

for name in nameslist:
    name = name.replace(' ','%20')
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + name + '/cids/JSON'
    response = requests.get(url)
    try:
        response.json()
    except ValueError:
        continue
    if 'IdentifierList' in response.json().keys():
        CID = response.json()['IdentifierList']['CID'][0]
        name = name.replace('%20', ' ')
        CID_dict[name] = CID
    else:
        failed_to_get_CID += 1
        
    if len(CID_dict) % 100 == 0:
        name = name.replace('%20',' ')
        print(nameslist.index(name))
print(failed_to_get_CID)
print(len(CID_dict))
print(len(nameslist))

113
232
349
460
561
668
787
788
789
790
948
1122
1247
1358
1479
1681
1807
1808
2097
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2544
2683
2847
3000
3176
3550
3551
3552
3553
3554
3555
3734
3891
3892
1493
2414
3907


In [170]:
CID_dict

{'CHEMBL22': 5578}

In [172]:
CIDs = []
for index, row in df.iterrows():
    drugname = row.loc['Molecule ID']
    if drugname not in CID_dict:
        df.drop(index, inplace = True)
    else: 
        CIDs.append(int(CID_dict[drugname]))

df.loc[:,'CIDs'] = pd.Series(np.array(CIDs), index=df.index)

In [173]:
df.shape

(3201, 5)

## Filter by LINCS Approved Small Molecules

In [174]:
count = 0
for index, row in df.iterrows():
    CID = row.loc['CIDs']
    if CID not in lincs['SM_PubChem_CID'].values:
        df.drop(index, inplace = True)
        count += 1
        
print(count)

2169


In [175]:
df.shape

(1032, 5)

## Make the Binary Matrix

In [176]:
grouped_df = df.groupby(['MOAs'])['CIDs'].apply(lambda x: ','.join(x.astype(str))).reset_index()
grouped_df.set_index('MOAs', inplace=True)
grouped_df = grouped_df.sort_index()
len(df['MOAs'].unique())

409

In [177]:
grouped_matrix = grouped_df.iloc[:,0].str.get_dummies(sep=',')

In [178]:
grouped_matrix.head()

Unnamed: 0_level_0,10026128,10029385,10052040,10074640,10090485,10096344,10117987,10127622,10133,10152654,...,9911830,9915743,9924495,9931954,9933475,9935681,9939609,9949641,9952773,9952884
MOAs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11-beta-hydroxysteroid dehydrogenase inhibitor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26S proteosome inhibitor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"3',5'-cyclic phosphodiesterase inhibitor",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
70S ribosome inhibitor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ALK tyrosine kinase receptor inhibitor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [179]:
grouped_matrix.shape

(409, 747)

## Save Binary Matrix

In [180]:
filename = 'Output/ChEMBL_MOA_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
grouped_matrix.to_csv(filename, sep='\t', compression='gzip')

## Convert Binary Matrix to GMT and Save as GMT

In [181]:
drugset_library = []
for index, row in grouped_matrix.iterrows():
    drugset = []
    drugset.append(index)
    drugset.append('')
    for i in range(grouped_matrix.shape[1]):
        if row.iloc[i]==1:
            drugset.append(row.index[i])
    if len(drugset) >= 7:
        drugset_library.append(drugset)
dsl = np.array(drugset_library)

In [182]:
len(max(dsl, key=len))

48

In [183]:
filename = 'Output/ChEMBL_MOA_DrugSetLibrary_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')
with open(filename,'w',encoding='utf-8') as f:
    for row in dsl:
        np.savetxt(f, [row], fmt = '%s',delimiter = '\t')