## Import Libraries

In [90]:
import datetime
import os
import sys
from urllib.parse import quote

import chardet
import numpy as np
import pandas as pd
import requests
from scipy.spatial.distance import squareform, pdist, jaccard

## Load Data

#### Load DrugGeneInteract Data

In [179]:
# Read the tab-separated drug-gene interaction table.
df = pd.read_csv('Input/DrugGeneinteractions.tsv', sep='\t')

In [199]:
# Preview the first rows of the interaction table.
df.head()

Unnamed: 0,Old Targets,gene_claim_name,entrez_id,interaction_claim_source,interaction_types,drug_name,drug_claim_primary_name,drug_claim_name,drug_chembl_id,PMIDs,Targets
0,CDK7,CDK7,1022.0,CancerCommons,inhibitor,BMS-387032,SNS-032,SNS-032,CHEMBL296468,,CDK7
1,ADORA2A,19,135.0,GuideToPharmacologyInteractions,antagonist,CHEMBL72862,MRS1041,392,CHEMBL72862,,ADORA2A
2,FGFR2,FGFR2,2263.0,CKB,,,AZ6089,AZ6089,,22869148.0,FGFR2
3,APOE,APOE,348.0,NCI,,PREDNISONE,PREDNISONE,PREDNISONE,CHEMBL635,3185288.0,APOE
4,CYP2E1,CYP2E1,1571.0,NCI,,ISOFLAVONE,ISOFLAVONE,ISOFLAVONE,CHEMBL366460,15056880.0,CYP2E1


In [167]:
# (rows, columns) of the interaction table.
df.shape

(42727, 10)

In [197]:
# List the column labels of the interaction table.
df.columns

Index(['Old Targets', 'gene_claim_name', 'entrez_id',
       'interaction_claim_source', 'interaction_types', 'drug_claim_name',
       'drug_claim_primary_name', 'drug_name', 'drug_chembl_id', 'PMIDs',
       'Targets'],
      dtype='object')

In [198]:
# Group the drug identifier columns together in priority order:
# drug_name, drug_claim_primary_name, drug_claim_name, drug_chembl_id.
preferred_order = ['Old Targets', 'gene_claim_name', 'entrez_id',
                   'interaction_claim_source', 'interaction_types',
                   'drug_name', 'drug_claim_primary_name', 'drug_claim_name',
                   'drug_chembl_id', 'PMIDs', 'Targets']

# Only reorder -- never select. 'Old Targets' and 'Targets' are created by
# later cells and 'gene_name' is still needed by the gene-name cell, so a hard
# column selection here raises KeyError (or discards 'gene_name') when the
# notebook is run top to bottom. Columns not named above keep their relative
# order and are placed after the preferred ones.
present = [col for col in preferred_order if col in df.columns]
remaining = [col for col in df.columns if col not in preferred_order]
df = df[present + remaining]

#### Load TTD to Gene File

In [168]:
# Read the tab-separated TTD-target-to-gene table.
ttd_gene = pd.read_csv('Input/TTD_to_Gene_2018_06.tsv', sep='\t')

In [169]:
# Preview the first rows of the TTD-to-gene table.
ttd_gene.head()

Unnamed: 0.1,Unnamed: 0,UniProt ID,TTD Target ID,Target Name,Target Type,Gene
0,0,P11229,TTDS00002,Muscarinic acetylcholine receptor M1,Successful target,CHRM1
1,1,P08172,TTDS00003,Muscarinic acetylcholine receptor M2,Successful target,CHRM2
2,2,P20309,TTDS00004,Muscarinic acetylcholine receptor M3,Successful target,CHRM3
3,3,P08173,TTDS00005,Muscarinic acetylcholine receptor M4,Successful target,CHRM4
4,4,P08912,TTDS00006,Muscarinic acetylcholine receptor M5,Successful target,CHRM5


In [170]:
# Key the TTD table by its target ID so genes can be looked up by that ID.
ttd_gene = ttd_gene.set_index('TTD Target ID')

#### Load Targets Mapping File

In [171]:
# Two-column, tab-separated mapping: old gene symbol -> updated gene symbol.
# Column names are supplied explicitly via `names`.
target_update = pd.read_csv(
    'Input/mappingFile_2017.txt',
    sep='\t',
    names=['Old Targets', 'Updated Targets'],
    engine='python',
)

In [172]:
# Preview the first rows of the symbol mapping table.
target_update.head()

Unnamed: 0,Old Targets,Updated Targets
0,A1BG,A1BG
1,A1BG-AS1,A1BG-AS1
2,NCRNA00181,A1BG-AS1
3,A1BGAS,A1BG-AS1
4,A1BG-AS,A1BG-AS1


#### Load LINCS Small Molecules

In [173]:
# Read the LINCS small-molecule table; the file is decoded as ISO-8859-1.
lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv',encoding='ISO-8859-1')

In [174]:
# Preview the first rows of the LINCS small-molecule table.
lincs.head()

Unnamed: 0,SM_Name,SM_LINCS_ID,SM_Alternative_Name,SM_PubChem_CID,SM_SMILES_Parent,SM_SMILES_Batch,SM_InChi_Parent,SM_Molecular_Mass,MOLECULAR_FORMULA,SM_ChEBI_ID
0,Dichlobenil,LSM-19017,,3031.0,Clc1cccc(Cl)c1C#N,,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,172.01,C7H3Cl2N,943.0
1,AC1NWAJC,LSM-43967,Vulpinic acid,5701993.0,COC(=O)\C(=C\1/OC(=O)C(C1=O)c2ccccc2)\c3ccccc3,,InChI=1S/C19H14O5/c1-23-18(21)15(13-10-6-3-7-1...,322.31,C19H14O5,
2,Sinapic Acid Methyl Ether,LSM-44124,,735755.0,COc1cc(\C=C\C(=O)O)cc(OC)c1OC,,InChI=1S/C12H14O5/c1-15-9-6-8(4-5-11(13)14)7-1...,238.24,C12H14O5,
3,Ferulic acid,LSM-44126,Ferulic acid,445858.0,COc1cc(\C=C\C(=O)O)ccc1O,,InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-1...,194.18,C10H10O4,17620.0
4,Pinosylvin Methyl Ether,LSM-43902,,5281719.0,COc1cc(O)cc(\C=C\c2ccccc2)c1,,InChI=1S/C15H14O2/c1-17-15-10-13(9-14(16)11-15...,226.27,C15H14O2,8227.0


## Get/Correct Gene Names

In [181]:
# Fill in missing gene symbols from 'gene_claim_name', translating TTD target
# IDs to gene symbols where possible, and discard rows that name no gene.
has_symbol = df['gene_name'].notna()
has_claim = df['gene_claim_name'].notna()

# A row with neither a gene symbol nor a claim name cannot be given a target.
# Filtering once replaces the per-row in-place drop inside the iterrows loop.
df = df[has_symbol | has_claim].copy()

# TTD target ID -> gene symbol (as text, matching the previous str() cast).
# Keep one entry per ID: with a repeated ID, `.loc[id, 'Gene']` returns a
# Series, and its printed repr would be stored as the gene name.
ttd_symbols = ttd_gene['Gene'].astype(str)
ttd_symbols = ttd_symbols[~ttd_symbols.index.duplicated(keep='first')]

# Where the symbol is missing: use the TTD translation of the claim name when
# the claim name is a known TTD ID, otherwise use the claim name itself.
needs_symbol = df['gene_name'].isna()
claim_names = df.loc[needs_symbol, 'gene_claim_name']
df.loc[needs_symbol, 'gene_name'] = (
    claim_names.map(ttd_symbols).fillna(claim_names.astype(str))
)


## Update Target Names

In [182]:
# Key the mapping table by the old symbol so it can be joined on 'Old Targets'.
target_update = target_update.set_index('Old Targets')

In [183]:
# Relabel 'gene_name' as 'Old Targets' and make it the index, mirroring the
# index of the mapping table.
df = (
    df
    .rename(index=str, columns={'gene_name': 'Old Targets'})
    .set_index('Old Targets')
)

In [184]:
# Spot check: is the 'Old Targets' label at position 676 of df's index
# present among the 'Old Targets' keys of the mapping table?
df.index[676] in target_update.index

True

In [185]:
# Left-join the updated symbols onto the interactions via 'Old Targets';
# interactions without a mapping entry are kept (how='left').
df = df.merge(target_update, how='left', on='Old Targets')

(42665, 10)

In [189]:
# (rows, columns) of the interaction table at this point.
df.shape

(40925, 11)

In [188]:
# Turn the index back into a column, expose the updated symbol as 'Targets',
# and keep only the rows that received one.
df = (
    df
    .reset_index()
    .rename(index=str, columns={'Updated Targets': 'Targets'})
    .dropna(subset=['Targets'])
)

## Map Drug Names to PubChem CIDs

In [286]:
# Drug-name columns to try for each row, in priority order. Naming them
# explicitly keeps the lookup independent of the column order of `df`.
NAME_COLUMNS = ['drug_name', 'drug_claim_primary_name', 'drug_claim_name', 'drug_chembl_id']
PUBCHEM_NAME_URL = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/cids/JSON'
REQUEST_TIMEOUT = 30  # seconds before a stalled request is abandoned


def fetch_cid(drug_name):
    """Look up a compound name in PubChem.

    Parameters
    ----------
    drug_name : str
        Name (or ChEMBL id) to resolve.

    Returns
    -------
    (cid, definitive) : (int or None, bool)
        ``cid`` is the first CID PubChem lists for the name, or None.
        ``definitive`` is True when PubChem gave a conclusive answer (a CID, or
        an HTTP 404), and False when the request failed or the reply was
        unusable, so the same name may be worth asking about again.
    """
    # Percent-encode the whole name: besides spaces, characters such as
    # '?', '/', '#' and '%' would otherwise corrupt the request path.
    url = PUBCHEM_NAME_URL % quote(drug_name, safe='')
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        payload = response.json()  # parsed once and reused below
    except (requests.RequestException, ValueError):
        # Transport failure or a non-JSON body: skip this name for now.
        return None, False
    if isinstance(payload, dict) and 'IdentifierList' in payload:
        return int(payload['IdentifierList']['CID'][0]), True
    return None, response.status_code == 404


CIDs = []                  # one CID per retained row, in row order
CID_dict = {}              # memo: name -> CID
unresolved_names = set()   # memo: names PubChem definitively does not know
failed_to_get_CID = 0
count = 0
row_resolved = []          # one flag per row: did any of its names resolve?

name_rows = df[NAME_COLUMNS].itertuples(index=False, name=None)
for index, candidate_names in zip(df.index, name_rows):
    count += 1
    row_cid = None
    for raw_name in candidate_names:
        if pd.isna(raw_name):
            continue
        name = str(raw_name)
        # Check the memos before going to the network.
        if name in CID_dict:
            row_cid = CID_dict[name]
            break
        if name in unresolved_names:
            continue
        cid, definitive = fetch_cid(name)
        if cid is not None:
            CID_dict[name] = cid
            row_cid = cid
            break
        if definitive:
            # Remember the miss so the name is not requested again for every
            # other row that carries it.
            unresolved_names.add(name)

    if row_cid is None:
        failed_to_get_CID += 1
    else:
        CIDs.append(row_cid)
    row_resolved.append(row_cid is not None)

    if count % 100 == 0:
        print(index)

# Remove the unresolved rows in one step so `CIDs` lines up with `df` row for row.
df = df[np.array(row_resolved, dtype=bool)]

print(failed_to_get_CID)
print(len(CIDs))


138
267
393
530
653
781
899
1023
1165
1287
1421
1549
1686
1802
1932
2055
2183
2311
2437
2569
2696
2819
2947
3062
3183
3306
3429
3557
3682
3813
3949
4073
4196
4321
4450
4582
4708
4832
4957
5083
5210
5345
5464
5584
5712
5840
5968
6097
6208
6342
6468
6586
6721
6842
6963
7089
7217
7345
7467
7603
7728
7846
7969
8104
8228
8356
8478
8614
8742
8871
8985
9105
9232
9368
9504
9643
9768
9888
10023
10149
10286
10416
10540
10666
10795
10918
11046
11175
11307
11426
11548
11675
11807
11939
12054
12176
12303
12431
12555
12684
12809
12941
13077
13204
13331
13449
13575
13698
13829
13944
14069
14200
14337
14463
14597
14722
14857
14979
15100
15227
15358
15487
15616
15736
15870
15995
16099
16204
16308
16414
16517
16619
16724
16829
16934
17038
17146
17251
17358
17460
17563
17670
17771
17876
17982
18086
18189
18293
18402
18508
18612
18718
18819
18925
19030
19134
19240
19348
19452
19554
19656
19762
19870
19975
20076
20179
20279
20384
20492
20597
20703
20804
20906
21014
21114
21215
21317
21422
21524
21627
21732

In [275]:
# Debugging aid: show the row label currently held by the loop variable `index`.
index

'125'

In [288]:
# (rows, columns) after rows without a PubChem CID were dropped.
df.shape

(31413, 11)

In [289]:
# Attach the collected CIDs as a new column, aligned on df's own index.
cid_column = pd.Series(np.array(CIDs), index=df.index)
df = df.assign(CIDs=cid_column)

## Filter by LINCS approved Small Molecules

In [294]:
# Keep only interactions whose compound appears in the LINCS small-molecule
# table. A single vectorised membership test replaces the per-row scan of the
# LINCS array and the per-row in-place drop.
lincs_cids = lincs['SM_PubChem_CID'].dropna()
in_lincs = df['CIDs'].isin(lincs_cids)

# Number of rows removed by the filter.
count = int((~in_lincs).sum())
df = df[in_lincs]

print(count)

15484


In [295]:
# (rows, columns) after the LINCS filter.
df.shape

(15929, 12)

## Make Binary Matrix

In [296]:
# One row per target: its CIDs joined into a single comma-separated string.
cids_by_target = df.groupby('Targets')['CIDs']
grouped_df = cids_by_target.agg(lambda cids: ','.join(map(str, cids))).reset_index()

In [297]:
# Index the grouped table by target, in sorted order.
grouped_df = grouped_df.set_index('Targets').sort_index()

# Number of distinct values in df's 'Targets' column.
df['Targets'].nunique(dropna=False)

1901

In [298]:
# Expand each comma-separated CID string into 0/1 indicator columns,
# one column per distinct CID.
cid_strings = grouped_df.iloc[:, 0]
grouped_matrix = cid_strings.str.get_dummies(sep=',')

In [299]:
# Preview the first rows of the target-by-CID indicator matrix.
grouped_matrix.head()

Unnamed: 0_level_0,10000456,10052040,10071196,10074640,10077147,10090485,10096344,10113978,10117987,10127622,...,9934458,9935681,9939609,9941444,9949641,9950038,9952884,9956119,9957280,9966051
Targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AANAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCA1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCB1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [300]:
# (targets, distinct CIDs) of the indicator matrix.
grouped_matrix.shape

(1901, 1615)

## Save Binary Matrix

In [301]:
# Stamp the output with the current year and month (YYYY_MM).
filename = 'Output/DrugGeneInteract_Targets_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')

# Write a real ZIP archive so the contents match the '.zip' suffix. The file
# was previously written as a gzip stream under a '.zip' name, which ZIP
# tools and extension-based compression detection cannot open.
grouped_matrix.to_csv(filename, sep='\t', compression='zip')

## Convert Binary Matrix to GMT and Save as GMT

In [302]:
# Build one GMT record per target: [target, '' (empty description), cid, cid, ...].
# A record is kept only when it has at least 7 fields, i.e. the two leading
# fields plus at least five compounds.
MIN_RECORD_FIELDS = 7

drugset_library = []
for target, membership in grouped_matrix.iterrows():
    # Column labels (CIDs) flagged with 1 for this target, in column order.
    members = membership[membership == 1].index.tolist()
    drugset = [target, ''] + members
    if len(drugset) >= MIN_RECORD_FIELDS:
        drugset_library.append(drugset)

# Records differ in length. Passing such ragged lists straight to np.array()
# raises ValueError on NumPy >= 1.24, so fill a 1-D object array explicitly:
# each element is one record (a plain list).
dsl = np.empty(len(drugset_library), dtype=object)
for position, drugset in enumerate(drugset_library):
    dsl[position] = drugset

In [303]:
# Length of the longest record in dsl (this counts the target name and the
# empty description field as well as the compounds).
len(max(dsl, key=len))

97

In [304]:
# Stamp the output with the current year and month (YYYY_MM).
# The library is named after this notebook's data source (DrugGeneInteract),
# matching the binary-matrix file name; it previously carried a
# 'DrugCentral' prefix.
filename = 'Output/DrugGeneInteract_Targets_DrugSetLibrary_%s.gmt' % datetime.date.today().strftime('%Y_%m')

# GMT format: one record per line, fields separated by tabs.
with open(filename, 'w', encoding='utf-8') as f:
    for row in dsl:
        f.write('\t'.join(map(str, row)) + '\n')

In [305]:
# drug_claim_name = df['drug_claim_name']
# drug_claim_pri_name = df['drug_claim_primary_name']
# drug_name = df['drug_name']
# drug_chembl_id = df['drug_chembl_id']
# drug_name = drug_name.append(drug_claim_name)
# drug_name = drug_name.append(drug_claim_pri_name)
# drug_name = drug_name.append(drug_chembl_id)
# namesdf = drug_name
# namesdf = namesdf.drop_duplicates()
# namesdf.dropna()
# namesdf.shape

# nameslist = namesdf.tolist()
# failed_to_get_CID = 0
# CID_dict = {}

# for name in nameslist:
#     name = str(name)
#     name = name.replace(' ','%20')
#     url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + name + '/cids/JSON'
#     response = requests.get(url)
#     if 'IdentifierList' in response.json().keys():
#         CID = response.json()['IdentifierList']['CID'][0]
#         name = name.replace('%20', ' ')
#         CID_dict[name] = CID
#     else:
#         failed_to_get_CID += 1
        
#     if len(CID_dict) % 100 == 0:
#         name = name.replace('%20',' ')
#         print(nameslist.index(name))

# print(failed_to_get_CID)
# print(len(CID_dict))
# print(len(nameslist))

10007
