## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

#### Load DrugCentral Data

In [6]:
df = pd.read_table('Input/drugcentral_targets.tsv')

In [7]:
df.head()

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,ACT_COMMENT,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM
0,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,,WOMBAT-PK,,1.0,CHEMBL,,https://www.ebi.ac.uk/chembl/compound/inspect/...,BLOCKER,Tclin,Homo sapiens
1,levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CYP2D6,CP2D6_HUMAN,6.706859,,IC50,"DRUGMATRIX: CYP450, 2D6 enzyme inhibition (sub...",DRUG MATRIX,=,,,,,,Tclin,Homo sapiens
2,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.89,,IC50,Inhibition of wild-type human ERG channel expr...,CHEMBL,=,,,https://www.ebi.ac.uk/chembl/compound/inspect/...,,,Tclin,Homo sapiens
3,levobupivacaine,4,Potassium voltage-gated channel subfamily D me...,Ion channel,Q9UK17,KCND3,KCND3_HUMAN,4.5,,IC50,,WOMBAT-PK,=,,,,,,Tclin,Homo sapiens
4,levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PTGER1,PE2R1_HUMAN,,,,,WOMBAT-PK,,,,,,,Tclin,Homo sapiens


In [10]:
df.shape

(16289, 20)

In [9]:
df = df.dropna(subset=['GENE']) 

In [34]:
for index, row in df.iterrows():
    
    print(type(row.values))
    break

<class 'numpy.ndarray'>


#### Load Targets Mapping File

In [11]:
target_update = pd.read_table('Input/mappingFile_2017.txt', names = ['Old Targets','Updated Targets'], engine='python')

In [12]:
target_update.head()

Unnamed: 0,Old Targets,Updated Targets
0,A1BG,A1BG
1,A1BG-AS1,A1BG-AS1
2,NCRNA00181,A1BG-AS1
3,A1BGAS,A1BG-AS1
4,A1BG-AS,A1BG-AS1


#### Load LINCS Small Molecules

In [13]:
lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv',encoding='ISO-8859-1')

In [15]:
lincs.head()

Unnamed: 0,SM_Name,SM_LINCS_ID,SM_Alternative_Name,SM_PubChem_CID,SM_SMILES_Parent,SM_SMILES_Batch,SM_InChi_Parent,SM_Molecular_Mass,MOLECULAR_FORMULA,SM_ChEBI_ID
0,Dichlobenil,LSM-19017,,3031.0,Clc1cccc(Cl)c1C#N,,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,172.01,C7H3Cl2N,943.0
1,AC1NWAJC,LSM-43967,Vulpinic acid,5701993.0,COC(=O)\C(=C\1/OC(=O)C(C1=O)c2ccccc2)\c3ccccc3,,InChI=1S/C19H14O5/c1-23-18(21)15(13-10-6-3-7-1...,322.31,C19H14O5,
2,Sinapic Acid Methyl Ether,LSM-44124,,735755.0,COc1cc(\C=C\C(=O)O)cc(OC)c1OC,,InChI=1S/C12H14O5/c1-15-9-6-8(4-5-11(13)14)7-1...,238.24,C12H14O5,
3,Ferulic acid,LSM-44126,Ferulic acid,445858.0,COc1cc(\C=C\C(=O)O)ccc1O,,InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-1...,194.18,C10H10O4,17620.0
4,Pinosylvin Methyl Ether,LSM-43902,,5281719.0,COc1cc(O)cc(\C=C\c2ccccc2)c1,,InChI=1S/C15H14O2/c1-17-15-10-13(9-14(16)11-15...,226.27,C15H14O2,8227.0


## Fix DrugCentral GENE column

In [76]:
how_many = 0
appended_df = []
gene_index = np.where(df.columns.values=='GENE')[0][0]

for index, row in df.iterrows():
    gene_group = row.loc['GENE']
    if '|' in gene_group:
        gene_split = gene_group.split('|')
        for i in gene_split:
            row_as_list = row.values.tolist()
            row_as_list[gene_index] = i
            appended_df.append(row_as_list)
    df.drop(index, inplace = True)

print(appended_df)

[['(S)-nicardipine', 5, 'Voltage-gated L-type calcium channel', 'Ion channel', 'Q01668|Q13936', 'CACNA1C', 'CAC1C_HUMAN|CAC1D_HUMAN', nan, nan, nan, 'Mechanism of Action', 'DRUG LABEL', nan, 1.0, 'DRUG LABEL', 'http://www.accessdata.fda.gov/drugsatfda_docs/label/2009/022276s003lbl.pdf', 'http://www.accessdata.fda.gov/drugsatfda_docs/label/2009/022276s003lbl.pdf', 'BLOCKER', 'Tclin|Tclin', 'Homo sapiens'], ['(S)-nicardipine', 5, 'Voltage-gated L-type calcium channel', 'Ion channel', 'Q01668|Q13936', 'CACNA1D', 'CAC1C_HUMAN|CAC1D_HUMAN', nan, nan, nan, 'Mechanism of Action', 'DRUG LABEL', nan, 1.0, 'DRUG LABEL', 'http://www.accessdata.fda.gov/drugsatfda_docs/label/2009/022276s003lbl.pdf', 'http://www.accessdata.fda.gov/drugsatfda_docs/label/2009/022276s003lbl.pdf', 'BLOCKER', 'Tclin|Tclin', 'Homo sapiens'], ['acamprosate', 38, 'Glutamate [NMDA] receptor', 'Ion channel', 'O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5', 'GRIN1', 'NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMAN|NMDE3_HUM

In [77]:
columnnames = list(df.columns.values)
fix_gene_df = pd.DataFrame(appended_df,columns = columnnames)

In [78]:
fix_gene_df.head()

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,ACT_COMMENT,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM
0,(S)-nicardipine,5,Voltage-gated L-type calcium channel,Ion channel,Q01668|Q13936,CACNA1C,CAC1C_HUMAN|CAC1D_HUMAN,,,,Mechanism of Action,DRUG LABEL,,1.0,DRUG LABEL,http://www.accessdata.fda.gov/drugsatfda_docs/...,http://www.accessdata.fda.gov/drugsatfda_docs/...,BLOCKER,Tclin|Tclin,Homo sapiens
1,(S)-nicardipine,5,Voltage-gated L-type calcium channel,Ion channel,Q01668|Q13936,CACNA1D,CAC1C_HUMAN|CAC1D_HUMAN,,,,Mechanism of Action,DRUG LABEL,,1.0,DRUG LABEL,http://www.accessdata.fda.gov/drugsatfda_docs/...,http://www.accessdata.fda.gov/drugsatfda_docs/...,BLOCKER,Tclin|Tclin,Homo sapiens
2,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN1,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens
3,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN2A,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens
4,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN2B,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens


In [79]:
df.append(fix_gene_df)

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,ACT_COMMENT,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM
0,(S)-nicardipine,5,Voltage-gated L-type calcium channel,Ion channel,Q01668|Q13936,CACNA1C,CAC1C_HUMAN|CAC1D_HUMAN,,,,Mechanism of Action,DRUG LABEL,,1.0,DRUG LABEL,http://www.accessdata.fda.gov/drugsatfda_docs/...,http://www.accessdata.fda.gov/drugsatfda_docs/...,BLOCKER,Tclin|Tclin,Homo sapiens
1,(S)-nicardipine,5,Voltage-gated L-type calcium channel,Ion channel,Q01668|Q13936,CACNA1D,CAC1C_HUMAN|CAC1D_HUMAN,,,,Mechanism of Action,DRUG LABEL,,1.0,DRUG LABEL,http://www.accessdata.fda.gov/drugsatfda_docs/...,http://www.accessdata.fda.gov/drugsatfda_docs/...,BLOCKER,Tclin|Tclin,Homo sapiens
2,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN1,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens
3,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN2A,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens
4,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN2B,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens
5,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN2C,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens
6,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN2D,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens
7,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN3A,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens
8,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN3B,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens
9,acamprosate,38,GABA-A receptor alpha-1/beta-3/gamma-2,Ion channel,P14867|P18507|P28472,GABRA1,GBRA1_HUMAN|GBRB3_HUMAN|GBRG2_HUMAN,,,,,WOMBAT-PK,,1.0,CHEMBL,,https://www.ebi.ac.uk/chembl/compound/inspect/...,POSITIVE MODULATOR,Tclin|Tclin|Tclin,Homo sapiens
