# Approved_Drugs_MACCS_KEYS

Author: Moshe Silverstein  
Date: 05-2018

Data Source Home:
* Drugbank: https://www.drugbank.ca/
* Drug Repurposing Hub: https://clue.io/repurposing
* DrugCentral: http://drugcentral.org/

Data Source Download:
* Drugbank: https://www.drugbank.ca/releases/latest
* Drug Repurposing Hub: https://clue.io/repurposing
* DrugCentral: http://drugcentral.org/download

# Import Libraries

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [None]:
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs
import requests

In [None]:
from clustergrammer_widget import *
net = Network(clustergrammer_widget)

In [None]:
importlib.reload(uf)

# Versions Of Modules In Use

In [5]:
%load_ext version_information
%version_information numpy, pandas, clustergrammer_widget, rdkit

Software,Version
Python,3.5.2 64bit [GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]
IPython,5.3.0
OS,Darwin 17.5.0 x86_64 i386 64bit
numpy,1.13.1
pandas,0.21.0
clustergrammer_widget,1.9.0
rdkit,2017.09.3
Wed May 02 11:34:13 2018 EDT,Wed May 02 11:34:13 2018 EDT


# Path to Output Files

In [339]:
path = '/Users/moshesilverstein/Documents/Drug Harmonizome/Output/'

# Load Data

### Drugbank

In [7]:
data = Chem.SDMolSupplier('Input/structures.sdf')

### Get List of Molecules From Data

In [8]:
Molecules = [x for x in data]

In [9]:
print(len(Molecules))

2327


### Get Molecule PubChem ID and MACCS Keys

In [10]:
drugs = []
fingerprints = []
namesList = []
synonyms = []
FailedToGetSMILE = 0
FailedToGetID = 0
MoleIsNone = 0

for i,mol in enumerate(Molecules):
    
    progressPercent = ((i+1)/len(Molecules))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(Molecules)))
    sys.stdout.flush()
    
    if mol is None:
        MoleIsNone += 1
        continue
#     print(mol.GetProp('SMILES'))
#     print(Chem.MolFromSmiles(mol.GetProp('SMILES')))
#     print(list(MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(mol.GetProp('SMILES'))).GetOnBits()))
    if Chem.MolFromSmiles(mol.GetProp('SMILES')):
        
        maccs = list(MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(mol.GetProp('SMILES'))).GetOnBits())
        maccs = ['MACCS_'+str(x) for x in maccs]
        
        if 'SYNONYMS' in mol.GetPropsAsDict().keys():
            name = mol.GetProp('GENERIC_NAME')
            names = mol.GetProp('SYNONYMS').split(';')
            names = [x.replace(' ', '') for x in names]
            names.insert(0, name)
            for name in names:
                name = name.replace(' ', '%20')
                url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/'+name+'/cids/JSON'
                response = requests.post(url)
                if 'PC_Compounds' in response.json().keys():
                    ID = response.json()['PC_Compounds'][0]['id']['id']['cid']
                    IDs = [ID]*len(maccs)
                    drugs.append(IDs)
                    fingerprints.append(maccs)
                    namesList.append([names[0]]*len(mccs))
                    synonyms.append([(':').join(names[1:])]*len(maccs))
                elif 'IdentifierList' in response.json().keys():
                    ID = response.json()['IdentifierList']['CID'][0]
                    IDs = [ID]*len(maccs)
                    drugs.append(IDs)
                    fingerprints.append(maccs)
                    namesList.append([names[0]]*len(maccs))
                    synonyms.append([(':').join(names[1:])]*len(maccs))
                elif name == names[-1]:
                    FailedToGetID += 1
            
        else:
            name = mol.GetProp('GENERIC_NAME')
            name = name.replace(' ', '%20')
            url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/'+name+'/cids/JSON'
            response = requests.post(url)
            if 'PC_Compounds' in response.json().keys():
                ID = response.json()['PC_Compounds'][0]['id']['id']['cid']
                IDs = [ID]*len(maccs)
                drugs.append(IDs)
                fingerprints.append(maccs)
                namesList.append([names[0]]*len(mccs))
                synonyms.append([(':').join(names[1:])]*len(maccs))
            elif 'IdentifierList' in response.json().keys():
                ID = response.json()['IdentifierList']['CID'][0]
                IDs = [ID]*len(maccs)
                drugs.append(IDs)
                fingerprints.append(maccs)
                namesList.append([names[0]]*len(maccs))
                synonyms.append([(':').join(names[1:])]*len(maccs))
            elif name == names[-1]:
                FailedToGetID += 1
    else:
        FailedToGetSMILE += 1

drugs = [item for sublist in drugs for item in sublist]
fingerprints = [item for sublist in fingerprints for item in sublist]
namesList = [item for sublist in namesList for item in sublist]
synonyms = [item for sublist in synonyms for item in sublist]
df = pd.DataFrame(columns=['Drug ID', 'Drug Name', 'Drug Synonyms', 'Fingerprint'])
df['Drug ID'] = drugs
df['Drug Name'] = namesList
df['Drug Synonyms'] = synonyms
df['Fingerprint'] = fingerprints


print('Number of drugs with data not found: %r' % MoleIsNone)
print('Number of drugs with SMILE data not found: %r' % FailedToGetSMILE)
print('Number of drugs with names that not matched to ids: %r' % FailedToGetID)

Number of drugs with data not found: 2
Number of drugs with SMILE data not found: 3
Number of drugs with names that not matched to ids: 552


In [11]:
df = df.drop_duplicates(keep='first')

In [12]:
df.head()

Unnamed: 0,Drug ID,Drug Name,Drug Synonyms,Fingerprint
0,16129704,Bivalirudin,Bivalirudina:Bivalirudinum:Hirulog,MACCS_25
1,16129704,Bivalirudin,Bivalirudina:Bivalirudinum:Hirulog,MACCS_43
2,16129704,Bivalirudin,Bivalirudina:Bivalirudinum:Hirulog,MACCS_53
3,16129704,Bivalirudin,Bivalirudina:Bivalirudinum:Hirulog,MACCS_54
4,16129704,Bivalirudin,Bivalirudina:Bivalirudinum:Hirulog,MACCS_74


In [13]:
df.shape

(104635, 4)

In [14]:
drugbank_df = df.copy()

In [55]:
lst = []

for index in drugbank_df.index:
    lst.append(drugbank_df.loc[index, 'Drug Name'].upper())
    
drugbank_df['Drug Name'] = lst

In [57]:
lst = []

for index in drugbank_df.index:
    lst.append(drugbank_df.loc[index, 'Drug Synonyms'].upper())
    
drugbank_df['Drug Synonyms'] = lst

In [58]:
drugbank_df.head()

Unnamed: 0,Drug ID,Drug Name,Drug Synonyms,Fingerprint
0,16129704,BIVALIRUDIN,BIVALIRUDINA:BIVALIRUDINUM:HIRULOG,MACCS_25
1,16129704,BIVALIRUDIN,BIVALIRUDINA:BIVALIRUDINUM:HIRULOG,MACCS_43
2,16129704,BIVALIRUDIN,BIVALIRUDINA:BIVALIRUDINUM:HIRULOG,MACCS_53
3,16129704,BIVALIRUDIN,BIVALIRUDINA:BIVALIRUDINUM:HIRULOG,MACCS_54
4,16129704,BIVALIRUDIN,BIVALIRUDINA:BIVALIRUDINUM:HIRULOG,MACCS_74


In [15]:
print(len(drugbank_df['Drug Name'].unique()))

1673


### DrugCentral

In [207]:
data = pd.read_csv('Input/dc_structures.csv')

In [208]:
data.head()

Unnamed: 0,cd_id,cd_formula,cd_molweight,id,clogp,alogs,cas_reg_no,tpsa,rng_aliph,rng_arom,...,zwitterion_8,molimg,o_n,oh_nh,inchi,nostereo_inchi,smiles,rgb,fda_labels,inchikey
0,5022,C11H14N4O4,266.2533,5229,-2.467,-1.78,209799-67-7,129.97,1.0,2.0,...,0.0,\x89504e470d0a1a0a0000000d49484452000003200000...,8.0,6.0,InChI=1S/C11H14N4O4/c16-2-5-9(17)10(18)7(15-5)...,InChI=1S/C11H14N4O4/c16-2-5-9(17)10(18)7(15-5)...,OC[C@H]1N[C@H]([C@H](O)[C@@H]1O)C1=CNC2=C1NC=N...,15.0,,IWKXDMQDITUYRK-KUBHLMPHSA-N
1,5025,C35H30N4O4,570.6371,5231,5.491,-4.56,120685-11-2,77.73,3.0,6.0,...,0.0,\x89504e470d0a1a0a0000000d49484452000003200000...,8.0,1.0,InChI=1S/C35H30N4O4/c1-35-32(42-3)25(37(2)34(4...,InChI=1S/C35H30N4O4/c1-35-32(42-3)25(37(2)34(4...,CO[C@@H]1[C@@H](C[C@H]2O[C@]1(C)N1C3=CC=CC=C3C...,45.0,1.0,BMGQWWVMWDBQGC-IIFHNQTCSA-N
2,4993,C16H17N7O2S,371.417,5202,0.41,-3.02,1187594-09-7,120.56,1.0,3.0,...,0.0,\x89504e470d0a1a0a0000000d49484452000003200000...,9.0,1.0,"InChI=1S/C16H17N7O2S/c1-2-26(24,25)22-9-16(10-...","InChI=1S/C16H17N7O2S/c1-2-26(24,25)22-9-16(10-...",CCS(=O)(=O)N1CC(CC#N)(C1)N1C=C(C=N1)C1=NC=NC2=...,23.0,,XUZMWHLSFXCVMG-UHFFFAOYSA-N
3,4994,C19H18FN3O,323.3641,5203,3.006,-4.45,283173-50-2,56.92,1.0,3.0,...,0.0,\x89504e470d0a1a0a0000000d49484452000003200000...,4.0,3.0,InChI=1S/C19H18FN3O/c1-21-10-11-2-4-12(5-3-11)...,InChI=1S/C19H18FN3O/c1-21-10-11-2-4-12(5-3-11)...,CNCC1=CC=C(C=C1)C1=C2CCNC(=O)C3=CC(F)=CC(N1)=C23,22.0,1.0,HMABYWSNWIZPAG-UHFFFAOYSA-N
4,595,C13H14N2O,214.2631,2151,1.92,-2.36,553-69-5,45.15,0.0,2.0,...,0.0,\x89504e470d0a1a0a0000000d49484452000003200000...,3.0,2.0,InChI=1S/C13H14N2O/c16-12(11-6-2-1-3-7-11)10-1...,InChI=1S/C13H14N2O/c16-12(11-6-2-1-3-7-11)10-1...,OC(CNC1=CC=CC=N1)C1=CC=CC=C1,12.0,,ZEAJXCPGHPJVNP-UHFFFAOYSA-N


In [209]:
data.shape

(4509, 57)

### Get Approval Data

In [211]:
approval_df = pd.read_csv('Input/dc_approval.csv')

In [212]:
approval_df.head()

Unnamed: 0,id,struct_id,approval,type,applicant,orphan
0,3578,5204,2009-06-30,FDA,AMAG PHARMS INC,
1,3579,5200,,FDA,,
2,3580,5198,,FDA,,
3,3581,5229,2017-03-30,PMDA,Mundi Pharma,
4,3582,5231,2017-04-28,FDA,NOVARTIS PHARMS CORP,


In [221]:
approval_df.shape

(2594, 6)

In [222]:
print(len(data['id'].unique()))
print(len(set(data['id'].values.tolist()).intersection(set(approval_df['struct_id'].values.tolist()))))

4509
2390


In [234]:
approval_df = approval_df[approval_df['type'] == 'FDA']

In [235]:
approval_df.shape

(2072, 6)

In [236]:
data.set_index('id', inplace=True)

In [237]:
data = data.loc[approval_df['struct_id']]

In [238]:
data.shape

(2072, 56)

In [241]:
data.reset_index(inplace=True)

In [242]:
data.head()

Unnamed: 0,id,cd_id,cd_formula,cd_molweight,clogp,alogs,cas_reg_no,tpsa,rng_aliph,rng_arom,...,zwitterion_8,molimg,o_n,oh_nh,inchi,nostereo_inchi,smiles,rgb,fda_labels,inchikey
0,5204,4995,,,,,722492-56-0,0.0,0.0,0.0,...,,,,,,,,,1.0,
1,5200,4991,C4H4Na2O4,162.0517,,,150-90-3,80.26,0.0,0.0,...,,,,,,,,,,
2,5198,4989,C130H220N44O41,3055.408,,,17034-35-4,1421.45,0.0,2.0,...,0.0,\x89504e470d0a1a0a0000000d49484452000003200000...,85.0,52.0,InChI=1S/C130H220N44O41/c1-59(2)41-79(119(208)...,InChI=1S/C130H220N44O41/c1-59(2)41-79(119(208)...,CC(C)C[C@H](NC(=O)CNC(=O)[C@H](CCC(N)=O)NC(=O)...,77.0,,JWQZOTGHUDZFMU-WIDFLDSMSA-N
3,5231,5025,C35H30N4O4,570.6371,5.491,-4.56,120685-11-2,77.73,3.0,6.0,...,0.0,\x89504e470d0a1a0a0000000d49484452000003200000...,8.0,1.0,InChI=1S/C35H30N4O4/c1-35-32(42-3)25(37(2)34(4...,InChI=1S/C35H30N4O4/c1-35-32(42-3)25(37(2)34(4...,CO[C@@H]1[C@@H](C[C@H]2O[C@]1(C)N1C3=CC=CC=C3C...,45.0,1.0,BMGQWWVMWDBQGC-IIFHNQTCSA-N
4,5230,5023,,,,,151662-36-1,0.0,0.0,0.0,...,,,,,,,,,1.0,


In [263]:
name_synonyms = pd.read_csv('Input/dc_synonyms.csv')

In [265]:
name_synonyms.set_index('id', inplace=True)

In [266]:
name_synonyms.head()

Unnamed: 0_level_0,syn_id,name,preferred_name,parent_id,lname
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5249.0,22365,DW-224a,,,dw-224a
667.0,22366,mavenclad,,,mavenclad
2210.0,22226,piroxicam olamine,,,piroxicam olamine
,22227,Blood-coagulation factor VIII,1.0,221.0,blood-coagulation factor viii
224.0,22229,antazoline mesylate,,,antazoline mesylate


In [269]:
name_synonyms.loc[5232.0, 'name'].values.tolist()

['abaloparatide', 'tymlos', 'BA058', 'BIM-44058']

### Get Molecule PubChem ID and MACCS Keys

In [279]:
drugs = []
fingerprints = []
namesList = []
synonyms = []
FailedToGetSMILE = 0
FailedToGetID = 0
MoleIsNone = 0

for i,index in enumerate(data.index):
    
    progressPercent = ((i+1)/len(data.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(data.index)))
    sys.stdout.flush()
           
    if type(data.loc[index, 'smiles']) != float:
        maccs = list(MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(data.loc[index, 'smiles'])).GetOnBits())
        maccs = ['MACCS_'+str(x) for x in maccs]
        
        if type(name_synonyms.loc[float(data.loc[index, 'id']), 'name']) == str:
            names = name_synonyms.loc[float(data.loc[index, 'id']), 'name']
        else:
            names = name_synonyms.loc[float(data.loc[index, 'id']), 'name'].values.tolist()
        
        for name in names:
            name = name.replace(' ', '%20')
            url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/'+name+'/cids/JSON'
            response = requests.post(url)
            if 'PC_Compounds' in response.json().keys():
                ID = response.json()['PC_Compounds'][0]['id']['id']['cid']
                IDs = [ID]*len(maccs)
                drugs.append(IDs)
                fingerprints.append(maccs)
                namesList.append([names[0]]*len(mccs))
                synonyms.append([(':').join(names[1:])]*len(maccs))
            elif 'IdentifierList' in response.json().keys():
                ID = response.json()['IdentifierList']['CID'][0]
                IDs = [ID]*len(maccs)
                drugs.append(IDs)
                fingerprints.append(maccs)
                namesList.append([names[0]]*len(maccs))
                synonyms.append([(':').join(names[1:])]*len(maccs))
            elif name == names[-1]:
                FailedToGetID += 1
    else:
        FailedToGetSMILE += 1



drugs = [item for sublist in drugs for item in sublist]
fingerprints = [item for sublist in fingerprints for item in sublist]
namesList = [item for sublist in namesList for item in sublist]
synonyms = [item for sublist in synonyms for item in sublist]
df = pd.DataFrame(columns=['Drug ID', 'Drug Name', 'Drug Synonyms', 'Fingerprint'])
df['Drug ID'] = drugs
df['Drug Name'] = namesList
df['Drug Synonyms'] = synonyms
df['Fingerprint'] = fingerprints


print('Number of drugs with data not found: %r' % MoleIsNone)
print('Number of drugs with SMILE data not found: %r' % FailedToGetSMILE)
print('Number of drugs with names that not matched to ids: %r' % FailedToGetID)

Number of drugs with data not found: 0
Number of drugs with SMILE data not found: 359
Number of drugs with names that not matched to ids: 95


In [280]:
df.shape

(388586, 4)

In [281]:
drugcentral_df = df.copy()

In [282]:
lst = []

for index in drugcentral_df.index:
    lst.append(drugcentral_df.loc[index, 'Drug Name'].upper())
    
drugcentral_df['Drug Name'] = lst

In [283]:
lst = []

for index in drugcentral_df.index:
    lst.append(drugcentral_df.loc[index, 'Drug Synonyms'].upper())
    
drugcentral_df['Drug Synonyms'] = lst

In [284]:
drugcentral_df.head()

Unnamed: 0,Drug ID,Drug Name,Drug Synonyms,Fingerprint
0,16129665,SECREFLO,SECRETIN PORCINE:SECRETIN SYNTHETIC PORCINE,MACCS_25
1,16129665,SECREFLO,SECRETIN PORCINE:SECRETIN SYNTHETIC PORCINE,MACCS_43
2,16129665,SECREFLO,SECRETIN PORCINE:SECRETIN SYNTHETIC PORCINE,MACCS_53
3,16129665,SECREFLO,SECRETIN PORCINE:SECRETIN SYNTHETIC PORCINE,MACCS_54
4,16129665,SECREFLO,SECRETIN PORCINE:SECRETIN SYNTHETIC PORCINE,MACCS_65


In [285]:
print(len(drugcentral_df['Drug Name'].unique()))

1688


### Drug Repurposing Hub 

In [25]:
data = pd.read_csv('Input/repurposing_samples_20170327.txt', sep='\t', skiprows=9, encoding = "latin1")

In [26]:
data.head()

Unnamed: 0,broad_id,pert_iname,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid
0,BRD-A37752546-001-01-9,(1E)-1-(2-hydroxy-5-methylphenyl)-1-dodecanone...,0,98.35,Sigma,MFCD00900589,(1E)-1-(2-hydroxy-5-methylphenyl)-1-dodecanone...,305.235,CCCCCCCCCCCC([N+][O-])c1cc(C)ccc1O,NFONIVRMILHYLH-UHFFFAOYSA-N,54108714.0
1,BRD-K89787693-001-01-1,"[sar9,met(o2)11]-substance-p",0,,Tocris,1178,2-({5-amino-2-[({1-[6-amino-2-({[1-(2-amino-5-...,1392.73,CC(C)C[C@H](NC(=O)CN(C)C(=O)[C@H](Cc1ccccc1)NC...,OUPXSLGGCPUZJJ-SARDKLJWSA-N,163829.0
2,BRD-K88956297-003-01-9,"1-((Z)-3-Chloroallyl)-1,3,5,7-tetraazaadamanta...",0,94.41,BidePharm,BD51690,"1-((Z)-3-Chloroallyl)-1,3,5,7-tetraazaadamanta...",215.106,Cl\C=C/C[N+]12CN3CN(CN(C3)C1)C2,LDLCEGCJYSDJLX-UPHRSURJSA-N,5846454.0
3,BRD-A86415025-050-01-0,"1-(1,2-Diphenylethyl)piperidine-(+/-)",0,98.04,Tocris,360,"(?)-1-(1,2-Diphenylethyl)piperidine maleate",265.183,C(C(N1CCCCC1)c1ccccc1)c1ccccc1,JQWJJJYHVHNXJH-UHFFFAOYSA-N,206666.0
4,BRD-A95802703-001-01-0,1-(2-chloro-5-methylphenoxy)-3-(isopropylamino...,0,92.38,Enamine,Z1672746675,1-(2-chloro-5-methylphenoxy)-3-(isopropylamino...,257.118,CC(C)NCC(O)COc1cc(C)ccc1Cl,NJEIOWSBPCZKTL-UHFFFAOYSA-N,20497006.0


In [27]:
cilinical_info = pd.read_csv('Input/repurposing_drugs_20170327.txt', sep='\t', skiprows=9, encoding = "latin1")

In [28]:
cilinical_info.head()

Unnamed: 0,pert_iname,clinical_phase,moa,target
0,(1E)-1-(2-hydroxy-5-methylphenyl)-1-dodecanone...,Preclinical,,
1,A-317491,Preclinical,purinergic receptor antagonist,P2RX3
2,A-33903,Phase 2,,
3,A-366,Preclinical,histone lysine methyltransferase inhibitor,EHMT1|EHMT2
4,A-674563,Preclinical,AKT inhibitor,AKT1|PKIA|PRKACA


In [29]:
cilinical_info = cilinical_info[cilinical_info['clinical_phase']=='Launched']

In [30]:
cilinical_info.head()

Unnamed: 0,pert_iname,clinical_phase,moa,target
8,abacavir,Launched,nucleoside reverse transcriptase inhibitor,
10,abamectin,Launched,benzodiazepine receptor agonist,GABBR1|GABBR2
13,abiraterone,Launched,androgen biosynthesis inhibitor,CYP11B1|CYP17A1
14,abiraterone-acetate,Launched,androgen biosynthesis inhibitor,CYP17A1
30,acamprosate,Launched,glutamate receptor antagonist,GABRA1|GABRA2|GABRA3|GABRA4|GABRA5|GABRA6|GABR...


In [31]:
cilinical_info.shape

(2350, 4)

In [32]:
data.set_index('pert_iname', inplace=True)

In [33]:
data = data.loc[cilinical_info['pert_iname'].values.tolist()]

In [34]:
data.head()

Unnamed: 0_level_0,broad_id,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid
pert_iname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
abacavir,BRD-A95032015-065-01-2,0,98.25,MicroSource,1502410,ABACAVIR SULFATE,286.154,Nc1nc(NC2CC2)c2ncn(C3C[C@H](CO)C=C3)c2n1,MCGSCOLBFJQGHM-HNHGDDPOSA-N,6328608.0
abacavir,BRD-K17443395-065-01-4,0,96.52,Selleck,S3165,Abacavir sulfate,286.154,Nc1nc(NC2CC2)c2ncn([C@@H]3C[C@H](CO)C=C3)c2n1,MCGSCOLBFJQGHM-SCZZXKLOSA-N,441300.0
abamectin,BRD-A25579302-001-04-8,0,61.0,MicroSource,1502260,ABAMECTIN (avermectin B1a shown),872.492,CCC(C)[C@H]1O[C@@]2(C[C@@H]3C[C@@H](C\C=C(C)\[...,RRZXIRBKKLTSOM-ONDMGMIHSA-N,5702208.0
abiraterone,BRD-K00111504-001-01-9,0,92.23,Selleck,S1123,Abiraterone (CB-7598),349.241,C[C@]12CC[C@@H]3[C@@H](CC=C4C[C@@H](O)CC[C@]34...,GZOSMCIZMLWJML-APXSMTNNSA-N,
abiraterone-acetate,BRD-K24048528-001-01-7,0,98.65,Selleck,S2246,Abiraterone Acetate,391.251,CC(=O)O[C@H]1CC[C@]2(C)[C@H]3CC[C@@]4(C)[C@@H]...,UVIQSJCZCSLXRZ-UBUQANBQSA-N,9821849.0


In [35]:
data.shape

(4301, 10)

In [36]:
data.dropna(subset=['pubchem_cid'], inplace=True)

In [37]:
data.head()

Unnamed: 0_level_0,broad_id,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid
pert_iname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
abacavir,BRD-A95032015-065-01-2,0,98.25,MicroSource,1502410,ABACAVIR SULFATE,286.154,Nc1nc(NC2CC2)c2ncn(C3C[C@H](CO)C=C3)c2n1,MCGSCOLBFJQGHM-HNHGDDPOSA-N,6328608.0
abacavir,BRD-K17443395-065-01-4,0,96.52,Selleck,S3165,Abacavir sulfate,286.154,Nc1nc(NC2CC2)c2ncn([C@@H]3C[C@H](CO)C=C3)c2n1,MCGSCOLBFJQGHM-SCZZXKLOSA-N,441300.0
abamectin,BRD-A25579302-001-04-8,0,61.0,MicroSource,1502260,ABAMECTIN (avermectin B1a shown),872.492,CCC(C)[C@H]1O[C@@]2(C[C@@H]3C[C@@H](C\C=C(C)\[...,RRZXIRBKKLTSOM-ONDMGMIHSA-N,5702208.0
abiraterone-acetate,BRD-K24048528-001-01-7,0,98.65,Selleck,S2246,Abiraterone Acetate,391.251,CC(=O)O[C@H]1CC[C@]2(C)[C@H]3CC[C@@]4(C)[C@@H]...,UVIQSJCZCSLXRZ-UBUQANBQSA-N,9821849.0
acamprosate,BRD-K26262077-238-01-5,1,0.0,MicroSource,1505711,ACAMPROSATE CALCIUM,181.041,CC(=O)NCCCS(O)(=O)=O,AFCGFAGUEYAMAO-UHFFFAOYSA-N,71158.0


In [38]:
data.shape

(4002, 10)

In [39]:
data = data.drop_duplicates(subset=['pubchem_cid'], keep='first')

In [40]:
data.shape

(2535, 10)

In [41]:
data.reset_index(inplace=True)

In [42]:
data.head()

Unnamed: 0,pert_iname,broad_id,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid
0,abacavir,BRD-A95032015-065-01-2,0,98.25,MicroSource,1502410,ABACAVIR SULFATE,286.154,Nc1nc(NC2CC2)c2ncn(C3C[C@H](CO)C=C3)c2n1,MCGSCOLBFJQGHM-HNHGDDPOSA-N,6328608.0
1,abacavir,BRD-K17443395-065-01-4,0,96.52,Selleck,S3165,Abacavir sulfate,286.154,Nc1nc(NC2CC2)c2ncn([C@@H]3C[C@H](CO)C=C3)c2n1,MCGSCOLBFJQGHM-SCZZXKLOSA-N,441300.0
2,abamectin,BRD-A25579302-001-04-8,0,61.0,MicroSource,1502260,ABAMECTIN (avermectin B1a shown),872.492,CCC(C)[C@H]1O[C@@]2(C[C@@H]3C[C@@H](C\C=C(C)\[...,RRZXIRBKKLTSOM-ONDMGMIHSA-N,5702208.0
3,abiraterone-acetate,BRD-K24048528-001-01-7,0,98.65,Selleck,S2246,Abiraterone Acetate,391.251,CC(=O)O[C@H]1CC[C@]2(C)[C@H]3CC[C@@]4(C)[C@@H]...,UVIQSJCZCSLXRZ-UBUQANBQSA-N,9821849.0
4,acamprosate,BRD-K26262077-238-01-5,1,0.0,MicroSource,1505711,ACAMPROSATE CALCIUM,181.041,CC(=O)NCCCS(O)(=O)=O,AFCGFAGUEYAMAO-UHFFFAOYSA-N,71158.0


### Get Molecule PubChem ID and MACCS Keys

In [43]:
drugs = []
fingerprints = []
namesList = []
synonyms = []
FailedToGetSMILE = 0
FailedToGetID = 0
MoleIsNone = 0

for i,index in enumerate(data.index):
    
    progressPercent = ((i+1)/len(data.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(data.index)))
    sys.stdout.flush()
           
    maccs = list(MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(data.loc[index, 'smiles'])).GetOnBits())
    maccs = ['MACCS_'+str(x) for x in maccs]

    name = data.loc[index, 'pert_iname']
    IDs = [int(data.loc[index, 'pubchem_cid'])]*len(maccs)
    drugs.append(IDs)
    fingerprints.append(maccs)
    namesList.append([name]*len(maccs))


drugs = [item for sublist in drugs for item in sublist]
fingerprints = [item for sublist in fingerprints for item in sublist]
namesList = [item for sublist in namesList for item in sublist]
synonyms = [item for sublist in synonyms for item in sublist]
df = pd.DataFrame(columns=['Drug ID', 'Drug Name', 'Drug Synonyms', 'Fingerprint'])
df['Drug ID'] = drugs
df['Drug Name'] = namesList
# df['Drug Synonyms'] = synonyms
df['Fingerprint'] = fingerprints


print('Number of drugs with data not found: %r' % MoleIsNone)
print('Number of drugs with SMILE data not found: %r' % FailedToGetSMILE)
print('Number of drugs with names that not matched to ids: %r' % FailedToGetID)

Number of drugs with data not found: 0
Number of drugs with SMILE data not found: 0
Number of drugs with names that not matched to ids: 0


In [44]:
df.head()

Unnamed: 0,Drug ID,Drug Name,Drug Synonyms,Fingerprint
0,6328608,abacavir,,MACCS_22
1,6328608,abacavir,,MACCS_25
2,6328608,abacavir,,MACCS_38
3,6328608,abacavir,,MACCS_53
4,6328608,abacavir,,MACCS_62


In [45]:
df.shape

(116400, 4)

In [46]:
DRH_df = df.copy()

In [53]:
lst = []

for index in DRH_df.index:
    lst.append(DRH_df.loc[index, 'Drug Name'].upper())
    
DRH_df['Drug Name'] = lst

In [54]:
DRH_df.head()

Unnamed: 0,Drug ID,Drug Name,Drug Synonyms,Fingerprint
0,6328608,ABACAVIR,,MACCS_22
1,6328608,ABACAVIR,,MACCS_25
2,6328608,ABACAVIR,,MACCS_38
3,6328608,ABACAVIR,,MACCS_53
4,6328608,ABACAVIR,,MACCS_62


In [60]:
DRH_df.shape

(116400, 4)

### Approved Drugs

In [309]:
approved_df = drugbank_df.copy()

In [310]:
didnt_do_anything = 0
not_found = 0
found_but_different_macc = 0
found_same_macc_diff_name = 0

for i,index in enumerate(DRH_df.index):
    
    progressPercent = ((i+1)/len(DRH_df.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(DRH_df.index)))
    sys.stdout.flush()
    
    if DRH_df.loc[index, 'Drug ID'] not in approved_df['Drug ID'].unique():
        not_found += 1
        approved_df = approved_df.append(DRH_df.loc[index, :], ignore_index=True)
    elif DRH_df.loc[index, 'Fingerprint'] not in approved_df[approved_df['Drug ID'] == DRH_df.loc[index, 'Drug ID']]['Fingerprint'].values.tolist():
        found_but_different_macc += 1
        approved_df = approved_df.append(DRH_df.loc[index, :], ignore_index=True)
    elif DRH_df.loc[index, 'Drug Name'] != approved_df[approved_df['Drug ID'] == DRH_df.loc[index, 'Drug ID']]['Drug Name'].tolist()[0] and DRH_df.loc[index, 'Drug Name'] != approved_df[approved_df['Drug ID'] == DRH_df.loc[index, 'Drug ID']]['Drug Synonyms'].tolist()[0].split(':'):
        found_same_macc_diff_name += 1 
        synonyms = (':').join(list(set([approved_df[approved_df['Drug ID'] == DRH_df.loc[index, 'Drug ID']]['Drug Synonyms'].tolist()[0], DRH_df.loc[index, 'Drug Name']])))
        approved_df.loc[approved_df[approved_df['Drug ID'] == DRH_df.loc[index, 'Drug ID']].index, 'Drug Synonyms'] = synonyms
    else:
        didnt_do_anything += 1

Progress: 100%  116400 Out of 116400   

In [311]:
print(didnt_do_anything)
print(not_found)
print(found_but_different_macc)
print(found_same_macc_diff_name)

37502
1260
57730
19908


In [312]:
approved_df.head()

Unnamed: 0,Drug ID,Drug Name,Drug Synonyms,Fingerprint
0,16129704,BIVALIRUDIN,BIVALIRUDINA:BIVALIRUDINUM:HIRULOG,MACCS_25
1,16129704,BIVALIRUDIN,BIVALIRUDINA:BIVALIRUDINUM:HIRULOG,MACCS_43
2,16129704,BIVALIRUDIN,BIVALIRUDINA:BIVALIRUDINUM:HIRULOG,MACCS_53
3,16129704,BIVALIRUDIN,BIVALIRUDINA:BIVALIRUDINUM:HIRULOG,MACCS_54
4,16129704,BIVALIRUDIN,BIVALIRUDINA:BIVALIRUDINUM:HIRULOG,MACCS_74


In [313]:
approved_df.shape

(163625, 4)

In [319]:
print(len(drugbank_df['Drug ID']))
print(len(DRH_df['Drug ID']))
print(len(approved_df['Drug ID']))
print(len(drugbank_df['Drug ID'])+len(DRH_df['Drug ID']))

104635
116400
163755
221035


In [330]:
didnt_do_anything = 0
not_found = 0
found_but_different_macc = 0
found_same_macc_diff_name = 0

for i,index in enumerate(drugcentral_df.index):
    
    progressPercent = ((i+1)/len(drugcentral_df.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(drugcentral_df.index)))
    sys.stdout.flush()
    
    names = drugcentral_df.loc[index, 'Drug Synonyms'].split(':')
    names.insert(0, drugcentral_df.loc[index, 'Drug Name'])
    
    if drugcentral_df.loc[index, 'Drug ID'] not in approved_df['Drug ID'].unique():
        not_found += 1
        approved_df = approved_df.append(drugcentral_df.loc[index, :], ignore_index=True)
    elif drugcentral_df.loc[index, 'Fingerprint'] not in approved_df[approved_df['Drug ID'] == drugcentral_df.loc[index, 'Drug ID']]['Fingerprint'].values.tolist():
        found_but_different_macc += 1
        approved_df = approved_df.append(drugcentral_df.loc[index, :], ignore_index=True)
    else:
        for name in names:
            if type(approved_df[approved_df['Drug ID'] == drugcentral_df.loc[index, 'Drug ID']]['Drug Synonyms'].tolist()[0]) != float:
                if name == names[-1] and name != approved_df[approved_df['Drug ID'] == drugcentral_df.loc[index, 'Drug ID']]['Drug Name'].tolist()[0] and name != drugcentral_df.loc[index, 'Drug Name'] != approved_df[approved_df['Drug ID'] == drugcentral_df.loc[index, 'Drug ID']]['Drug Synonyms'].tolist()[0].split(':'):
                    found_same_macc_diff_name += 1 
                    synonyms = (':').join(list(set(approved_df[approved_df['Drug ID'] == drugcentral_df.loc[index, 'Drug ID']]['Drug Synonyms'].tolist()[0].split(':')+names)))
                    approved_df.loc[approved_df[approved_df['Drug ID'] == drugcentral_df.loc[index, 'Drug ID']].index, 'Drug Synonyms'] = synonyms
            elif name == names[-1] and name != approved_df[approved_df['Drug ID'] == drugcentral_df.loc[index, 'Drug ID']]['Drug Name'].tolist()[0]:
                    found_same_macc_diff_name += 1
                    synonyms = (':').join(list(set(names)))
                    approved_df.loc[approved_df[approved_df['Drug ID'] == drugcentral_df.loc[index, 'Drug ID']].index, 'Drug Synonyms'] = synonyms
            else:
                didnt_do_anything += 1

Progress: 100%  388586 Out of 388586   

In [331]:
print(didnt_do_anything)
print(not_found)
print(found_but_different_macc)
print(found_same_macc_diff_name)

952
1198
56067
322595


In [332]:
drugcentral_df.head()

Unnamed: 0,Drug ID,Drug Name,Drug Synonyms,Fingerprint
0,16129665,SECREFLO,SECRETIN PORCINE:SECRETIN SYNTHETIC PORCINE,MACCS_25
1,16129665,SECREFLO,SECRETIN PORCINE:SECRETIN SYNTHETIC PORCINE,MACCS_43
2,16129665,SECREFLO,SECRETIN PORCINE:SECRETIN SYNTHETIC PORCINE,MACCS_53
3,16129665,SECREFLO,SECRETIN PORCINE:SECRETIN SYNTHETIC PORCINE,MACCS_54
4,16129665,SECREFLO,SECRETIN PORCINE:SECRETIN SYNTHETIC PORCINE,MACCS_65


In [333]:
approved_df.head()

Unnamed: 0,Drug ID,Drug Name,Drug Synonyms,Fingerprint
0,16129704,BIVALIRUDIN,ANGIOMAX:ANGIOX:HIRULOG:BIVALIRUDIN:BIVALIRUDI...,MACCS_25
1,16129704,BIVALIRUDIN,ANGIOMAX:ANGIOX:HIRULOG:BIVALIRUDIN:BIVALIRUDI...,MACCS_43
2,16129704,BIVALIRUDIN,ANGIOMAX:ANGIOX:HIRULOG:BIVALIRUDIN:BIVALIRUDI...,MACCS_53
3,16129704,BIVALIRUDIN,ANGIOMAX:ANGIOX:HIRULOG:BIVALIRUDIN:BIVALIRUDI...,MACCS_54
4,16129704,BIVALIRUDIN,ANGIOMAX:ANGIOX:HIRULOG:BIVALIRUDIN:BIVALIRUDI...,MACCS_74


In [334]:
approved_df.shape

(222356, 4)

In [337]:
approved_df.to_csv('Output/approved_df.tsv', sep='\t')

# filename = path+'approved_df_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
# approved_df.to_csv(filename, sep='\t', compression='gzip')

# Create Binary Matrix

In [344]:
matrix = uf.createBinaryMatrix(approved_df[['Fingerprint', 'Drug ID']])

Progeres: 100%  163 Out of 163   

In [345]:
matrix.head()

Unnamed: 0,1,5488645,5464070,16220172,40973,15581198,9871375,40976,57363,213013,...,9887712,16351,44629987,101826531,24550,442343,16362,16363,53239799,114682
MACCS_139,0,1,1,1,1,1,1,1,0,0,...,1,0,1,1,0,1,0,1,0,1
MACCS_25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MACCS_50,0,1,0,0,1,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
MACCS_78,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
MACCS_33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [346]:
matrix.shape

(163, 4895)

# Save Binary Matrix to File

In [347]:
filename = path+'approved_maccs_binary_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Fingerprint List

In [348]:
fingerprint_list = pd.DataFrame(columns=['Fingerprint (MACCS KEY)'],data = matrix.index.values.tolist())

In [349]:
fingerprint_list.head()

Unnamed: 0,Fingerprint (MACCS KEY)
0,MACCS_139
1,MACCS_25
2,MACCS_50
3,MACCS_78
4,MACCS_33


# Save Fingerprint to File

In [350]:
filename = path+'approved_maccs_fingerprint_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
fingerprint_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Drug List

In [351]:
metaData = approved_df[['Drug ID', 'Drug Name', 'Drug Synonyms']]

In [352]:
metaData.drop_duplicates(subset='Drug ID', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [353]:
metaData.set_index('Drug ID', inplace=True)

In [354]:
metaData.head()

Unnamed: 0_level_0,Drug Name,Drug Synonyms
Drug ID,Unnamed: 1_level_1,Unnamed: 2_level_1
16129704,BIVALIRUDIN,ANGIOMAX:ANGIOX:HIRULOG:BIVALIRUDIN:BIVALIRUDI...
5311128,BIVALIRUDIN,HIRULOG:BIVALIRUDINA:DECAPEPTIDE I:ZOLADEX:GOS...
16130140,GRAMICIDIN D,BACILLUSBREVISGRAMICIDIND:GRAMICIDIN:GRAMICIDI...
5311065,DESMOPRESSIN,DESMOPRESSIN:DESMOPRESSIN ACETATE:MINIRINMELT:...
27991,DESMOPRESSIN,DESMOPRESSIN:DESMOPRESSIN ACETATE:MINIRINMELT:...


In [355]:
drug_list = uf.createAttributeList(matrix, metaData)

Progeres: 100%  4895 Out of 4895   

In [356]:
drug_list.head()

Unnamed: 0_level_0,Drug Name,Drug Synonyms
Attributes,Unnamed: 1_level_1,Unnamed: 2_level_1
1,ACETYLCARNITINE,ACETYL-L-CARNITINE:O-ACETYL-L-CARNITINE
5488645,LATAMOXEF,
5464070,PRALIDOXIME,PRALIDOXIME IODIDE:PRALIDOXIME:2-PAM:PRALIDOXI...
16220172,IVACAFTOR,"N-(2,4-DI-TERT-BUTYL-5-HYDROXYPHENYL)-4-OXO-1,..."
40973,DESOGESTREL,DESOGESTRELUM:ORG 2969:CERAZETTE:DESOGESTREL


In [357]:
drug_list.shape

(4895, 2)

# Save Drug List to File

In [358]:
filename = path+'approved_maccs_drug_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
drug_list.to_csv(filename, sep='\t', compression='gzip')

# Create Fingerprint Set Library

In [359]:
name = 'approved_maccs_fingerprint_set'

In [360]:
uf.createUpGeneSetLib(matrix, path, name)

Progeres: 100%  4895 Out of 4895   

# Create Drug Set Library

In [361]:
name = 'approved_maccs_drug_set'

In [362]:
uf.createUpAttributeSetLib(matrix, path, name)

Progeres: 100%  163 Out of 163   

# Create Fingerprint Similarity Matrix

In [363]:
fingerprint_similarity_matix = uf.createSimilarityMatrix(matrix, 'jaccard')

In [364]:
fingerprint_similarity_matix.head()

Unnamed: 0,MACCS_139,MACCS_25,MACCS_50,MACCS_78,MACCS_33,MACCS_90,MACCS_64,MACCS_43,MACCS_61,MACCS_99,...,MACCS_119,MACCS_96,MACCS_17,MACCS_111,MACCS_19,MACCS_146,MACCS_8,MACCS_10,MACCS_29,MACCS_80
,,,,,,,,,,,,,,,,,,,,,
MACCS_139,1.0,0.052526,0.181128,0.095363,0.026255,0.506165,0.056523,0.103471,0.055519,0.254453,...,0.107568,0.360087,0.01357,0.330629,0.064162,0.63008,0.079847,0.005587,0.047983,0.21193
MACCS_25,0.052526,1.0,0.002893,0.2656,0.045802,0.086746,0.035276,0.346221,0.039039,0.019608,...,0.226467,0.074824,0.005391,0.079669,0.007622,0.055014,0.0,0.0,0.008511,0.166782
MACCS_50,0.181128,0.002893,1.0,0.07645,0.011329,0.123065,0.061244,0.022147,0.022707,0.669391,...,0.084622,0.216068,0.043984,0.067808,0.040875,0.170114,0.140391,0.0,0.024831,0.073961
MACCS_78,0.095363,0.2656,0.07645,1.0,0.045649,0.119409,0.055215,0.266312,0.044048,0.086867,...,0.818792,0.129892,0.00722,0.123439,0.073604,0.090209,0.10089,0.001908,0.023292,0.17673
MACCS_33,0.026255,0.045802,0.011329,0.045649,1.0,0.038346,0.380313,0.071104,0.542579,0.032209,...,0.04602,0.058145,0.0,0.048808,0.020305,0.061816,0.048319,0.0,0.012165,0.07886


# Save Fingerprint Similarity Matrix

In [365]:
filename = path+'approved_maccs_fingerprint_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
fingerprint_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# HeatMap (clustergrammer) of Similarity Matrix

In [373]:
# net.load_df(fingerprint_similarity_matix.iloc[:,:].copy())
# # net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

# Create Drug Similarity Matrix

In [367]:
drug_similarity_matix = uf.createSimilarityMatrix(matrix.T, 'jaccard')

In [368]:
drug_similarity_matix.head()

Unnamed: 0,1,5488645,5464070,16220172,40973,15581198,9871375,40976,57363,213013,...,9887712,16351,44629987,101826531,24550,442343,16362,16363,53239799,114682
,,,,,,,,,,,,,,,,,,,,,
1.0,1.0,0.272727,0.163636,0.241935,0.137931,0.246377,0.301587,0.172414,0.209677,0.068966,...,0.196721,0.15493,0.220779,0.265823,0.0,0.255319,0.146667,0.210526,0.3,0.22973
5488645.0,0.272727,1.0,0.287356,0.488095,0.225806,0.55814,0.368421,0.247312,0.336957,0.27907,...,0.344444,0.3,0.56044,0.541667,0.023256,0.163043,0.44086,0.489362,0.463158,0.561798
5464070.0,0.163636,0.287356,1.0,0.283333,0.1,0.283582,0.108108,0.096774,0.190476,0.319149,...,0.15873,0.322581,0.27027,0.204819,0.052632,0.092593,0.283582,0.277778,0.28169,0.28169
16220172.0,0.241935,0.488095,0.283333,1.0,0.234375,0.455882,0.319444,0.265625,0.517241,0.5,...,0.34375,0.233766,0.465753,0.506667,0.0,0.2,0.375,0.438356,0.507246,0.485714
40973.0,0.137931,0.225806,0.1,0.234375,1.0,0.173333,0.555556,0.944444,0.425926,0.084746,...,0.271186,0.150685,0.230769,0.307692,0.0,0.605263,0.222222,0.27027,0.309859,0.177215


# Save Fingerprint Similarity Matrix

In [369]:
filename = path+'approved_maccs_drug_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
drug_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

In [372]:
# net.load_df(drug_similarity_matix.iloc[0:400,0:400].copy())
# # net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

# Create Fingerprint-Drug Edge List

In [374]:
name = 'approved_maccs_figerprint_drug_edge_list'

In [375]:
uf.createGeneAttributeEdgeList(matrix, drug_list, fingerprint_list, path, name)

Progeres: 100%  4895 Out of 4895   

 The number of statisticaly relevent gene-attribute associations is: 222121
