#### Imports

In [1]:
import os
import json
import pandas as pd
import pubchempy as pcp

#### Open dictionary and select small molecules

In [2]:
with open(os.path.join('drugbank_database_dict.json'), 'r') as file:
    drug_list = json.load(file)

In [10]:
drug_list[7]['@type']

'biotech'

In [4]:
small_drug_list = []
for drug in drug_list:
    if drug['@type'] == 'small molecule':
        small_drug_list.append(drug)

In [28]:
len(small_drug_list)

12227

In [29]:
small_drug_list[2]

{'@type': 'small molecule',
 '@created': '2005-06-13',
 '@updated': '2023-01-03',
 'drugbank-id': [{'@primary': True, '$': 'DB00014'},
  {'@primary': False, '$': 'BTD00113'},
  {'@primary': False, '$': 'BIOD00113'}],
 'name': 'Goserelin',
 'description': 'Goserelin is a synthetic hormone. In men, it stops the production of the hormone testosterone, which may stimulate the growth of cancer cells. In women, goserelin decreases the production of the hormone estradiol (which may stimulate the growth of cancer cells) to levels similar to a postmenopausal state. When the medication is stopped, hormone levels return to normal.',
 'cas-number': '65807-02-5',
 'unii': '0F65R8P09N',
 'average-mass': 1269.4105,
 'monoisotopic-mass': 1268.641439486,
 'state': 'solid',
 'groups': {'group': ['approved']},
 'general-references': {'articles': None,
  'textbooks': None,
  'links': None,
  'attachments': None},
 'synthesis-reference': 'Kripa S. Srivastava, Matthew R. Davis, "Solid Phase Peptide for the 

#### Extract properties to a new dictionary

In [30]:
small_drug_list[0]['atc-codes']['atc-code']

[{'@code': 'B01AE06',
  'level': [{'@code': 'B01AE', '$': 'Direct thrombin inhibitors'},
   {'@code': 'B01A', '$': 'ANTITHROMBOTIC AGENTS'},
   {'@code': 'B01', '$': 'ANTITHROMBOTIC AGENTS'},
   {'@code': 'B', '$': 'BLOOD AND BLOOD FORMING ORGANS'}]}]

In [31]:
for thingy in small_drug_list[0]['atc-codes']['atc-code']:
    print(thingy['@code'])


B01AE06


In [32]:
def get_inchikey(drug_dic):
    try:
        for dic in drug_dic['calculated-properties']['property']:
            if dic['kind'] == 'InChIKey':
                return dic['value']  
    except:
        return None

In [33]:
def get_h_bond_accept(drug_dic):
    try:
        for dic in drug_dic['calculated-properties']['property']:
            if dic['kind'] == 'H Bond Acceptor Count':
                return dic['value']
    except:        
        return None

In [34]:
def get_h_bond_donor(drug_dic):
    try:
        for dic in drug_dic['calculated-properties']['property']:
            if dic['kind'] == 'H Bond Donor Count':
                return dic['value']
    except:        
        return None

In [35]:
def get_mol_weight(drug_dic):
    try:
        for dic in drug_dic['calculated-properties']['property']:
            if dic['kind'] == 'Molecular Weight':
                return dic['value']
    except:        
        return None

In [36]:
def get_logp(drug_dic):
    try:
        for dic in drug_dic['calculated-properties']['property']:
            if dic['kind'] == 'logP' and dic['source'] == 'ChemAxon':
                return dic['value']
    except:        
        return None

In [37]:
def get_rule_five(drug_dic):
    try:
        for dic in drug_dic['calculated-properties']['property']:
            if dic['kind'] == 'Rule of Five':
                return dic['value']
    except:        
        return None

In [38]:
def get_isomeric_smiles(drug_dic):
    try:
        for dic in drug_dic['calculated-properties']['property']:
            if dic['kind'] == 'SMILES':
                return dic['value']
    except:        
        return None

In [39]:
def get_atc_code(drug_dic):
    try:
        for code in drug_dic['atc-codes']['atc-code']:
            return code['@code']
    except:
        return None

In [40]:
def get_name(drug_dic):
    try:
        return drug_dic['name']
    except:
        return None

In [41]:
len(small_drug_list)

12227

In [46]:
drugs_df = pd.DataFrame()

In [47]:
drugs_df['InChIKey'] = list(map(get_inchikey, small_drug_list))
drugs_df['HBondAcceptorCount'] = list(map(get_h_bond_accept, small_drug_list))
drugs_df['HBondDonorCount'] = list(map(get_h_bond_donor, small_drug_list))
drugs_df['MolecularWeight'] = list(map(get_mol_weight, small_drug_list))
drugs_df['LogP'] = list(map(get_logp, small_drug_list))
drugs_df['RuleFive'] = list(map(get_rule_five, small_drug_list))
drugs_df['IsomericSMILES'] = list(map(get_isomeric_smiles, small_drug_list))
drugs_df['ATC_Code'] = list(map(get_atc_code, small_drug_list))

In [48]:
drugs_df

Unnamed: 0,InChIKey,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,RuleFive,IsomericSMILES,ATC_Code
0,OIRCOABEOLEUMC-GEJPAHFPSA-N,37,28,2180.2853,-14,0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,B01AE06
1,GFIJNRVAKGFPGQ-LIJARHBVSA-N,16,16,1209.3983,-2.4,0,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,L02AE51
2,BLCLNMBMMGCOAS-URPVMXJPSA-N,18,17,1269.4105,-5.1,0,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,L02AE03
3,NDAYQJDHGXTBJL-MWWSRJDJSA-N,16,20,1811.253,5.96,0,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,R02AB30
4,NFLWUMRGJYTJIN-PNIOQBSNSA-N,15,14,1069.22,-6.1,0,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,H01BA02
...,...,...,...,...,...,...,...,...
12222,,,,,,,,
12223,VXLAKHWYGRKCGI-UHFFFAOYSA-N,,,431.504,,,CN1CCN(CC2=CC=C(NC(=O)C3=NNC=C3NC3=C4C=CNC4=NC...,
12224,DQFCVOOFMXEPOC-UHFFFAOYSA-N,,,394.86,,,CC1=C2N=C(C3=CC=CC=C3Cl)C3=C(NC2=NN1)C=C(N=C3)...,
12225,JCCCLGDYMMTBPM-HXDHBHDHSA-N,,,850.71,,,CC[C@@]1(OC(=O)C(C)ON=C2C3=C(C4=C2C=C(C=C4[N+]...,


In [50]:
drugs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12227 entries, 0 to 12226
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   InChIKey            11586 non-null  object
 1   HBondAcceptorCount  11580 non-null  object
 2   HBondDonorCount     11580 non-null  object
 3   MolecularWeight     11586 non-null  object
 4   LogP                11570 non-null  object
 5   RuleFive            11587 non-null  object
 6   IsomericSMILES      11583 non-null  object
 7   ATC_Code            3024 non-null   object
dtypes: object(8)
memory usage: 764.3+ KB


seleccionamos los compuestos del drugbank que tienen un atc

In [52]:
drugs_atc = drugs_df[drugs_df['ATC_Code'].isna() == False]
drugs_atc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3024 entries, 0 to 12147
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   InChIKey            2940 non-null   object
 1   HBondAcceptorCount  2940 non-null   object
 2   HBondDonorCount     2940 non-null   object
 3   MolecularWeight     2940 non-null   object
 4   LogP                2936 non-null   object
 5   RuleFive            2941 non-null   object
 6   IsomericSMILES      2938 non-null   object
 7   ATC_Code            3024 non-null   object
dtypes: object(8)
memory usage: 212.6+ KB


eliminamos compuestos que no tienen SMILES, en su gran mayoría son polímeros sin estructura sencilla

In [53]:
drugs_atc_clean = drugs_atc[drugs_atc['IsomericSMILES'].isna()==False]

In [54]:
drugs_atc_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2938 entries, 0 to 12147
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   InChIKey            2938 non-null   object
 1   HBondAcceptorCount  2937 non-null   object
 2   HBondDonorCount     2937 non-null   object
 3   MolecularWeight     2938 non-null   object
 4   LogP                2934 non-null   object
 5   RuleFive            2938 non-null   object
 6   IsomericSMILES      2938 non-null   object
 7   ATC_Code            2938 non-null   object
dtypes: object(8)
memory usage: 206.6+ KB


Eliminamos los compuestos que ya están presentes en pubchem gracias a que el código ATC es único. De esta manera evitamos computar el CID de compuestos repetidos, ya que esa operación es muy costosa.

In [61]:
pubchem = pd.read_csv(os.path.join('..', 'pubchem', 'pubchem_dataset_label_clean.csv'))

In [62]:
drugs_atc_bank = drugs_atc_clean[drugs_atc_clean['ATC_Code'].isin(pubchem['ATC_Code']) == False]
drugs_atc_bank.info()

<class 'pandas.core.frame.DataFrame'>
Index: 679 entries, 1 to 12147
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   InChIKey            679 non-null    object
 1   HBondAcceptorCount  679 non-null    object
 2   HBondDonorCount     679 non-null    object
 3   MolecularWeight     679 non-null    object
 4   LogP                678 non-null    object
 5   RuleFive            679 non-null    object
 6   IsomericSMILES      679 non-null    object
 7   ATC_Code            679 non-null    object
dtypes: object(8)
memory usage: 47.7+ KB


Estos compuestos tienen un atc code que no está presente en pubchem. Analizando los datos he visto que varias moléculas muestran códigos ATC distintos en una y otra biblioteca. Para determinar si son duplicados con distinto atc o moléculas no presentes en pubchem, obtengo el CID de estos compuestos a partir de su smiles.

In [74]:
def get_cid_from_smiles(smiles):
    try:
        mol = pcp.get_compounds(smiles, 'smiles')
        return int(mol[0].cid)
    except:
        print(f'no cid for smiles {smiles}')
        return None

In [75]:
get_cid_from_smiles(drugs_atc_bank.iloc[0]['IsomericSMILES'])

657181

Creo una columna cid en el dataframe de drugbank. Este es un paso que lleva mucho tiempo, por eso espero a tener el dataset más pequeño antes de hacerlo.

In [76]:
drugs_atc_bank['CID'] = drugs_atc_bank['IsomericSMILES'].map(get_cid_from_smiles)

no cid for smiles [H][C@]12C[C@]1(NC(=O)[C@]1([H])C[C@H](C[C@@]1([H])C(=O)N(C)CCCC\C=C/2)OC1=CC(=NC2=C1C=CC(OC)=C2C)C1=NC(=CS1)C(C)C)C(=O)NS(=O)(=O)C1CC1
no cid for smiles [Mg++].[O-][O-]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drugs_atc_bank['CID'] = drugs_atc_bank['IsomericSMILES'].map(get_cid_from_smiles)


In [77]:
drugs_atc_bank

Unnamed: 0,InChIKey,HBondAcceptorCount,HBondDonorCount,MolecularWeight,LogP,RuleFive,IsomericSMILES,ATC_Code,CID
1,GFIJNRVAKGFPGQ-LIJARHBVSA-N,16,16,1209.3983,-2.4,0,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,L02AE51,657181.0
8,PMATZTZNYRCHOR-CGLBZJNRSA-N,12,5,1202.635,3.64,0,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\C=C\C)N...,L04AD01,5284373.0
19,KDXKERNSBIXSRK-YFKPBYRVSA-N,4,3,146.1876,-3.2,1,NCCCC[C@H](N)C(O)=O,V03AF11,5962.0
20,ODKSFYDXXFIFQN-BYPYZUCNSA-N,6,5,174.201,-3.2,1,N[C@@H](CCCNC(N)=N)C(O)=O,V03AF11,6322.0
21,CIWBSHSKHKDKBQ-JLAZNSOCSA-N,5,4,176.1241,-1.9,1,[H][C@@]1(OC(=O)C(O)=C1O)[C@@H](O)CO,G01AD03,54670067.0
...,...,...,...,...,...,...,...,...,...
11717,XAYGBKHKBBXDAK-UHFFFAOYSA-N,5,1,450.56,1.62,1,O=C(N1CCN(CC2CC2)CC1)C1=CC=C(NS(=O)(=O)C2=CC=C...,G01AE10,59634741.0
11728,VEVMYTDOWUQLGI-UHFFFAOYSA-N,4,3,409.96,5.22,0,CC(C)(C)NCC1=CC(NC2=CC=NC3=CC(Cl)=CC=C23)=C2CC...,P01BF08,9851775.0
11851,GBECUEIQVRDUKB-RYDPDVNUSA-M,0,0,236.42,0.53,1,Cl[201Tl],V09GX01,16019977.0
11942,WUWFMDMBOJLQIV-UHFFFAOYSA-N,7,2,404.349,0.47,1,NC1CCN(C1)C1=NC2=C(C=C1F)C(=O)C(=CN2C1=CC=C(F)...,J01MA22,5517.0


In [87]:
drugs_atc_bank.info()

<class 'pandas.core.frame.DataFrame'>
Index: 676 entries, 1 to 12147
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   InChIKey            676 non-null    object
 1   HBondAcceptorCount  676 non-null    object
 2   HBondDonorCount     676 non-null    object
 3   MolecularWeight     676 non-null    object
 4   LogP                676 non-null    object
 5   RuleFive            676 non-null    object
 6   IsomericSMILES      676 non-null    object
 7   ATC_Code            676 non-null    object
 8   CID                 676 non-null    int32 
dtypes: int32(1), object(8)
memory usage: 50.2+ KB


Drop two compounds without CID, one is actualy present in pubchem's dataset and the other is just magnesium peroxide

In [84]:
drugs_atc_bank = drugs_atc_bank[drugs_atc_bank['CID'].isna() == False]

CID are saved as floats instead of integers for some reason, so we need to transform the values to integers

In [86]:
drugs_atc_bank['CID'] = drugs_atc_bank['CID'].astype('int')

In [85]:
drugs_atc_bank.info()

<class 'pandas.core.frame.DataFrame'>
Index: 676 entries, 1 to 12147
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   InChIKey            676 non-null    object 
 1   HBondAcceptorCount  676 non-null    object 
 2   HBondDonorCount     676 non-null    object 
 3   MolecularWeight     676 non-null    object 
 4   LogP                676 non-null    object 
 5   RuleFive            676 non-null    object 
 6   IsomericSMILES      676 non-null    object 
 7   ATC_Code            676 non-null    object 
 8   CID                 676 non-null    float64
dtypes: float64(1), object(8)
memory usage: 52.8+ KB
