In [1]:
! pip install chembl_webresource_client



In [2]:
! pip install rdkit



In [3]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
import numpy as np
import sys
sys.path.append('/usr/local/lib/python3.10.5/site-packages/')

from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

In [4]:
tg = input("target of interest : ")
org = input("organism of interest : ")

In [5]:
# Target search for coronavirus
target = new_client.target
target_query = target.search(tg)
targets = pd.DataFrame.from_dict(target_query)
selected_target = targets.target_chembl_id[3]
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type='IC50')
df = pd.DataFrame.from_dict(res)

In [6]:
df2 = df[df.standard_value.notna() & df.canonical_smiles.notna()]
df2_nr = df2.drop_duplicates(['canonical_smiles'])

In [7]:
selection = ['molecule_chembl_id','canonical_smiles','standard_value' ]
df3 = df2_nr[selection]

In [8]:
df3.to_csv(tg+'_02_bioactivity_data_preprocessed.csv', index=False)

In [9]:
df4 = pd.read_csv(tg+'_02_bioactivity_data_preprocessed.csv')

In [10]:
bioactivity_threshold = []
for i in df4.standard_value:
  if float(i) >= 10000:
    bioactivity_threshold.append("inactive")
  elif float(i) <= 1000:
    bioactivity_threshold.append("active")
  else:
    bioactivity_threshold.append("intermediate")

mol_cid = []
for i in df4.molecule_chembl_id:
  mol_cid.append(i)

canonical_smiles = []
for i in df4.canonical_smiles:
  canonical_smiles.append(i)
  
standard_value = []
for i in df4.standard_value:
  standard_value.append(i)

In [11]:
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df4[selection]

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL269775,CC(CN1CCCCC1)OC(=O)[C@@H]1CCCN1C(=O)C(=O)C(C)(C)C,10000.000
1,CHEMBL6683,CC(CN1CCCCC1)OC(=O)[C@@H]1CCCN1C(=O)C(=O)c1ccccc1,10000.000
2,CHEMBL7237,CC(C)CC(=O)C(=O)N1CCC[C@H]1C(=O)OCCCc1cccnc1,10000.000
3,CHEMBL269732,C=CC[C@@H]1/C=C(\C)C[C@H](C)C[C@H](OC)[C@H]2O[...,3.162
4,CHEMBL266660,CC(C)CC(=O)C(=O)N1CCC[C@H]1C(=O)OCCS(=O)(=O)c1...,10000.000
...,...,...,...
4271,CHEMBL4778112,O=C(c1ccc2nc(Nc3ccc(-c4nc(N5CCOCC5)nc(N5CCOCC5...,12900.000
4272,CHEMBL4788677,CN1CCN(C(=O)c2ccc3nc(Nc4ccc(-c5nc(N6CCOCC6)nc(...,13100.000
4273,CHEMBL4782217,CS(=O)(=O)N1CCN(C(=O)c2ccc3nc(Nc4ccc(-c5nc(N6C...,11900.000
4274,CHEMBL4778200,CN(C)C1CCN(C(=O)c2cccc(NC(=O)Nc3ccc(-c4nc(N5CC...,2100.000


In [12]:
df4.to_csv(tg + '_03_bioactivity_data_preprocessed.csv', index=False)

In [13]:
df5 = pd.read_csv('mTOR_03_bioactivity_data_preprocessed.csv')

In [14]:
bioactivity_class = pd.Series(bioactivity_threshold, name='class')
df5 = pd.concat([df4, bioactivity_class], axis=1)

In [15]:
df_no_smiles = df5.drop(columns='canonical_smiles')

In [16]:
smiles = []

for i in df5.canonical_smiles.tolist():
  cpd = str(i).split('.')
  cpd_longest = max(cpd, key = len)
  smiles.append(cpd_longest)

In [17]:
# Inspired by: https://codeocean.com/explore/capsules?query=tag:data-curation

def lipinski(smiles, verbose=False):

    moldata= []
    for elem in smiles:
        mol=Chem.MolFromSmiles(elem) 
        moldata.append(mol)
       
    baseData= np.arange(1,1)
    i=0  
    for mol in moldata:        
       
        desc_MolWt = Descriptors.MolWt(mol)
        desc_MolLogP = Descriptors.MolLogP(mol)
        desc_NumHDonors = Lipinski.NumHDonors(mol)
        desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)
           
        row = np.array([desc_MolWt,
                        desc_MolLogP,
                        desc_NumHDonors,
                        desc_NumHAcceptors])   
    
        if(i==0):
            baseData=row
        else:
            baseData=np.vstack([baseData, row])
        i=i+1      
    
    columnNames=["MW","LogP","NumHDonors","NumHAcceptors"]   
    descriptors = pd.DataFrame(data=baseData,columns=columnNames)
    
    return descriptors

In [18]:
def pIC50(input):
    pIC50 = []

    for i in input['standard_value_norm']:
        molar = i*(10**-9) # Converts nM to M
        pIC50.append(-np.log10(molar))

    input['pIC50'] = pIC50
    x = input.drop('standard_value_norm', axis=1)
        
    return x

def norm_value(input):
    norm = []

    for i in input['standard_value']:
        if i > 100000000:
          i = 100000000
        norm.append(i)

    input['standard_value_norm'] = norm
    x = input.drop('standard_value', axis=1)
        
    return x

In [19]:
smiles = pd.Series(smiles, name = 'canonical_smiles')
df_clean_smiles = pd.concat([df_no_smiles,smiles], axis=1)
df_lipinski = lipinski(df_clean_smiles.canonical_smiles)

In [20]:
df_combined = pd.concat([df5,df_lipinski], axis=1)
df_norm = norm_value(df_combined)
df_final = pIC50(df_norm)

In [25]:
!curl -O https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip
!curl -O https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0


In [22]:
selection = ['canonical_smiles','molecule_chembl_id']
df3_selection = df_final[selection]
df3_selection.to_csv('molecule.smi', sep='\t', index=False, header=False)

In [26]:
! cat padel.sh

'cat' is not recognized as an internal or external command,
operable program or batch file.


In [24]:
! bash padel.sh

'bash' is not recognized as an internal or external command,
operable program or batch file.
