### This notebook is a prototype of the architecture behind the app Drug Predictor

#### Imports and loads

In [1]:
import os
import json
import random
import numpy as np
import pandas as pd
import pubchempy as pcp
from rdkit import Chem
from compute_fp_note import Compute_FP
from tensorflow.keras.models import load_model

Obtaining a random dataset

In [43]:
# Sample 10 random PubChem CIDs. The range starts at 1 because 0 is not a usable
# CID here: it is the "no cid" sentinel used when the output URLs are built.
list_of_cids = [random.randrange(1, 17000000) for _ in range(10)]
print(list_of_cids)

[10252613, 11312089, 3626475, 8160293, 747765, 1284872, 6343964, 11259732, 3587638, 2169831]


In [44]:
# Wrap the sampled CIDs in a one-column frame; the column name 'cid' is what
# the later cells check for.
cids = pd.DataFrame({'cid': list_of_cids})

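Alternatively, load the dataset from a file. Run either this cell or the random-dataset cells above, not both: each of them assigns `cids`, so the last one executed wins.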

In [30]:
# The later cells need either a 'cid' column (SMILES are then looked up) or a
# ready-made 'smiles' column in this file.
input_csv_path = os.path.join('..', 'data', '09_examples', 'ht_input_smiles.csv')
cids = pd.read_csv(input_csv_path)

Loading model, selected fingerprints and code dictionary

In [45]:
# The first line of the file is the fingerprint selection that get_fp later
# hands to Compute_FP.relate_fp_functions.
with open(os.path.join('..', 'data', '05_model_input', 'selected_fp.txt')) as fp_file:
    # readline() keeps the line terminator; strip it so the value does not
    # depend on whether the file happens to end with a newline.
    selected_fp = fp_file.readline().strip()

In [46]:
# Keras model used further down to produce the class probabilities.
model_path = os.path.join('..', 'data', '06_models', 'def_model.hd5')
model = load_model(model_path)



In [47]:
# Dictionary read from JSON; it is used later to map a predicted class code
# (as a string key) to its description.
code_dict_path = os.path.join('..', 'data', '03_primary', 'code_to_label_dic.json')
with open(code_dict_path, 'r') as json_file:
    class_codes_dict = json.load(json_file)

### Functions

In [48]:
def get_smiles(cid: int):
    """
    Function that obtains the SMILES of a molecule from its cid.
    Args: cid number integer.
    Output: SMILES string, or None if the lookup fails for any reason.
    """
    try:
        compound = pcp.Compound.from_cid(cid)
        return compound.isomeric_smiles
    except Exception:
        # Best-effort lookup: a failed request is reported and mapped to None.
        # Catching Exception instead of using a bare except lets
        # KeyboardInterrupt / SystemExit propagate, so a long batch of lookups
        # can still be interrupted.
        print('no smiles found')
        return None

In [49]:
def get_rdkit_molecule(smiles: str):
    """
    Function that obtains the RDKit molecule object of a molecule from its SMILES.
    Args: SMILES string.
    Output: RDKit molecule object, or None if no molecule could be built.
    """
    # A missing SMILES (e.g. None coming from a failed lookup) cannot be parsed.
    if not isinstance(smiles, str):
        print('no molecule found')
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed.
        print('no molecule found')
        return None
    # RDKit reports an unparsable SMILES by returning None instead of raising,
    # so that case has to be checked explicitly to be reported.
    if mol is None:
        print('no molecule found')
    return mol

In [50]:
computer = Compute_FP()
def get_fp(mol):
    """
    Function that obtains the fingerprints of a molecule.
    It delegates to relate_fp_functions of the Compute_FP instance `computer`
    (imported from compute_fp_note), passing the module-level `selected_fp`
    value together with the molecule.
    Arg: an RDKit molecule object.
    Output: whatever relate_fp_functions returns (expected to be a numpy array).
    """
    fingerprint = computer.relate_fp_functions(selected_fp, mol)
    return fingerprint

### Obtaining molecule fingerprints
A dataframe is built with the SMILES, the RDKit molecule object and the fingerprints of each molecule.

In [51]:
# SMILES are only looked up when the input provides CIDs; otherwise the frame
# must already carry a 'smiles' column.
if 'cid' in cids.columns:
    smiles_column = cids['cid'].map(get_smiles)
    cids['smiles'] = smiles_column
# Each step feeds the next: SMILES -> RDKit molecule -> fingerprints.
molecule_column = cids['smiles'].map(get_rdkit_molecule)
cids['molecule'] = molecule_column
cids['fingerprints'] = molecule_column.map(get_fp)

In [52]:
# Display the frame with the 'smiles', 'molecule' and 'fingerprints' columns added above.
cids

Unnamed: 0,cid,smiles,molecule,fingerprints
0,10252613,CCC[C@](C1=CC2=C(C=C1)C=C(C=C2)OC)(C(=O)O)C3(C...,<rdkit.Chem.rdchem.Mol object at 0x000002270F2...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,11312089,COC1=CC=C(C=C1)N/N=C/2\CCC3=C2C=C(C=C3)OC,<rdkit.Chem.rdchem.Mol object at 0x000002270F2...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3626475,CCCCCCCCCNC(=O)C1=CC2=CC=CC=C2C(=O)O1,<rdkit.Chem.rdchem.Mol object at 0x000002270F2...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
3,8160293,C#CCNC(=O)CSC1=NN=C(N1CC2=CC=CC=C2)CN3CCCCC3,<rdkit.Chem.rdchem.Mol object at 0x000002270F2...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,747765,CC[C@@H]1[C@H](OC(=O)C(C1=O)(C)C)C2=CC(=CC=C2)...,<rdkit.Chem.rdchem.Mol object at 0x000002270F2...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,1284872,CCOC1=C(C(=CC(=C1)C(=S)N2C[C@@H](O[C@H](C2)C)C...,<rdkit.Chem.rdchem.Mol object at 0x000002270F2...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
6,6343964,C1=CC=C(C(=C1)C(=O)O[NH+]=C(C2=NON=C2N)N)I,<rdkit.Chem.rdchem.Mol object at 0x000002270F2...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,11259732,C1=CC=C(C=C1)C[N+]2=CC=CC(=C2)C(=O)NCCN(CCCl)CCCl,<rdkit.Chem.rdchem.Mol object at 0x000002270F2...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
8,3587638,CCCN(CCC)S(=O)(=O)C1=CC=C(C=C1)C(=O)NC2=CC=CC3...,<rdkit.Chem.rdchem.Mol object at 0x000002270F2...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,2169831,COCCCNC(=O)/C=C/C1=C(C=CC=C1Cl)Cl,<rdkit.Chem.rdchem.Mol object at 0x000002270F2...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


### Obtaining predictions

In [53]:
# Collect the per-molecule fingerprints into one array, then append a trailing
# axis of length 1 before handing it to the model.
fingerprints = np.array(cids['fingerprints'].tolist())
n_molecules, n_bits = fingerprints.shape[0], fingerprints.shape[1]
reshaped_fps = fingerprints.reshape(n_molecules, n_bits, 1)

In [54]:
# One row of model output per molecule: keep the index of the highest value
# as the prediction and the highest value itself as its probability.
probs = model.predict(reshaped_fps)
preds = list(map(np.argmax, probs))
max_probs = list(map(np.max, probs))



### Building output dataset
The columns `prediction`, `probability`, `description`, `url` and `cid` (if not provided in the input) are added.

In [38]:
# Attach the predicted class index and its probability to every row.
cids['prediction'], cids['probability'] = preds, max_probs

In [39]:
# The dictionary was read from JSON, so its keys are strings; cast the
# integer predictions before looking them up.
prediction_codes = cids['prediction'].astype(str)
cids['description'] = prediction_codes.map(class_codes_dict)

In [55]:
def _cid_from_smiles(smiles):
    """
    Function that looks up the PubChem cid of a molecule from its SMILES.
    Args: SMILES string.
    Output: cid as an integer, or 0 when no cid can be obtained.
    """
    try:
        compounds = pcp.get_compounds(smiles, 'smiles')
    except Exception:
        # Best effort, like get_smiles: one failed lookup must not abort the
        # computation of the whole column.
        print('no cid found')
        return 0
    # An empty result list and a record without a cid both mean "unknown".
    if not compounds or compounds[0].cid is None:
        return 0
    return int(compounds[0].cid)

if 'cid' not in cids.columns:
    # 0 is the sentinel for "no cid"; the url column maps it to None.
    cids['cid'] = cids['smiles'].apply(_cid_from_smiles)

In [22]:
def _compound_url(x):
    """Return the PubChem compound page for cid x, or None for the 0 sentinel."""
    if x == 0:
        return None
    return f'https://pubchem.ncbi.nlm.nih.gov/compound/{x}'

cids['url'] = cids['cid'].apply(_compound_url)

In [23]:
# Keep only the columns that make up the final output, in display order.
output_columns = ['cid', 'prediction', 'probability', 'description', 'url']
output_df = cids[output_columns]

In [24]:
# Display the final output table (cid, prediction, probability, description, url).
output_df

Unnamed: 0,cid,prediction,probability,description,url
0,10529429,0,0.667142,Alimentary tract and metabolism,https://pubchem.ncbi.nlm.nih.gov/compound/1052...
1,6460467,10,0.797661,Nervous system,https://pubchem.ncbi.nlm.nih.gov/compound/6460467
2,11594294,10,0.462228,Nervous system,https://pubchem.ncbi.nlm.nih.gov/compound/1159...
3,6306287,10,0.998569,Nervous system,https://pubchem.ncbi.nlm.nih.gov/compound/6306287
4,12260926,10,0.657525,Nervous system,https://pubchem.ncbi.nlm.nih.gov/compound/1226...
5,5730620,10,0.590146,Nervous system,https://pubchem.ncbi.nlm.nih.gov/compound/5730620
6,1357894,10,0.567083,Nervous system,https://pubchem.ncbi.nlm.nih.gov/compound/1357894
7,3209631,10,0.865813,Nervous system,https://pubchem.ncbi.nlm.nih.gov/compound/3209631
8,4490953,10,0.469757,Nervous system,https://pubchem.ncbi.nlm.nih.gov/compound/4490953
9,12845338,10,0.945368,Nervous system,https://pubchem.ncbi.nlm.nih.gov/compound/1284...
