In [1]:
!git clone https://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@github.com/MinhHieu-Nguyen-dn/diabetes_active_proteins.git

Cloning into 'diabetes_active_proteins'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 28 (delta 4), reused 25 (delta 2), pack-reused 0[K
Receiving objects: 100% (28/28), 47.62 KiB | 4.33 MiB/s, done.
Resolving deltas: 100% (4/4), done.


In [6]:
!pip install -q rdkit

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
# Enter the cloned repository. The guard keeps this cell re-runnable: once the
# kernel is already inside the repository, the relative path no longer resolves.
if os.path.basename(os.getcwd()) != 'diabetes_active_proteins':
    os.chdir('diabetes_active_proteins')

In [3]:
# Show the entries of the current working directory.
os.listdir(os.curdir)

['.gitignore',
 'data',
 'README.md',
 'requirements.txt',
 'get_data.py',
 '.git',
 'get_data_notebook.ipynb']

In [20]:
import math
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys, rdFingerprintGenerator

In [4]:
def convert_ic50_to_pic50(IC50_value):
    """
    Convert an IC50 value into the corresponding pIC50 value.

    The conversion is ``pIC50 = 9 - log10(IC50)``; the offset of 9 means the
    input is expected in nanomolar (nM).

    Parameters
    ----------
    IC50_value : float
        The IC50 value, in nM. Must be strictly positive.

    Returns
    -------
    float
        The pIC50 value. A NaN input yields NaN.

    Raises
    ------
    ValueError
        If `IC50_value` is zero or negative, for which the logarithm is undefined.
    """
    # Fail with an explicit message instead of the bare "math domain error".
    # NaN compares False here and is passed through to math.log10 unchanged.
    if IC50_value <= 0:
        raise ValueError(f"IC50 must be strictly positive, got {IC50_value!r}")
    return 9 - math.log10(IC50_value)

In [14]:
def smiles_to_fp(smiles, method="maccs", n_bits=2048):
    """
    Encode a molecule from a SMILES string into a fingerprint.

    Parameters
    ----------
    smiles : str
        The SMILES string defining the molecule.

    method : str
        The type of fingerprint to use: "maccs" (default), "morgan2"
        (Morgan fingerprint, radius 2) or "morgan3" (Morgan fingerprint,
        radius 3). Any other value prints a warning and falls back to
        MACCS keys.

    n_bits : int
        The length of the Morgan fingerprint. Not used for MACCS keys,
        which do not take a size argument.

    Returns
    -------
    array
        The fingerprint as a NumPy array.

    Raises
    ------
    ValueError
        If RDKit cannot parse `smiles` into a molecule.

    """

    # convert smiles to RDKit mol object; MolFromSmiles returns None when parsing fails
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # Both Morgan variants differ only in the radius passed to the generator.
    if method in ("morgan2", "morgan3"):
        radius = 2 if method == "morgan2" else 3
        fpg = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
        return np.array(fpg.GetFingerprint(mol))

    if method != "maccs":
        # NBVAL_CHECK_OUTPUT
        print(f"Warning: Wrong method specified: {method}. Default will be used instead.")
    return np.array(MACCSkeys.GenMACCSKeys(mol))

In [None]:
# Pick the target by its UniProt ID and read its compound table from the
# repository's data directory; the first CSV column is used as the index.
uniprot_id = 'P15121'
file_path = os.path.join('data', f'{uniprot_id}.csv')
df = pd.read_csv(file_path, index_col=0)

In [None]:
# Encode every SMILES string with smiles_to_fp (default arguments) and store the
# resulting arrays in a new "fp" column.
df["fp"] = df["smiles"].map(smiles_to_fp)

In [None]:
# Convert each compound's IC50 to pIC50. Applying the function to the IC50 column
# directly avoids materialising a full row object per compound, which a row-wise
# DataFrame.apply(axis=1) would do.
df["pIC50"] = df["IC50"].apply(convert_ic50_to_pic50)

In [22]:
# Create active column: 1 when pIC50 is above the cutoff, 0 otherwise.
# Given pIC50 = 9 - log10(IC50), a cutoff of 6 corresponds to IC50 = 1000.
PIC50_ACTIVE_CUTOFF = 6
# Vectorized comparison; NaN compares False and is therefore labelled 0.
df["active"] = (df["pIC50"] > PIC50_ACTIVE_CUTOFF).astype("int64")

In [23]:
# Display the compound table with the added fp, pIC50 and active columns.
df

Unnamed: 0,molecule_chembl_id,IC50,units,smiles,fp,pIC50,active
0,CHEMBL18854,230.0,nM,CCCCCCS(=O)(=O)c1ccc(Cl)cc1C1NC(=O)NC1=O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6.638272,1
1,CHEMBL19744,130.0,nM,CNS(=O)(=O)c1ccc(Cl)cc1C1NC(=O)NC1=O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6.886057,1
2,CHEMBL19711,100.0,nM,O=C1NC(=O)C(c2cc(Cl)ccc2S(=O)(=O)NCCCc2ccccc2)N1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7.000000,1
3,CHEMBL19392,5630.0,nM,O=C1NC(=O)C(c2cc(Cl)ccc2S(=O)(=O)NCCCCc2ccccc2)N1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5.249492,0
4,CHEMBL19746,7470.0,nM,COc1ccc(F)cc1C1NC(=O)NC1=O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5.126679,0
...,...,...,...,...,...,...,...
779,CHEMBL4761144,61700.0,nM,CC(C)C[C@H](NC(=O)c1cnc2c(=O)[nH]c(N)nc2n1)C(=O)O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.209715,0
780,CHEMBL4757375,83400.0,nM,CC(C)[C@H](NC(=O)c1cnc2c(=O)[nH]c(N)nc2n1)C(=O)O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.078834,0
781,CHEMBL4744462,12400.0,nM,C[C@H](NC(=O)c1cnc2c(=O)[nH]c(N)nc2n1)C(=O)O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.906578,0
782,CHEMBL4777280,1970.0,nM,Nc1nc2nc(C(=O)NCC(=O)O)cnc2c(=O)[nH]1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5.705534,0
