# ChEMBL Dataset

In [46]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
import kagglehub
import os
import shutil

## Kaggle API

### ChEMBL Datasets in Kaggle

In [13]:
# Shell escape: ask the Kaggle CLI to list datasets matching the search term 'chembl'.
!kaggle datasets list -s 'chembl'

ref                                                                title                                                 size  lastUpdated                 downloadCount  voteCount  usabilityRating  
-----------------------------------------------------------------  ----------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
bigquery/ebi-chembl                                                ChEMBL EBI Small Molecules Database                      0  2019-02-12 00:36:10.450000              0        100  0.64705884       
art3mis/chembl22                                                   Drug Design with Small Molecule SMILES            26097905  2022-02-26 15:21:54.670000           1358         32  1.0              
gauravan/human-acetylcholinesterase-dataset-from-chembl            Human acetylcholinesterase Dataset from ChEMBL      762208  2022-06-14 03:25:24.290000            154         25  0.7647059        
xiaot

### We will use the art3mis/chembl22 dataset

In [None]:
# Fetch the latest version of the dataset through kagglehub and keep the
# returned local directory in `path` for the copy step that follows.
dataset_handle = "art3mis/chembl22"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/art3mis/chembl22?dataset_version_number=2...


100%|██████████| 24.9M/24.9M [00:02<00:00, 9.60MB/s]

Extracting files...





Path to dataset files: C:\Users\ary-r\.cache\kagglehub\datasets\art3mis\chembl22\versions\2


In [None]:
# Directory returned by the download step,
# e.g. "C:\\Users\\ary-r\\.cache\\kagglehub\\datasets\\art3mis\\chembl22\\versions\\2"
source_dir = path

# Destination directory: a "dataset" folder inside the current working directory
target_dir = os.path.join(os.getcwd(), "dataset")

# If the destination already exists, delete it (and everything in it) so the
# copy below starts from a clean, non-existent target
if os.path.exists(target_dir):
    shutil.rmtree(target_dir)

# Copy the whole dataset tree into the destination folder
shutil.copytree(source_dir, target_dir)
print(f"Dataset copiado para: {target_dir}")


Dataset copiado para: c:\Users\ary-r\Documents\Programação\Data Science\Kaggle\dataset


## SMILES to CSV, Sorted by SMILES Length

In [None]:
# Read the .smi file using tab as the delimiter.
# Since the file has no header, we specify header=None and assign column names.
# NOTE: "ChEBML_ID" (sic) is kept as the column name because the cells below
# index the frame with exactly this spelling.
df = pd.read_csv(
    "dataset/chembl_22_clean_1576904_sorted_std_final.smi",
    delimiter="\t",
    header=None,
    names=["SMILES", "ChEBML_ID"],
)

# Reorder columns so that the first column is ChEBML_ID and the second is SMILES.
# The label must match the name given to read_csv above; selecting a column
# under a different spelling raises KeyError.
df = df[["ChEBML_ID", "SMILES"]]

# Sort the DataFrame by the length of the SMILES strings.
df = df.sort_values(by="SMILES", key=lambda x: x.str.len())

# Save the DataFrame to a CSV file without including the index.
df.to_csv("chembl22.csv", index=False)

df.head()

Unnamed: 0,ChEBML_ID,SMILES
1,CHEMBL17564,C
2,CHEMBL14688,CO
5,CHEMBL43280,CN
11,CHEMBL116838,CF
12,CHEMBL135626,CC


## Clean the dataset (NaN and Duplicates)

In [48]:
# Count missing values per column.
df.isna().sum()

ChEBML_ID    0
SMILES       0
dtype: int64

In [49]:
# Number of rows whose ID repeats an earlier row's ID.
df.duplicated(subset="ChEBML_ID").sum()

np.int64(0)

In [50]:
# Number of rows whose SMILES string repeats an earlier row's SMILES.
df.duplicated(subset="SMILES").sum()

np.int64(73232)

In [51]:
# Keep only the first occurrence of each SMILES string.
df = df[~df["SMILES"].duplicated()]

In [52]:
# Display the frame after the duplicate-SMILES rows were dropped.
df

Unnamed: 0,ChEBML_ID,SMILES
1,CHEMBL17564,C
2,CHEMBL14688,CO
5,CHEMBL43280,CN
11,CHEMBL116838,CF
12,CHEMBL135626,CC
...,...,...
1576898,CHEMBL1631334,CN(C)P(=O)(OCC1CN(CC(O1)n1cnc2c1NC(N)=NC2=O)P(...
1576899,CHEMBL1077161,CC1=CN(C2CC(OP(O)(=O)OCC3OC(C(O)C3OP(O)(=O)OCC...
1576901,CHEMBL1077165,n1(cnc2c1N=C(N)NC2=O)C1OC(COP(O)(=O)OC2C(COP(O...
1576902,CHEMBL1077164,CC1=CN(C2CC(OP(O)(=O)OCC3OC(C(O)C3OP(O)(=O)OCC...


In [53]:
# Re-count repeated SMILES strings; zero means the de-duplication worked.
df.duplicated(subset="SMILES").sum()

np.int64(0)

## Function to compute descriptors from a SMILES string

In [54]:
def compute_descriptors(smiles):
    """Compute a fixed set of RDKit descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule.

    Returns
    -------
    pandas.Series
        Indexed by "MW", "log_p", "H_donors", "H_acceptors", "NHA", "TPSA".
        Every value is None when ``smiles`` is not a string or RDKit cannot
        convert it to a molecule.
    """
    descriptor_names = ["MW", "log_p", "H_donors", "H_acceptors", "NHA", "TPSA"]

    # Only hand real strings to RDKit: missing values (None / NaN) are treated
    # like unparseable SMILES instead of being passed to MolFromSmiles.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

    # If conversion fails, return None for all descriptors
    if mol is None:
        return pd.Series([None] * len(descriptor_names), index=descriptor_names)

    # NOTE(review): ExactMolWt is the monoisotopic mass; Descriptors.MolWt is
    # the average molecular weight. Confirm which one is intended for "MW".
    values = [
        Descriptors.ExactMolWt(mol),              # Molecular weight (exact mass)
        Descriptors.MolLogP(mol),                 # Log P
        Descriptors.NumHDonors(mol),              # Number of hydrogen-bond donors
        Descriptors.NumHAcceptors(mol),           # Number of hydrogen-bond acceptors
        rdMolDescriptors.CalcNumHeavyAtoms(mol),  # Number of heavy atoms
        Descriptors.TPSA(mol),                    # Topological polar surface area
    ]
    return pd.Series(values, index=descriptor_names)

In [55]:
# Apply the descriptor computation function to every SMILES string.
df_descriptors = df["SMILES"].apply(compute_descriptors)

# Concatenate the descriptors to the DataFrame. Descriptor columns left over
# from an earlier run of this cell are dropped first, so re-running the cell
# replaces them instead of appending a second, duplicate set of columns.
df = pd.concat(
    [df.drop(columns=df_descriptors.columns, errors="ignore"), df_descriptors],
    axis=1,
)

[15:42:41] Explicit valence for atom # 1 N, 6, is greater than permitted
[15:42:41] Explicit valence for atom # 3 O, 3, is greater than permitted
[15:42:41] Explicit valence for atom # 3 O, 3, is greater than permitted
[15:42:42] Explicit valence for atom # 3 O, 3, is greater than permitted
[15:42:42] Explicit valence for atom # 4 O, 3, is greater than permitted
[15:42:42] Explicit valence for atom # 4 O, 3, is greater than permitted
[15:42:42] Explicit valence for atom # 4 O, 3, is greater than permitted
[15:42:42] Explicit valence for atom # 3 O, 3, is greater than permitted
[15:42:42] Explicit valence for atom # 5 O, 3, is greater than permitted
[15:42:42] Explicit valence for atom # 4 O, 3, is greater than permitted
[15:42:43] Explicit valence for atom # 5 O, 3, is greater than permitted
[15:42:43] Explicit valence for atom # 7 O, 3, is greater than permitted
[15:42:43] Explicit valence for atom # 5 O, 3, is greater than permitted
[15:42:44] Explicit valence for atom # 5 O, 3, is g

## Lipinski Rule of 5

In [56]:
def lipinski_pass(row):
    """Check Lipinski's criteria for one row of descriptors.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys "MW", "log_p", "H_donors" and "H_acceptors".

    Returns
    -------
    int
        1 if at least three of the four criteria are met, else 0.
        A missing descriptor (None, NaN or pd.NA) never counts as met.
    """
    # Inclusive upper bound for each descriptor.
    limits = {"MW": 500, "log_p": 5, "H_donors": 5, "H_acceptors": 10}

    # pd.notna recognises None as well as NaN / pd.NA, which is how missing
    # descriptors are actually stored once they are inside a DataFrame; a plain
    # `is not None` test lets those through to the comparison.
    count = sum(
        1
        for name, limit in limits.items()
        if pd.notna(row[name]) and row[name] <= limit
    )

    # Return 1 if at least three criteria are met, else 0
    return 1 if count >= 3 else 0

In [57]:
# Evaluate the Lipinski check row by row and store the 0/1 result
# in a new 'Lipinski' column.
lipinski_flags = df.apply(lipinski_pass, axis=1)
df["Lipinski"] = lipinski_flags

# Preview the first rows to confirm the column was added.
df.head()

Unnamed: 0,ChEBML_ID,SMILES,MW,log_p,H_donors,H_acceptors,NHA,TPSA,Lipinski
1,CHEMBL17564,C,16.0313,0.6361,0.0,0.0,1.0,0.0,1
2,CHEMBL14688,CO,32.026215,-0.3915,1.0,1.0,2.0,20.23,1
5,CHEMBL43280,CN,31.042199,-0.4251,1.0,1.0,2.0,26.02,1
11,CHEMBL116838,CF,34.021878,0.5857,0.0,0.0,2.0,0.0,1
12,CHEMBL135626,CC,30.04695,1.0262,0.0,0.0,2.0,0.0,1


In [58]:
# Count missing values per column now that the descriptor columns exist.
df.isna().sum()

ChEBML_ID         0
SMILES            0
MW             1133
log_p          1133
H_donors       1133
H_acceptors    1133
NHA            1133
TPSA           1133
Lipinski          0
dtype: int64

In [59]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

In [60]:
# Confirm that no missing values remain in any column.
df.isna().sum()

ChEBML_ID      0
SMILES         0
MW             0
log_p          0
H_donors       0
H_acceptors    0
NHA            0
TPSA           0
Lipinski       0
dtype: int64

In [62]:
# Write the cleaned frame, including the Lipinski flag, to CSV without the index.
output_csv = "chembl22_lipinski.csv"
df.to_csv(output_csv, index=False)