In [3]:
!pip install rdkit pandas scikit-learn numpy

Collecting rdkit
  Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.3


In [4]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
data = pd.read_csv("/content/drive/MyDrive/broad_hackathon/data/HDAC_data_processed.csv")

In [17]:
data

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL323869,O=C(CCCCCC(NC(=O)OCc1ccccc1)C(=O)Nc1cccc2cccnc...,2.5
1,CHEMBL327146,O=C(CCCCCC(C(=O)Nc1ccc2ncccc2c1)C(=O)Nc1ccc2nc...,1.0
2,CHEMBL116620,O=C(/C=C/c1cccc(C(C(=O)Nc2ccccc2)C(=O)Nc2ccccc...,1.0
3,CHEMBL98,O=C(CCCCCCC(=O)Nc1ccccc1)NO,200.0
4,CHEMBL346414,Cc1ccc(NS(=O)(=O)c2ccc(/C=C/C(=O)Nc3ccccc3N)cc...,2000.0
...,...,...,...
8877,CHEMBL5432574,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CC23C[C@H]4C[C@@H]...,192.3
8878,CHEMBL5436873,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CNC(=O)CC23C[C@H]4...,132.6
8879,CHEMBL5434895,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CNC(=O)CC2c3ccccc3...,86.4
8880,CHEMBL5414971,O=C(CCCCCCNC(=O)c1ccc(-c2cn(CCCCCCOCCOCCNC(=O)...,349.9


In [7]:
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']

data = data[selection]
data.head()


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL323869,O=C(CCCCCC(NC(=O)OCc1ccccc1)C(=O)Nc1cccc2cccnc...,2.5
1,CHEMBL327146,O=C(CCCCCC(C(=O)Nc1ccc2ncccc2c1)C(=O)Nc1ccc2nc...,1.0
2,CHEMBL116620,O=C(/C=C/c1cccc(C(C(=O)Nc2ccccc2)C(=O)Nc2ccccc...,1.0
3,CHEMBL98,O=C(CCCCCCC(=O)Nc1ccccc1)NO,200.0
4,CHEMBL346414,Cc1ccc(NS(=O)(=O)c2ccc(/C=C/C(=O)Nc3ccccc3N)cc...,2000.0


In [18]:
def lipinski(smiles, verbose=False):
    moldata= []

    for elem in smiles:
        mol=Chem.MolFromSmiles(elem)
        moldata.append(mol)

    baseData=np.arange(1,1)
    i=0
    for mol in moldata:

        desc_MolWt = Descriptors.MolWt(mol)
        desc_MolLogP = Descriptors.MolLogP(mol)
        desc_NumHDonors = Lipinski.NumHDonors(mol)
        desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)

        row = np.array([ desc_MolWt,
                        desc_MolLogP,
                        desc_NumHDonors,
                        desc_NumHAcceptors])

        if(i==0):
            baseData=row
        else:
            baseData=np.vstack([baseData, row])
        i=i+1

    columnNames=["MW", "LogP", "NumHDonors", "NumHAcceptors"]
    descriptors = pd.DataFrame(data=baseData, columns=columnNames)

    return descriptors

In [19]:
data['canonical_smiles'] = data['canonical_smiles'].astype(str)
data = data[data['canonical_smiles'].notna() & (data['canonical_smiles'] != 'nan')]
data.shape

(8874, 3)

In [20]:
lipinski_df = lipinski(data.canonical_smiles)
lipinski_df

Unnamed: 0,MW,LogP,NumHDonors,NumHAcceptors
0,464.522,3.92420,4.0,6.0
1,485.544,4.43230,4.0,6.0
2,415.449,3.56620,4.0,4.0
3,264.325,2.47110,3.0,3.0
4,407.495,4.02992,3.0,4.0
...,...,...,...,...
8869,455.599,4.41730,4.0,4.0
8870,512.651,3.53350,5.0,5.0
8871,542.636,4.12960,5.0,5.0
8872,694.918,5.44710,4.0,9.0


In [24]:
df_combined = pd.concat([data.reset_index(drop=True), lipinski_df.reset_index(drop=True)], axis=1)
df_combined

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL323869,O=C(CCCCCC(NC(=O)OCc1ccccc1)C(=O)Nc1cccc2cccnc...,2.5,464.522,3.92420,4.0,6.0
1,CHEMBL327146,O=C(CCCCCC(C(=O)Nc1ccc2ncccc2c1)C(=O)Nc1ccc2nc...,1.0,485.544,4.43230,4.0,6.0
2,CHEMBL116620,O=C(/C=C/c1cccc(C(C(=O)Nc2ccccc2)C(=O)Nc2ccccc...,1.0,415.449,3.56620,4.0,4.0
3,CHEMBL98,O=C(CCCCCCC(=O)Nc1ccccc1)NO,200.0,264.325,2.47110,3.0,3.0
4,CHEMBL346414,Cc1ccc(NS(=O)(=O)c2ccc(/C=C/C(=O)Nc3ccccc3N)cc...,2000.0,407.495,4.02992,3.0,4.0
...,...,...,...,...,...,...,...
8869,CHEMBL5432574,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CC23C[C@H]4C[C@@H]...,192.3,455.599,4.41730,4.0,4.0
8870,CHEMBL5436873,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CNC(=O)CC23C[C@H]4...,132.6,512.651,3.53350,5.0,5.0
8871,CHEMBL5434895,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CNC(=O)CC2c3ccccc3...,86.4,542.636,4.12960,5.0,5.0
8872,CHEMBL5414971,O=C(CCCCCCNC(=O)c1ccc(-c2cn(CCCCCCOCCOCCNC(=O)...,349.9,694.918,5.44710,4.0,9.0


In [26]:
df_combined.standard_value.describe()

Unnamed: 0,standard_value
count,8874.0
mean,7747.383
std,59218.08
min,0.0
25%,40.0
50%,260.0
75%,2757.5
max,4100000.0


In [25]:
def pIC50(input):
    pIC50 = []

    for i in input['standard_value_norm']:
        molar = i*(10**-9) #converting from nM to M
        pIC50.append(-np.log10(molar))

    input['pIC50'] = pIC50
    x = input.drop("standard_value_norm", axis=1)

    return x

In [27]:
def norm_value(input):
    norm = []

    for i in input['standard_value']:
        if i > 100_000_000:
            i = 100_000_000
        norm.append(i)

    input['standard_value_norm'] = norm
    x = input.drop('standard_value', axis=1)

    return x

In [33]:
df_norm = norm_value(df_combined)
df_norm

Unnamed: 0,molecule_chembl_id,canonical_smiles,MW,LogP,NumHDonors,NumHAcceptors,standard_value_norm
0,CHEMBL323869,O=C(CCCCCC(NC(=O)OCc1ccccc1)C(=O)Nc1cccc2cccnc...,464.522,3.92420,4.0,6.0,2.5
1,CHEMBL327146,O=C(CCCCCC(C(=O)Nc1ccc2ncccc2c1)C(=O)Nc1ccc2nc...,485.544,4.43230,4.0,6.0,1.0
2,CHEMBL116620,O=C(/C=C/c1cccc(C(C(=O)Nc2ccccc2)C(=O)Nc2ccccc...,415.449,3.56620,4.0,4.0,1.0
3,CHEMBL98,O=C(CCCCCCC(=O)Nc1ccccc1)NO,264.325,2.47110,3.0,3.0,200.0
4,CHEMBL346414,Cc1ccc(NS(=O)(=O)c2ccc(/C=C/C(=O)Nc3ccccc3N)cc...,407.495,4.02992,3.0,4.0,2000.0
...,...,...,...,...,...,...,...
8869,CHEMBL5432574,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CC23C[C@H]4C[C@@H]...,455.599,4.41730,4.0,4.0,192.3
8870,CHEMBL5436873,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CNC(=O)CC23C[C@H]4...,512.651,3.53350,5.0,5.0,132.6
8871,CHEMBL5434895,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CNC(=O)CC2c3ccccc3...,542.636,4.12960,5.0,5.0,86.4
8872,CHEMBL5414971,O=C(CCCCCCNC(=O)c1ccc(-c2cn(CCCCCCOCCOCCNC(=O)...,694.918,5.44710,4.0,9.0,349.9


In [38]:
df_norm = df_norm[df_norm["standard_value_norm"].notna()]
df_norm = df_norm[df_norm["standard_value_norm"] != 0]
df_norm

Unnamed: 0,molecule_chembl_id,canonical_smiles,MW,LogP,NumHDonors,NumHAcceptors,standard_value_norm
0,CHEMBL323869,O=C(CCCCCC(NC(=O)OCc1ccccc1)C(=O)Nc1cccc2cccnc...,464.522,3.92420,4.0,6.0,2.5
1,CHEMBL327146,O=C(CCCCCC(C(=O)Nc1ccc2ncccc2c1)C(=O)Nc1ccc2nc...,485.544,4.43230,4.0,6.0,1.0
2,CHEMBL116620,O=C(/C=C/c1cccc(C(C(=O)Nc2ccccc2)C(=O)Nc2ccccc...,415.449,3.56620,4.0,4.0,1.0
3,CHEMBL98,O=C(CCCCCCC(=O)Nc1ccccc1)NO,264.325,2.47110,3.0,3.0,200.0
4,CHEMBL346414,Cc1ccc(NS(=O)(=O)c2ccc(/C=C/C(=O)Nc3ccccc3N)cc...,407.495,4.02992,3.0,4.0,2000.0
...,...,...,...,...,...,...,...
8869,CHEMBL5432574,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CC23C[C@H]4C[C@@H]...,455.599,4.41730,4.0,4.0,192.3
8870,CHEMBL5436873,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CNC(=O)CC23C[C@H]4...,512.651,3.53350,5.0,5.0,132.6
8871,CHEMBL5434895,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CNC(=O)CC2c3ccccc3...,542.636,4.12960,5.0,5.0,86.4
8872,CHEMBL5414971,O=C(CCCCCCNC(=O)c1ccc(-c2cn(CCCCCCOCCOCCNC(=O)...,694.918,5.44710,4.0,9.0,349.9


In [39]:
df_norm.standard_value_norm.describe()

Unnamed: 0,standard_value_norm
count,8872.0
mean,7749.13
std,59224.64
min,0.00687
25%,40.0
50%,260.0
75%,2760.0
max,4100000.0


In [40]:
df_final = pIC50(df_norm)
df_final

Unnamed: 0,molecule_chembl_id,canonical_smiles,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL323869,O=C(CCCCCC(NC(=O)OCc1ccccc1)C(=O)Nc1cccc2cccnc...,464.522,3.92420,4.0,6.0,8.602060
1,CHEMBL327146,O=C(CCCCCC(C(=O)Nc1ccc2ncccc2c1)C(=O)Nc1ccc2nc...,485.544,4.43230,4.0,6.0,9.000000
2,CHEMBL116620,O=C(/C=C/c1cccc(C(C(=O)Nc2ccccc2)C(=O)Nc2ccccc...,415.449,3.56620,4.0,4.0,9.000000
3,CHEMBL98,O=C(CCCCCCC(=O)Nc1ccccc1)NO,264.325,2.47110,3.0,3.0,6.698970
4,CHEMBL346414,Cc1ccc(NS(=O)(=O)c2ccc(/C=C/C(=O)Nc3ccccc3N)cc...,407.495,4.02992,3.0,4.0,5.698970
...,...,...,...,...,...,...,...
8869,CHEMBL5432574,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CC23C[C@H]4C[C@@H]...,455.599,4.41730,4.0,4.0,6.716021
8870,CHEMBL5436873,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CNC(=O)CC23C[C@H]4...,512.651,3.53350,5.0,5.0,6.877456
8871,CHEMBL5434895,O=C(CCCCCCNC(=O)c1ccc(NC(=O)CNC(=O)CC2c3ccccc3...,542.636,4.12960,5.0,5.0,7.063486
8872,CHEMBL5414971,O=C(CCCCCCNC(=O)c1ccc(-c2cn(CCCCCCOCCOCCNC(=O)...,694.918,5.44710,4.0,9.0,6.456056


In [41]:
df_final.pIC50.describe()

Unnamed: 0,pIC50
count,8872.0
mean,6.522978
std,1.252905
min,2.387216
25%,5.559091
50%,6.585027
75%,7.39794
max,11.163043


In [43]:
df_final.to_csv("/content/drive/MyDrive/broad_hackathon/data/HDAC_data_pIC50.csv", index=False)

Loading in Descriptors

In [44]:
descriptors = pd.read_csv("/content/drive/MyDrive/broad_hackathon/data/descriptors_output.csv")
descriptors

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL98,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL95835,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL97387,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL95552,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL346414,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8867,CHEMBL5432574,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8868,CHEMBL5436873,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8869,CHEMBL5393987,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8870,CHEMBL5434895,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
descriptors=descriptors.drop(columns=["Name"])
descriptors

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8867,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8868,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8869,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8870,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [46]:
df_final_Y = df_final["pIC50"]
df_final_Y

Unnamed: 0,pIC50
0,8.602060
1,9.000000
2,9.000000
3,6.698970
4,5.698970
...,...
8869,6.716021
8870,6.877456
8871,7.063486
8872,6.456056


In [47]:
dataset = pd.concat([descriptors.reset_index(drop=True), df_final_Y.reset_index(drop=True)], axis=1)
dataset

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,8.602060
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,9.000000
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,9.000000
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.698970
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.698970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8867,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.716021
8868,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.877456
8869,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.063486
8870,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.456056


In [48]:
dataset.to_csv("/content/drive/MyDrive/broad_hackathon/data/HDAC_final_dataset_descriptors.csv", index=False)