# Data Collection with Papyrus

In [3]:
from qsprpred.data.sources.papyrus import Papyrus

acc_keys = ["Q12809"]
dataset_name = "hERG_Dataset"  # name of the file to be generated
quality = "high"  # choose minimum quality from {"high", "medium", "low"}
papyrus_version = "05.6"  # Papyrus database version
data_dir = "../Data"  # directory to store the Papyrus data
output_dir = "../Data"  # directory to store the generated dataset

# Create a Papyrus object, which specifies the version and directory to store the payrus data
papyrus = Papyrus(
    data_dir=data_dir,
    version=papyrus_version,
    stereo=False,
    plus_only=True,
)

# Create subset of payrus data for the given accession keys, returns a MoleculeTable
mt = papyrus.getData(
    dataset_name,
    acc_keys,
    quality,
    output_dir=output_dir,
    use_existing=True,
    activity_types=["Ki", "IC50", "Kd"]
)
mt.getDF().head()

Unnamed: 0_level_0,Activity_ID,Quality,source,CID,SMILES,connectivity,InChIKey,InChI,InChI_AuxInfo,target_id,...,Activity_class,relation,pchembl_value,pchembl_value_Mean,pchembl_value_StdDev,pchembl_value_SEM,pchembl_value_N,pchembl_value_Median,pchembl_value_MAD,QSPRID
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hERG_Dataset_00000,AACWUFIIMOHGSO_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.91968,Cc1nn(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n2)c(...,AACWUFIIMOHGSO,AACWUFIIMOHGSO-UHFFFAOYSA-N,InChI=1S/C19H24N6O2/c1-12-10-13(2)25(23-12)17-...,"""AuxInfo=1/1/N:1,26,22,14,15,20,19,11,12,27,6,...",P29274_WT,...,,=,8.68,8.68,0.0,0.0,1.0,8.68,0.0,hERG_Dataset_00000
hERG_Dataset_00001,AACWUFIIMOHGSO_on_P30542_WT,High,ChEMBL31,ChEMBL31.compound.91968,Cc1nn(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n2)c(...,AACWUFIIMOHGSO,AACWUFIIMOHGSO-UHFFFAOYSA-N,InChI=1S/C19H24N6O2/c1-12-10-13(2)25(23-12)17-...,"""AuxInfo=1/1/N:1,26,22,14,15,20,19,11,12,27,6,...",P30542_WT,...,,=,6.68,6.68,0.0,0.0,1.0,6.68,0.0,hERG_Dataset_00001
hERG_Dataset_00002,AAEYTMMNWWKSKZ_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.131451,Nc1c(C(=O)Nc2ccc([N+](=O)[O-])cc2)sc2c1cc1CCCC...,AAEYTMMNWWKSKZ,AAEYTMMNWWKSKZ-UHFFFAOYSA-N,InChI=1S/C18H16N4O3S/c19-15-13-9-10-3-1-2-4-14...,"""AuxInfo=1/1/N:22,23,21,24,8,15,9,14,19,20,7,1...",P29274_WT,...,,=,4.82,4.82,0.0,0.0,1.0,4.82,0.0,hERG_Dataset_00002
hERG_Dataset_00003,AAGFKZWKWAMJNP_on_P0DMS8_WT,High,ChEMBL31,ChEMBL31.compound.100375,O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1,AAGFKZWKWAMJNP,AAGFKZWKWAMJNP-UHFFFAOYSA-N,InChI=1S/C21H14N6O2/c28-20(14-8-3-1-4-9-14)24-...,"""AuxInfo=1/1/N:27,19,26,28,18,20,9,25,29,17,21...",P0DMS8_WT,...,,=,7.15,7.15,0.0,0.0,1.0,7.15,0.0,hERG_Dataset_00003
hERG_Dataset_00004,AAGFKZWKWAMJNP_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.100375,O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1,AAGFKZWKWAMJNP,AAGFKZWKWAMJNP-UHFFFAOYSA-N,InChI=1S/C21H14N6O2/c28-20(14-8-3-1-4-9-14)24-...,"""AuxInfo=1/1/N:27,19,26,28,18,20,9,25,29,17,21...",P29274_WT,...,,=,5.65,5.65,0.0,0.0,1.0,5.65,0.0,hERG_Dataset_00004


By default, the method returns a `MoleculeTable` so if you want to turn it into a `QSPRDataset` for modelling, you have to use the `fromMolTable` helper method. See the [data representation tutorial](data_representation.ipynb) for more details.


In [4]:
from qsprpred import TargetTasks
from qsprpred.data import QSPRDataset

target_props = [
    {"name": "pchembl_value_Median", "task": TargetTasks.SINGLECLASS, "th": [6.5]}]
ds = QSPRDataset.fromMolTable(mt, target_props=target_props)
ds.targetProperties

[TargetProperty(name=pchembl_value_Median, task=SINGLECLASS, th=[6.5])]

# Data preparation

In [15]:
# suppres numpy RuntimeWarning
import warnings

warnings.filterwarnings("ignore", category=RuntimeWarning)

In [16]:
import os

from qsprpred.data import QSPRDataset

os.makedirs("../output/data", exist_ok=True)

# Read receptor name from the text file
with open("../output/data/receptor.txt", "r") as file:
    receptor = file.read().strip()

dataset = QSPRDataset.fromTableFile(
    filename=f"../output/data/{receptor}_Dataset.tsv", 
    store_dir="../output/data",
    name=f"Preparation_{receptor}_Dataset",  
    target_props=[{"name": "pchembl_value_Mean", "task": "REGRESSION"}],
    random_state=42
)

dataset.getDF()

Unnamed: 0_level_0,Activity_ID,Quality,source,CID,SMILES,connectivity,InChIKey,InChI,InChI_AuxInfo,target_id,...,relation,pchembl_value,pchembl_value_Mean,pchembl_value_StdDev,pchembl_value_SEM,pchembl_value_N,pchembl_value_Median,pchembl_value_MAD,QSPRID,pchembl_value_Mean_original
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Preparation_hERG_Dataset_00000,AACWUFIIMOHGSO_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.91968,Cc1nn(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n2)c(...,AACWUFIIMOHGSO,AACWUFIIMOHGSO-UHFFFAOYSA-N,InChI=1S/C19H24N6O2/c1-12-10-13(2)25(23-12)17-...,"""AuxInfo=1/1/N:1,26,22,14,15,20,19,11,12,27,6,...",P29274_WT,...,=,8.68,8.680,0.000000,0.000000,1.0,8.680,0.000,Preparation_hERG_Dataset_00000,8.680
Preparation_hERG_Dataset_00001,AACWUFIIMOHGSO_on_P30542_WT,High,ChEMBL31,ChEMBL31.compound.91968,Cc1nn(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n2)c(...,AACWUFIIMOHGSO,AACWUFIIMOHGSO-UHFFFAOYSA-N,InChI=1S/C19H24N6O2/c1-12-10-13(2)25(23-12)17-...,"""AuxInfo=1/1/N:1,26,22,14,15,20,19,11,12,27,6,...",P30542_WT,...,=,6.68,6.680,0.000000,0.000000,1.0,6.680,0.000,Preparation_hERG_Dataset_00001,6.680
Preparation_hERG_Dataset_00002,AAEYTMMNWWKSKZ_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.131451,Nc1c(C(=O)Nc2ccc([N+](=O)[O-])cc2)sc2c1cc1CCCC...,AAEYTMMNWWKSKZ,AAEYTMMNWWKSKZ-UHFFFAOYSA-N,InChI=1S/C18H16N4O3S/c19-15-13-9-10-3-1-2-4-14...,"""AuxInfo=1/1/N:22,23,21,24,8,15,9,14,19,20,7,1...",P29274_WT,...,=,4.82,4.820,0.000000,0.000000,1.0,4.820,0.000,Preparation_hERG_Dataset_00002,4.820
Preparation_hERG_Dataset_00003,AAGFKZWKWAMJNP_on_P0DMS8_WT,High,ChEMBL31,ChEMBL31.compound.100375,O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1,AAGFKZWKWAMJNP,AAGFKZWKWAMJNP-UHFFFAOYSA-N,InChI=1S/C21H14N6O2/c28-20(14-8-3-1-4-9-14)24-...,"""AuxInfo=1/1/N:27,19,26,28,18,20,9,25,29,17,21...",P0DMS8_WT,...,=,7.15,7.150,0.000000,0.000000,1.0,7.150,0.000,Preparation_hERG_Dataset_00003,7.150
Preparation_hERG_Dataset_00004,AAGFKZWKWAMJNP_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.100375,O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1,AAGFKZWKWAMJNP,AAGFKZWKWAMJNP-UHFFFAOYSA-N,InChI=1S/C21H14N6O2/c28-20(14-8-3-1-4-9-14)24-...,"""AuxInfo=1/1/N:27,19,26,28,18,20,9,25,29,17,21...",P29274_WT,...,=,5.65,5.650,0.000000,0.000000,1.0,5.650,0.000,Preparation_hERG_Dataset_00004,5.650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Preparation_hERG_Dataset_12435,ZXPDGTGMZKIESV_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.49348;ChEMBL31.compound.413675,CNC(=O)C12CC1C(n1cnc3c1nc(Cl)nc3NC1CCCC1)C(O)C2O,ZXPDGTGMZKIESV,ZXPDGTGMZKIESV-UHFFFAOYSA-N,InChI=1S/C18H23ClN6O3/c1-20-16(28)18-6-9(18)11...,"""AuxInfo=1/1/N:1,22,23,21,24,6,10,20,7,12,8,25...",P29274_WT,...,=,5.49;5.49,5.490,0.000000,0.000000,2.0,5.490,0.000,Preparation_hERG_Dataset_12435,5.490
Preparation_hERG_Dataset_12436,ZXPDGTGMZKIESV_on_P30542_WT,High,ChEMBL31,ChEMBL31.compound.49348;ChEMBL31.compound.413675,CNC(=O)C12CC1C(n1cnc3c1nc(Cl)nc3NC1CCCC1)C(O)C2O,ZXPDGTGMZKIESV,ZXPDGTGMZKIESV-UHFFFAOYSA-N,InChI=1S/C18H23ClN6O3/c1-20-16(28)18-6-9(18)11...,"""AuxInfo=1/1/N:1,22,23,21,24,6,10,20,7,12,8,25...",P30542_WT,...,=,7.74;7.74,7.740,0.000000,0.000000,2.0,7.740,0.000,Preparation_hERG_Dataset_12436,7.740
Preparation_hERG_Dataset_12437,ZYEXHNHGCDESIU_on_P0DMS8_WT,High,ChEMBL31,ChEMBL31.compound.1825,Nc1c2nc(C#Cc3ccccc3)n(C3OC(CO)C(O)C3O)c2ncn1,ZYEXHNHGCDESIU,ZYEXHNHGCDESIU-UHFFFAOYSA-N,InChI=1S/C18H17N5O4/c19-16-13-17(21-9-20-16)23...,"""AuxInfo=1/1/N:11,10,12,9,13,7,6,18,26,8,17,5,...",P0DMS8_WT,...,=,6.07;6.1,6.085,0.021213,0.015000,2.0,6.085,0.015,Preparation_hERG_Dataset_12437,6.085
Preparation_hERG_Dataset_12438,ZYQMTELMKXLVHN_on_P0DMS8_WT,High,ChEMBL31,ChEMBL31.compound.18654,CCCn1cc2c(n1)nc(NC(=O)Nc1ccccc1OC)n1nc(-c3ccco...,ZYQMTELMKXLVHN,ZYQMTELMKXLVHN-UHFFFAOYSA-N,InChI=1S/C21H20N8O3/c1-3-10-28-12-13-17(26-28)...,"""AuxInfo=1/1/N:1,22,2,17,18,28,16,19,27,3,29,5...",P0DMS8_WT,...,=,6.47;9.47;9.47,8.470,1.732051,1.000000,3.0,9.470,0.000,Preparation_hERG_Dataset_12438,8.470


### SMILES standardization

In [17]:
from qsprpred.data.descriptors.fingerprints import MorganFP
from qsprpred.data.sampling.splits import RandomSplit

# custom standardizer that canonicalizes the SMILES
def custom_standardizer(smiles):
    from rdkit import Chem
    mol = Chem.MolFromSmiles(smiles)
    smiles = Chem.MolToSmiles(mol, canonical=True)
    return smiles


# calculate compound features and split dataset into train and test
dataset.prepareDataset(
    smiles_standardizer=custom_standardizer,
    split=RandomSplit(test_fraction=0.2, dataset=dataset),
    feature_calculators=[MorganFP(radius=3, nBits=2048)],
    recalculate_features=True,
)

dataset.getDF().head()

Unnamed: 0_level_0,Activity_ID,Quality,source,CID,SMILES,connectivity,InChIKey,InChI,InChI_AuxInfo,target_id,...,relation,pchembl_value,pchembl_value_Mean,pchembl_value_StdDev,pchembl_value_SEM,pchembl_value_N,pchembl_value_Median,pchembl_value_MAD,QSPRID,pchembl_value_Mean_original
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Preparation_hERG_Dataset_00000,AACWUFIIMOHGSO_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.91968,Cc1cc(C)n(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n...,AACWUFIIMOHGSO,AACWUFIIMOHGSO-UHFFFAOYSA-N,InChI=1S/C19H24N6O2/c1-12-10-13(2)25(23-12)17-...,"""AuxInfo=1/1/N:1,26,22,14,15,20,19,11,12,27,6,...",P29274_WT,...,=,8.68,8.68,0.0,0.0,1.0,8.68,0.0,Preparation_hERG_Dataset_00000,8.68
Preparation_hERG_Dataset_00001,AACWUFIIMOHGSO_on_P30542_WT,High,ChEMBL31,ChEMBL31.compound.91968,Cc1cc(C)n(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n...,AACWUFIIMOHGSO,AACWUFIIMOHGSO-UHFFFAOYSA-N,InChI=1S/C19H24N6O2/c1-12-10-13(2)25(23-12)17-...,"""AuxInfo=1/1/N:1,26,22,14,15,20,19,11,12,27,6,...",P30542_WT,...,=,6.68,6.68,0.0,0.0,1.0,6.68,0.0,Preparation_hERG_Dataset_00001,6.68
Preparation_hERG_Dataset_00002,AAEYTMMNWWKSKZ_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.131451,Nc1c(C(=O)Nc2ccc([N+](=O)[O-])cc2)sc2nc3c(cc12...,AAEYTMMNWWKSKZ,AAEYTMMNWWKSKZ-UHFFFAOYSA-N,InChI=1S/C18H16N4O3S/c19-15-13-9-10-3-1-2-4-14...,"""AuxInfo=1/1/N:22,23,21,24,8,15,9,14,19,20,7,1...",P29274_WT,...,=,4.82,4.82,0.0,0.0,1.0,4.82,0.0,Preparation_hERG_Dataset_00002,4.82
Preparation_hERG_Dataset_00003,AAGFKZWKWAMJNP_on_P0DMS8_WT,High,ChEMBL31,ChEMBL31.compound.100375,O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1,AAGFKZWKWAMJNP,AAGFKZWKWAMJNP-UHFFFAOYSA-N,InChI=1S/C21H14N6O2/c28-20(14-8-3-1-4-9-14)24-...,"""AuxInfo=1/1/N:27,19,26,28,18,20,9,25,29,17,21...",P0DMS8_WT,...,=,7.15,7.15,0.0,0.0,1.0,7.15,0.0,Preparation_hERG_Dataset_00003,7.15
Preparation_hERG_Dataset_00004,AAGFKZWKWAMJNP_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.100375,O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1,AAGFKZWKWAMJNP,AAGFKZWKWAMJNP-UHFFFAOYSA-N,InChI=1S/C21H14N6O2/c28-20(14-8-3-1-4-9-14)24-...,"""AuxInfo=1/1/N:27,19,26,28,18,20,9,25,29,17,21...",P29274_WT,...,=,5.65,5.65,0.0,0.0,1.0,5.65,0.0,Preparation_hERG_Dataset_00004,5.65


### Data filtering

In [18]:
import numpy as np

dataset.addProperty(name="FakeProperty",
                    data=np.random.choice(["Wow", "Nope"], len(dataset)))
dataset.getDF().head()

Unnamed: 0_level_0,Activity_ID,Quality,source,CID,SMILES,connectivity,InChIKey,InChI,InChI_AuxInfo,target_id,...,pchembl_value,pchembl_value_Mean,pchembl_value_StdDev,pchembl_value_SEM,pchembl_value_N,pchembl_value_Median,pchembl_value_MAD,QSPRID,pchembl_value_Mean_original,FakeProperty
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Preparation_hERG_Dataset_00000,AACWUFIIMOHGSO_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.91968,Cc1cc(C)n(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n...,AACWUFIIMOHGSO,AACWUFIIMOHGSO-UHFFFAOYSA-N,InChI=1S/C19H24N6O2/c1-12-10-13(2)25(23-12)17-...,"""AuxInfo=1/1/N:1,26,22,14,15,20,19,11,12,27,6,...",P29274_WT,...,8.68,8.68,0.0,0.0,1.0,8.68,0.0,Preparation_hERG_Dataset_00000,8.68,Wow
Preparation_hERG_Dataset_00001,AACWUFIIMOHGSO_on_P30542_WT,High,ChEMBL31,ChEMBL31.compound.91968,Cc1cc(C)n(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n...,AACWUFIIMOHGSO,AACWUFIIMOHGSO-UHFFFAOYSA-N,InChI=1S/C19H24N6O2/c1-12-10-13(2)25(23-12)17-...,"""AuxInfo=1/1/N:1,26,22,14,15,20,19,11,12,27,6,...",P30542_WT,...,6.68,6.68,0.0,0.0,1.0,6.68,0.0,Preparation_hERG_Dataset_00001,6.68,Wow
Preparation_hERG_Dataset_00002,AAEYTMMNWWKSKZ_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.131451,Nc1c(C(=O)Nc2ccc([N+](=O)[O-])cc2)sc2nc3c(cc12...,AAEYTMMNWWKSKZ,AAEYTMMNWWKSKZ-UHFFFAOYSA-N,InChI=1S/C18H16N4O3S/c19-15-13-9-10-3-1-2-4-14...,"""AuxInfo=1/1/N:22,23,21,24,8,15,9,14,19,20,7,1...",P29274_WT,...,4.82,4.82,0.0,0.0,1.0,4.82,0.0,Preparation_hERG_Dataset_00002,4.82,Wow
Preparation_hERG_Dataset_00003,AAGFKZWKWAMJNP_on_P0DMS8_WT,High,ChEMBL31,ChEMBL31.compound.100375,O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1,AAGFKZWKWAMJNP,AAGFKZWKWAMJNP-UHFFFAOYSA-N,InChI=1S/C21H14N6O2/c28-20(14-8-3-1-4-9-14)24-...,"""AuxInfo=1/1/N:27,19,26,28,18,20,9,25,29,17,21...",P0DMS8_WT,...,7.15,7.15,0.0,0.0,1.0,7.15,0.0,Preparation_hERG_Dataset_00003,7.15,Nope
Preparation_hERG_Dataset_00004,AAGFKZWKWAMJNP_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.100375,O=C(Nc1nc2ncccc2n2c(=O)n(-c3ccccc3)nc12)c1ccccc1,AAGFKZWKWAMJNP,AAGFKZWKWAMJNP-UHFFFAOYSA-N,InChI=1S/C21H14N6O2/c28-20(14-8-3-1-4-9-14)24-...,"""AuxInfo=1/1/N:27,19,26,28,18,20,9,25,29,17,21...",P29274_WT,...,5.65,5.65,0.0,0.0,1.0,5.65,0.0,Preparation_hERG_Dataset_00004,5.65,Nope


In [19]:
from qsprpred.data.processing.data_filters import RepeatsFilter, CategoryFilter

# calculate compound features and split dataset into train and test
dataset.prepareDataset(
    data_filters=[RepeatsFilter(keep=False),
                  CategoryFilter(name="FakeProperty", values=["Wow"], keep=True)],
    # only keep compounds with FakeProperty="Wow"
    split=RandomSplit(test_fraction=0.2, dataset=dataset),
    feature_calculators=[MorganFP(radius=3, nBits=2048)],
    recalculate_features=True,
)

dataset.getDF().head()

Unnamed: 0_level_0,Activity_ID,Quality,source,CID,SMILES,connectivity,InChIKey,InChI,InChI_AuxInfo,target_id,...,pchembl_value,pchembl_value_Mean,pchembl_value_StdDev,pchembl_value_SEM,pchembl_value_N,pchembl_value_Median,pchembl_value_MAD,QSPRID,pchembl_value_Mean_original,FakeProperty
QSPRID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Preparation_hERG_Dataset_00000,AACWUFIIMOHGSO_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.91968,Cc1cc(C)n(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n...,AACWUFIIMOHGSO,AACWUFIIMOHGSO-UHFFFAOYSA-N,InChI=1S/C19H24N6O2/c1-12-10-13(2)25(23-12)17-...,"""AuxInfo=1/1/N:1,26,22,14,15,20,19,11,12,27,6,...",P29274_WT,...,8.68,8.68,0.0,0.0,1.0,8.68,0.0,Preparation_hERG_Dataset_00000,8.68,Wow
Preparation_hERG_Dataset_00001,AACWUFIIMOHGSO_on_P30542_WT,High,ChEMBL31,ChEMBL31.compound.91968,Cc1cc(C)n(-c2cc(NC(=O)CCN(C)C)nc(-c3ccc(C)o3)n...,AACWUFIIMOHGSO,AACWUFIIMOHGSO-UHFFFAOYSA-N,InChI=1S/C19H24N6O2/c1-12-10-13(2)25(23-12)17-...,"""AuxInfo=1/1/N:1,26,22,14,15,20,19,11,12,27,6,...",P30542_WT,...,6.68,6.68,0.0,0.0,1.0,6.68,0.0,Preparation_hERG_Dataset_00001,6.68,Wow
Preparation_hERG_Dataset_00002,AAEYTMMNWWKSKZ_on_P29274_WT,High,ChEMBL31,ChEMBL31.compound.131451,Nc1c(C(=O)Nc2ccc([N+](=O)[O-])cc2)sc2nc3c(cc12...,AAEYTMMNWWKSKZ,AAEYTMMNWWKSKZ-UHFFFAOYSA-N,InChI=1S/C18H16N4O3S/c19-15-13-9-10-3-1-2-4-14...,"""AuxInfo=1/1/N:22,23,21,24,8,15,9,14,19,20,7,1...",P29274_WT,...,4.82,4.82,0.0,0.0,1.0,4.82,0.0,Preparation_hERG_Dataset_00002,4.82,Wow
Preparation_hERG_Dataset_00005,AAISMSDTIOVVDP_on_P0DMS8_WT,High,ChEMBL31,ChEMBL31.compound.72023,CCNC(=O)C1OC(n2cnc3c(Nc4ccc(OCC(=O)Nc5ccc(Cl)c...,AAISMSDTIOVVDP,AAISMSDTIOVVDP-UHFFFAOYSA-N,InChI=1S/C26H26ClN7O6/c1-2-28-25(38)22-20(36)2...,"""AuxInfo=1/1/N:1,2,30,33,29,34,20,36,21,35,24,...",P0DMS8_WT,...,7.47,7.47,0.0,0.0,1.0,7.47,0.0,Preparation_hERG_Dataset_00005,7.47,Wow
Preparation_hERG_Dataset_00006,AAISMSDTIOVVDP_on_P30542_WT,High,ChEMBL31,ChEMBL31.compound.72023,CCNC(=O)C1OC(n2cnc3c(Nc4ccc(OCC(=O)Nc5ccc(Cl)c...,AAISMSDTIOVVDP,AAISMSDTIOVVDP-UHFFFAOYSA-N,InChI=1S/C26H26ClN7O6/c1-2-28-25(38)22-20(36)2...,"""AuxInfo=1/1/N:1,2,30,33,29,34,20,36,21,35,24,...",P30542_WT,...,8.51,8.51,0.0,0.0,1.0,8.51,0.0,Preparation_hERG_Dataset_00006,8.51,Wow


### Filling missing features

In [None]:
# calculate compound features and split dataset into train and test
dataset.prepareDataset(
    split=RandomSplit(test_fraction=0.2, dataset=dataset),
    feature_calculators=[MorganFP(radius=3, nBits=2048)],
    feature_fill_value=5,  # fill missing values with 5
    recalculate_features=True,
)

### Feature selection

In [None]:
from qsprpred.data.processing.feature_filters import HighCorrelationFilter

# calculate compound features and split dataset into train and test
dataset.prepareDataset(
    split=RandomSplit(test_fraction=0.2, dataset=dataset),
    feature_calculators=[MorganFP(radius=3, nBits=2048)],
    recalculate_features=True,
    feature_filters=[HighCorrelationFilter(th=0.95)]
    # remove features with correlation > 0.95
)

print(f"Number of fingerprint bits after filtering: {len(dataset.getDescriptors())}")

Number of fingerprint bits after filtering: 6186


  c /= stddev[:, None]
  c /= stddev[None, :]


### Feature standardization

In [None]:
from qsprpred.data.processing.feature_standardizers import SKLearnStandardizer
from sklearn.preprocessing import StandardScaler

# calculate compound features and split dataset into train and test
dataset.prepareDataset(
    split=RandomSplit(test_fraction=0.2, dataset=dataset),
    feature_calculators=[MorganFP(radius=3, nBits=2048)],
    recalculate_features=True,
    feature_standardizer=SKLearnStandardizer(StandardScaler())  # standardize features
)