<a href="https://colab.research.google.com/github/GianmarcoLuchetti/Bioavailability/blob/main/Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install rdkit -q
!pip3 install mordred -q
!pip3 install session_info -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import session_info

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import PandasTools

from mordred import Calculator, descriptors

In [None]:
# Report the session information gathered by the session_info package,
# kept in the notebook as a record of the environment it was run in.
session_info.show()

# Dataset preparation

## Data from "Tingjun Hou, Junmei Wang, Wei Zhang, Xiaojie Xu, ADME evaluation in drug discovery. 6. If the oral bioavailability in human can be effectively predicted by simple molecular properties-based rules? Journal of Chemical Information and Modeling, 2007, 47, 460-463"

In [None]:
# Load the two .sdf datasets into pandas DataFrames.
# Each molecule object is stored in the 'Structure' column and the SMILES string
# generated from it in the 'SMILES' column; removeHs=False keeps the hydrogens.
sdf_load_options = dict(
    smilesName='SMILES',
    molColName='Structure',
    includeFingerprints=False,
    removeHs=False,
    strictParsing=True,
)
train_raw = PandasTools.LoadSDF('/content/bioavailability_version3_training_set.sdf', **sdf_load_options)
test_raw = PandasTools.LoadSDF('/content/bioavailability_version3_test_set.sdf', **sdf_load_options)

In [None]:
# (rows, columns) of the two raw datasets
print('Training set shape:', train_raw.shape)
print('Test set shape:', test_raw.shape)

Training set shape: (906, 22)
Test set shape: (80, 22)


In [None]:
# Preview of the raw training set (head() shows the first five rows by default)
train_raw.head()

Unnamed: 0,Molecule_name,Bioavailability,ACD_logS_Intrinsic,ACD_LogP,ACD_MW,ACD_PSA,ACD_FRB,ACD_Rule_Of_5,ACD_LogD_3,ACD_LogD_4,...,Num_H_total,Molecular_Volume,Molecular_SurfaceArea,RadOfGyration,ACD_LogD_2**2,ACD_LogD_3**2,ACD_MW**2,ID,SMILES,Structure
0,Loteprednol Etabonate,0,-5.25,3.1700001,466.95001,99.129997,8,0,3.1700001,3.1700001,...,8,315.54999,437.20001,4.3715563,10.048901,10.048901,218042.31,,[H]O[C@@]1([H])C([H])([H])[C@@]2(C([H])([H])[H...,<rdkit.Chem.rdchem.Mol object at 0x7e286416e260>
1,Sertaconazole,0,-6.1199999,7.4899998,437.76999,55.290001,6,1,7.0799999,7.4099998,...,2,268.22,383.84,4.4791269,39.564098,50.1264,191642.56,,[H]c1nc([H])n(C([H])([H])[C@]([H])(OC([H])([H]...,<rdkit.Chem.rdchem.Mol object at 0x7e286416e2d0>
2,Lapatinib,0,-8.0200005,5.1399999,581.06,114.73,10,2,4.77,5.0799999,...,9,364.26001,522.22998,6.6485019,15.210001,22.752899,337630.72,,[H]c1nc(N([H])c2c([H])c([H])c(OC([H])([H])c3c(...,<rdkit.Chem.rdchem.Mol object at 0x7e286416e340>
3,Clotrimazole,0,-4.8800001,5.4400001,344.84,17.82,4,1,5.29,5.4200001,...,1,226.72,320.69,3.6407769,22.3729,27.9841,118914.63,,[H]c1nc([H])n(C(c2c([H])c([H])c([H])c([H])c2[H...,<rdkit.Chem.rdchem.Mol object at 0x7e286416e3b0>
4,Lubiprostone,0,-3.3900001,2.8499999,390.45999,83.830002,12,0,1.12,0.25,...,7,264.45001,376.0,5.6376739,4.1615996,1.2544,152459.0,,[H]OC(=O)C([H])([H])C([H])([H])C([H])([H])C([H...,<rdkit.Chem.rdchem.Mol object at 0x7e286416e420>


In [None]:
# Preview of the raw test set (head() shows the first five rows by default)
test_raw.head()

Unnamed: 0,Molecule_name,Bioavailability,ACD_logS_Intrinsic,ACD_LogP,ACD_MW,ACD_PSA,ACD_FRB,ACD_Rule_Of_5,ACD_LogD_3,ACD_LogD_4,...,Num_H_total,Molecular_Volume,Molecular_SurfaceArea,RadOfGyration,ACD_LogD_2**2,ACD_LogD_3**2,ACD_MW**2,ID,SMILES,Structure
0,Arbekacin,0,0.25999999,-4.0,552.62,297.26999,20,3,-10.99,-10.43,...,26,361.17001,548.94,5.2950635,122.98811,120.7801,305388.84,,[H]OC([H])([H])[C@@]1([H])O[C@]([H])(O[C@]2([H...,<rdkit.Chem.rdchem.Mol object at 0x7e286320fe60>
1,Alcuronium,0,-0.89999998,-3.0699999,666.89001,46.939999,8,1,-3.0699999,-3.0699999,...,6,468.19,609.15997,5.2155375,9.4249001,9.4249001,444742.28,,[H]OC([H])([H])/C([H])=C1\C([H])([H])[N@+]2(C(...,<rdkit.Chem.rdchem.Mol object at 0x7e286320fed0>
2,Cephapirin,0,-3.8,0.79000002,423.45999,176.5,8,0,-2.6700001,-2.9100001,...,11,257.92999,385.10999,5.3757033,3.8416002,7.1289005,179318.36,,[H]OC(=O)C1=C(C([H])([H])OC(=O)C([H])([H])[H])...,<rdkit.Chem.rdchem.Mol object at 0x7e286320ff40>
3,Cefodizime,0,-6.1999998,2.55,584.66998,304.48001,10,3,-2.1700001,-2.23,...,17,352.60001,527.34003,6.0532088,2.7556,4.7089005,341839.0,,[H]OC(=O)C1=C(C([H])([H])Sc2nc(C([H])([H])[H])...,<rdkit.Chem.rdchem.Mol object at 0x7e2863268040>
4,Dihydroergotamine,1,-5.1100001,3.52,583.67999,118.21,5,2,1.23,2.05,...,9,384.14999,527.67999,5.5565372,0.37210003,1.5129,340682.34,,[H]O[C@]12O[C@@](N([H])C(=O)[C@]3([H])C([H])([...,<rdkit.Chem.rdchem.Mol object at 0x7e28632680b0>


## Data from "Min Wei, Xudong Zhang, Xiaolin Pan, Bo Wang, Changge Ji, Yifei Qi, and John Z.H. Zhang. HobPre: accurate prediction of human oral bioavailability for small molecules"

In [None]:
# Read the additional (HobPre) dataset from its CSV file
train_raw_add = pd.read_csv(filepath_or_buffer='/content/hob_data_set.csv')

In [None]:
# (rows, columns) of the additional dataset
print('Additional train set shape:', train_raw_add.shape)

Additional train set shape: (1157, 7)


In [None]:
# Preview of the additional dataset (head() shows the first five rows by default)
train_raw_add.head()

Unnamed: 0,Name,SMILES,value,label_cutoff_50%,our model predition_50%,label_cutoff_20%,our model predition_20%
0,sulfadiazine,Nc1ccc(S(=O)(=O)Nc2ncccn2)cc1,90,1,1,1.0,1.0
1,clofarabine,Nc1nc(Cl)nc2c1ncn2C1OC(CO)C(O)C1F,50,1,1,1.0,1.0
2,sulfamethoxazole,Cc1cc(NS(=O)(=O)c2ccc(N)cc2)no1,99,1,1,1.0,1.0
3,tolazoline,c1ccc(CC2=NCCN2)cc1,90,1,1,1.0,1.0
4,cotinine,CN1C(=O)CCC1c1cccnc1,97,1,1,1.0,1.0


## Molecular descriptor calculation

In [None]:
# computation of canonical SMILES strings using RDKit
def canonical_smiles(smiles):
    """Convert SMILES strings to the SMILES form written by RDKit.

    Parameters
    ----------
    smiles : iterable of str
        Input SMILES strings.

    Returns
    -------
    list of str
        One output string per input, in the same order, so the result can be
        used as a column next to the data the inputs came from.

    Raises
    ------
    ValueError
        If an input cannot be parsed (``Chem.MolFromSmiles`` returned ``None``).
        The entry is reported instead of being dropped, because dropping it
        would shift every following row against its label.
    """
    canonical = []
    for position, smi in enumerate(smiles):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            raise ValueError(f'Could not parse SMILES at position {position}: {smi!r}')
        canonical.append(Chem.MolToSmiles(mol))
    return canonical

In [None]:
# computation of mordred molecular descriptors, returned as a pandas dataframe
def Mordred_descriptors(data):
    """Compute mordred descriptors for a collection of SMILES strings.

    Parameters
    ----------
    data : iterable of str
        SMILES strings; each one is parsed with ``Chem.MolFromSmiles``.

    Returns
    -------
    The result of ``Calculator.pandas`` for the parsed molecules
    (one row per molecule).

    NOTE(review): the calculator is built with ``ignore_3D=False`` while the
    molecules are created from SMILES and no conformer is generated in this
    function, so the 3D descriptors presumably cannot be evaluated here.
    Confirm that this is intended.
    """
    molecules = list(map(Chem.MolFromSmiles, data))
    return Calculator(descriptors, ignore_3D=False).pandas(molecules)

In [None]:
# Canonical SMILES of the training compounds
can_train_raw = canonical_smiles(train_raw.SMILES)

# Descriptor table: every value cast to float, missing values replaced with 0
train_desc = Mordred_descriptors(can_train_raw).astype(float).fillna(0)

# Put the identifying columns in front of the descriptors
train_desc.insert(0, 'SMILES', can_train_raw)
train_desc.insert(1, 'Bioavailability', train_raw.Bioavailability)
train_desc

  1%|          | 10/906 [00:05<07:09,  2.08it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 21%|██▏       | 193/906 [01:11<04:31,  2.63it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 906/906 [04:49<00:00,  3.13it/s]


Unnamed: 0,SMILES,Bioavailability,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,CCOC(=O)O[C@]1(C(=O)OCCl)CC[C@H]2[C@@H]3CCC4=C...,0,25.041641,20.572813,0.0,0.0,39.956644,2.682354,5.300749,39.956644,...,10.897128,83.824745,466.175831,7.399616,2580.0,68.0,180.0,227.0,12.326389,6.979167
1,Clc1ccc([C@H](Cn2ccnc2)OCc2csc3c(Cl)cccc23)c(C...,0,21.339172,17.474154,0.0,0.0,35.116773,2.437888,4.795308,35.116773,...,10.178882,77.812555,435.997067,10.380883,1886.0,40.0,144.0,170.0,7.750000,5.888889
2,CS(=O)(=O)CCNCc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5ccc...,0,31.646473,21.395324,0.0,1.0,51.186875,2.446018,4.875514,51.186875,...,10.537919,90.654153,580.134732,8.789920,6645.0,59.0,212.0,244.0,12.034722,8.569444
3,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,0,19.621501,16.811403,0.0,0.0,33.831740,2.579150,5.142419,33.831740,...,10.369640,73.476522,344.108026,8.193048,1225.0,44.0,134.0,164.0,6.118056,5.611111
4,CCCCC(F)(F)[C@@]1(O)CC[C@H]2[C@@H](CC(=O)[C@@H...,0,20.387860,16.599062,1.0,0.0,32.777540,2.533516,5.006933,32.777540,...,10.251571,76.398181,390.221781,6.613928,2140.0,42.0,136.0,158.0,10.930556,6.020833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
901,O=C(O)[C@H]1/C(=C\CO)O[C@@H]2CC(=O)N21,75,10.732571,10.433685,1.0,0.0,17.024871,2.547264,4.937492,17.024871,...,9.914971,61.392844,199.048072,8.654264,285.0,19.0,74.0,90.0,5.666667,3.138889
902,O=C(OCCN1CCN(c2cccc(C(F)(F)F)c2)CC1)c1ccccc1Nc...,100,33.264394,22.711274,0.0,1.0,53.423257,2.445642,4.891283,53.423257,...,10.718697,79.627039,588.195995,8.649941,7508.0,69.0,226.0,264.0,13.847222,8.888889
903,O=C1C(O)=C(O)O[C@@H]1[C@@H](O)CO,100,8.761080,9.017795,0.0,0.0,14.688134,2.402549,4.592641,14.688134,...,9.261509,56.914288,176.032088,8.801604,188.0,17.0,58.0,68.0,6.055556,2.777778
904,CN(C)[C@@H]1C(O)=C(C(=O)NCN[C@@H](CCCCN)C(=O)O...,100,33.263797,26.609917,1.0,3.0,52.600841,2.676991,5.353981,52.600841,...,11.091529,81.389883,602.258793,7.435294,6244.0,91.0,234.0,291.0,18.652778,9.361111


In [None]:
# Canonical SMILES of the test compounds
can_test_raw = canonical_smiles(test_raw.SMILES)

# Descriptor table: every value cast to float, missing values replaced with 0
test_desc = Mordred_descriptors(can_test_raw).astype(float).fillna(0)

# Put the identifying columns in front of the descriptors
test_desc.insert(0, 'SMILES', can_test_raw)
test_desc.insert(1, 'Bioavailability', test_raw.Bioavailability)
test_desc

100%|██████████| 80/80 [00:28<00:00,  2.85it/s]


Unnamed: 0,SMILES,Bioavailability,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,NCC[C@H](O)C(=O)N[C@@H]1C[C@H](N)[C@@H](O[C@H]...,0,28.754989,23.660827,0.0,5.0,47.373524,2.474883,4.949765,47.373524,...,10.582561,74.881895,552.311892,6.735511,4560.0,67.0,194.0,230.0,15.666667,8.611111
1,C=CC[N@@+]12CC[C@]34c5ccccc5N5/C=C6/[C@@H]7C[C...,0,41.448249,28.387504,0.0,2.0,68.136959,2.790534,5.415390,68.136959,...,11.562867,109.064808,666.392280,6.663923,7363.0,126.0,316.0,420.0,12.527778,10.666667
2,CC(=O)OCC1=C(C(=O)O)N2C(=O)[C@@H](NC(=O)CSc3cc...,0,21.586461,16.990294,1.0,0.0,35.446415,2.571426,5.142851,35.446415,...,10.453370,63.776322,423.055877,9.401242,2292.0,41.0,144.0,169.0,10.111111,6.194444
3,CO/N=C(/C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CSc3nc...,0,28.754989,22.911571,2.0,0.0,46.360396,2.575082,5.149065,46.360396,...,10.721967,89.984830,584.027631,10.246099,4854.0,57.0,196.0,234.0,13.916667,8.111111
4,CN1C[C@@H](C(=O)N[C@@]2(C)O[C@]3(O)[C@H]4CCCN4...,1,35.440625,26.286287,0.0,1.0,56.940377,2.669553,5.185190,56.940377,...,11.147080,99.273521,583.279469,7.290993,6191.0,88.0,258.0,326.0,12.652778,8.875000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,Cc1cnc(NC(=O)C2=C(O)c3ccccc3S(=O)(=O)N2C)s1,97,18.107646,15.622893,0.0,0.0,28.796076,2.548495,5.095860,28.796076,...,10.324531,71.564955,351.034748,9.750965,1115.0,41.0,126.0,153.0,8.951389,4.888889
76,CN1c2ccccc2N(CCCCCCC(=O)O)c2ccc(Cl)cc2S1(=O)=O,99,21.643180,17.251851,1.0,0.0,35.295882,2.552301,4.990973,35.295882,...,10.337573,71.834384,422.106706,8.276602,2021.0,51.0,146.0,173.0,10.201389,6.138889
77,Cc1cc(=O)n(-c2ccccc2)n1C,99,10.773011,10.024453,0.0,0.0,17.688005,2.430944,4.721130,17.688005,...,9.509037,60.036603,188.094963,7.234422,284.0,20.0,72.0,85.0,5.055556,3.111111
78,O=NN(CCCl)C(=O)NC1CCCCC1,100,10.675551,9.974681,0.0,0.0,18.770020,2.241154,4.482308,18.770020,...,9.073489,45.723333,233.093104,7.519132,402.0,18.0,66.0,72.0,5.583333,3.694444


In [None]:
# Canonical SMILES of the additional compounds
can_train_raw_add = canonical_smiles(train_raw_add.SMILES)

# Descriptor table: every value cast to float, missing values replaced with 0
train_desc_add = Mordred_descriptors(can_train_raw_add).astype(float).fillna(0)

# Put the identifying columns in front of the descriptors;
# in this dataset the bioavailability is stored in the 'value' column
train_desc_add.insert(0, 'SMILES', can_train_raw_add)
train_desc_add.insert(1, 'Bioavailability', train_raw_add.value)
train_desc_add

 87%|████████▋ | 1011/1157 [05:43<02:14,  1.09it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 88%|████████▊ | 1023/1157 [05:47<00:50,  2.64it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 1157/1157 [06:15<00:00,  3.08it/s]


Unnamed: 0,SMILES,Bioavailability,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,Nc1ccc(S(=O)(=O)Nc2ncccn2)cc1,90,13.093540,11.115492,0.0,0.0,21.139069,2.370239,4.740478,21.139069,...,9.645817,49.622129,250.052447,9.261202,536.0,23.0,86.0,97.0,5.895833,3.708333
1,Nc1nc(Cl)nc2c1ncn2C1OC(CO)C(O)C1F,50,15.710828,14.057406,0.0,0.0,25.397628,2.520227,4.881224,25.397628,...,10.082679,70.336447,303.053445,9.775918,753.0,34.0,110.0,135.0,7.500000,4.361111
2,Cc1cc(NS(=O)(=O)c2ccc(N)cc2)no1,99,13.202929,11.709699,0.0,0.0,20.712705,2.383498,4.742568,20.712705,...,9.663770,63.242131,253.052112,9.037575,535.0,22.0,88.0,100.0,6.506944,3.625000
3,c1ccc(CC2=NCCN2)cc1,90,9.192388,8.248586,0.0,2.0,15.982234,2.246428,4.436582,15.982234,...,8.907477,55.114662,160.100048,6.670835,209.0,11.0,58.0,64.0,2.722222,2.750000
4,CN1C(=O)CCC1c1cccnc1,97,9.996954,9.253985,0.0,0.0,16.886871,2.383381,4.647225,16.886871,...,9.339525,58.137510,176.094963,7.043799,238.0,17.0,66.0,77.0,4.194444,2.916667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152,Clc1cccc(Cl)c1NC1=NCCN1,95,10.744501,9.870405,0.0,3.0,17.881594,2.335590,4.651027,17.881594,...,9.342333,58.496842,229.017353,9.957276,301.0,17.0,70.0,80.0,4.444444,3.138889
1153,N=C(N)N=C(O)Cc1c(Cl)cccc1Cl,81,11.072670,10.311148,0.0,3.0,17.507310,2.318438,4.636876,17.507310,...,9.306650,46.323687,245.012267,10.208844,384.0,19.0,70.0,77.0,6.805556,3.388889
1154,CN(C(=O)C(Cl)Cl)c1ccc(O)cc1,90,10.325124,9.704472,0.0,0.0,16.814625,2.307250,4.614501,16.814625,...,9.282568,45.126593,233.001034,10.130480,312.0,19.0,66.0,74.0,6.555556,3.166667
1155,O=P([O-])([O-])C(Cl)(Cl)P(=O)(O)O,1,8.152948,8.723360,4.0,0.0,10.363081,2.449490,4.898979,10.363081,...,9.682903,42.374286,241.871464,18.605497,136.0,18.0,56.0,64.0,8.187500,2.125000


## Label assignment

In [None]:
# create label list for train
# Drop a class column left over from a previous run so this cell can be re-executed
train_desc = train_desc.drop(columns=['Bioavailability_class'], errors='ignore')
column = train_desc['Bioavailability']

# The values are converted to float before comparison (they may be stored as strings)
bioavailability_pct = column.astype(float)
if bioavailability_pct.isna().any():
    # A missing value fits no class; fail here with a clear message rather than
    # producing a label list shorter than the frame.
    raise ValueError('train_desc: Bioavailability contains missing values, cannot assign classes')

# class assignment (0: <= 35%, 1: > 35% & <= 70%, 2: > 70%)
# pd.cut uses right-closed bins: (-inf, 35], (35, 70], (70, inf]
label = pd.cut(
    bioavailability_pct,
    bins=[-np.inf, 35, 70, np.inf],
    labels=['0', '1', '2'],
).astype(object).tolist()

train_desc.insert(loc=2, column='Bioavailability_class', value=label)
train_desc

Unnamed: 0,SMILES,Bioavailability,Bioavailability_class,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,CCOC(=O)O[C@]1(C(=O)OCCl)CC[C@H]2[C@@H]3CCC4=C...,0,0,25.041641,20.572813,0.0,0.0,39.956644,2.682354,5.300749,...,10.897128,83.824745,466.175831,7.399616,2580.0,68.0,180.0,227.0,12.326389,6.979167
1,Clc1ccc([C@H](Cn2ccnc2)OCc2csc3c(Cl)cccc23)c(C...,0,0,21.339172,17.474154,0.0,0.0,35.116773,2.437888,4.795308,...,10.178882,77.812555,435.997067,10.380883,1886.0,40.0,144.0,170.0,7.750000,5.888889
2,CS(=O)(=O)CCNCc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5ccc...,0,0,31.646473,21.395324,0.0,1.0,51.186875,2.446018,4.875514,...,10.537919,90.654153,580.134732,8.789920,6645.0,59.0,212.0,244.0,12.034722,8.569444
3,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,0,0,19.621501,16.811403,0.0,0.0,33.831740,2.579150,5.142419,...,10.369640,73.476522,344.108026,8.193048,1225.0,44.0,134.0,164.0,6.118056,5.611111
4,CCCCC(F)(F)[C@@]1(O)CC[C@H]2[C@@H](CC(=O)[C@@H...,0,0,20.387860,16.599062,1.0,0.0,32.777540,2.533516,5.006933,...,10.251571,76.398181,390.221781,6.613928,2140.0,42.0,136.0,158.0,10.930556,6.020833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
901,O=C(O)[C@H]1/C(=C\CO)O[C@@H]2CC(=O)N21,75,2,10.732571,10.433685,1.0,0.0,17.024871,2.547264,4.937492,...,9.914971,61.392844,199.048072,8.654264,285.0,19.0,74.0,90.0,5.666667,3.138889
902,O=C(OCCN1CCN(c2cccc(C(F)(F)F)c2)CC1)c1ccccc1Nc...,100,2,33.264394,22.711274,0.0,1.0,53.423257,2.445642,4.891283,...,10.718697,79.627039,588.195995,8.649941,7508.0,69.0,226.0,264.0,13.847222,8.888889
903,O=C1C(O)=C(O)O[C@@H]1[C@@H](O)CO,100,2,8.761080,9.017795,0.0,0.0,14.688134,2.402549,4.592641,...,9.261509,56.914288,176.032088,8.801604,188.0,17.0,58.0,68.0,6.055556,2.777778
904,CN(C)[C@@H]1C(O)=C(C(=O)NCN[C@@H](CCCCN)C(=O)O...,100,2,33.263797,26.609917,1.0,3.0,52.600841,2.676991,5.353981,...,11.091529,81.389883,602.258793,7.435294,6244.0,91.0,234.0,291.0,18.652778,9.361111


In [None]:
# create label list for test
# Drop a class column left over from a previous run so this cell can be re-executed
test_desc = test_desc.drop(columns=['Bioavailability_class'], errors='ignore')
column = test_desc['Bioavailability']

# The values are converted to float before comparison (they may be stored as strings)
bioavailability_pct = column.astype(float)
if bioavailability_pct.isna().any():
    # A missing value fits no class; fail here with a clear message rather than
    # producing a label list shorter than the frame.
    raise ValueError('test_desc: Bioavailability contains missing values, cannot assign classes')

# class assignment (0: <= 35%, 1: > 35% & <= 70%, 2: > 70%)
# pd.cut uses right-closed bins: (-inf, 35], (35, 70], (70, inf]
label = pd.cut(
    bioavailability_pct,
    bins=[-np.inf, 35, 70, np.inf],
    labels=['0', '1', '2'],
).astype(object).tolist()

test_desc.insert(loc=2, column='Bioavailability_class', value=label)
test_desc

Unnamed: 0,SMILES,Bioavailability,Bioavailability_class,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,NCC[C@H](O)C(=O)N[C@@H]1C[C@H](N)[C@@H](O[C@H]...,0,0,28.754989,23.660827,0.0,5.0,47.373524,2.474883,4.949765,...,10.582561,74.881895,552.311892,6.735511,4560.0,67.0,194.0,230.0,15.666667,8.611111
1,C=CC[N@@+]12CC[C@]34c5ccccc5N5/C=C6/[C@@H]7C[C...,0,0,41.448249,28.387504,0.0,2.0,68.136959,2.790534,5.415390,...,11.562867,109.064808,666.392280,6.663923,7363.0,126.0,316.0,420.0,12.527778,10.666667
2,CC(=O)OCC1=C(C(=O)O)N2C(=O)[C@@H](NC(=O)CSc3cc...,0,0,21.586461,16.990294,1.0,0.0,35.446415,2.571426,5.142851,...,10.453370,63.776322,423.055877,9.401242,2292.0,41.0,144.0,169.0,10.111111,6.194444
3,CO/N=C(/C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CSc3nc...,0,0,28.754989,22.911571,2.0,0.0,46.360396,2.575082,5.149065,...,10.721967,89.984830,584.027631,10.246099,4854.0,57.0,196.0,234.0,13.916667,8.111111
4,CN1C[C@@H](C(=O)N[C@@]2(C)O[C@]3(O)[C@H]4CCCN4...,1,0,35.440625,26.286287,0.0,1.0,56.940377,2.669553,5.185190,...,11.147080,99.273521,583.279469,7.290993,6191.0,88.0,258.0,326.0,12.652778,8.875000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,Cc1cnc(NC(=O)C2=C(O)c3ccccc3S(=O)(=O)N2C)s1,97,2,18.107646,15.622893,0.0,0.0,28.796076,2.548495,5.095860,...,10.324531,71.564955,351.034748,9.750965,1115.0,41.0,126.0,153.0,8.951389,4.888889
76,CN1c2ccccc2N(CCCCCCC(=O)O)c2ccc(Cl)cc2S1(=O)=O,99,2,21.643180,17.251851,1.0,0.0,35.295882,2.552301,4.990973,...,10.337573,71.834384,422.106706,8.276602,2021.0,51.0,146.0,173.0,10.201389,6.138889
77,Cc1cc(=O)n(-c2ccccc2)n1C,99,2,10.773011,10.024453,0.0,0.0,17.688005,2.430944,4.721130,...,9.509037,60.036603,188.094963,7.234422,284.0,20.0,72.0,85.0,5.055556,3.111111
78,O=NN(CCCl)C(=O)NC1CCCCC1,100,2,10.675551,9.974681,0.0,0.0,18.770020,2.241154,4.482308,...,9.073489,45.723333,233.093104,7.519132,402.0,18.0,66.0,72.0,5.583333,3.694444


In [None]:
# Drop a class column left over from a previous run so this cell can be re-executed
train_desc_add = train_desc_add.drop(columns=['Bioavailability_class'], errors='ignore')

# removal of rows where symbols '<', '>', '≈' are contained
# (one pass with a character class; astype(str) keeps the .str accessor usable
# even if the column was read as numeric)
has_qualifier = train_desc_add['Bioavailability'].astype(str).str.contains('[<>≈]')
train_desc_add = train_desc_add[~has_qualifier]

# create label list for train_add
column = train_desc_add['Bioavailability']

# The remaining values are converted to float before comparison
bioavailability_pct = column.astype(float)
if bioavailability_pct.isna().any():
    # A missing value fits no class; fail here with a clear message rather than
    # producing a label list shorter than the frame.
    raise ValueError('train_desc_add: Bioavailability contains missing values, cannot assign classes')

# class assignment (0: <= 35%, 1: > 35% & <= 70%, 2: > 70%)
# pd.cut uses right-closed bins: (-inf, 35], (35, 70], (70, inf]
label = pd.cut(
    bioavailability_pct,
    bins=[-np.inf, 35, 70, np.inf],
    labels=['0', '1', '2'],
).astype(object).tolist()

train_desc_add.insert(loc=2, column='Bioavailability_class', value=label)
train_desc_add

Unnamed: 0,SMILES,Bioavailability,Bioavailability_class,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,Nc1ccc(S(=O)(=O)Nc2ncccn2)cc1,90,2,13.093540,11.115492,0.0,0.0,21.139069,2.370239,4.740478,...,9.645817,49.622129,250.052447,9.261202,536.0,23.0,86.0,97.0,5.895833,3.708333
1,Nc1nc(Cl)nc2c1ncn2C1OC(CO)C(O)C1F,50,1,15.710828,14.057406,0.0,0.0,25.397628,2.520227,4.881224,...,10.082679,70.336447,303.053445,9.775918,753.0,34.0,110.0,135.0,7.500000,4.361111
2,Cc1cc(NS(=O)(=O)c2ccc(N)cc2)no1,99,2,13.202929,11.709699,0.0,0.0,20.712705,2.383498,4.742568,...,9.663770,63.242131,253.052112,9.037575,535.0,22.0,88.0,100.0,6.506944,3.625000
3,c1ccc(CC2=NCCN2)cc1,90,2,9.192388,8.248586,0.0,2.0,15.982234,2.246428,4.436582,...,8.907477,55.114662,160.100048,6.670835,209.0,11.0,58.0,64.0,2.722222,2.750000
4,CN1C(=O)CCC1c1cccnc1,97,2,9.996954,9.253985,0.0,0.0,16.886871,2.383381,4.647225,...,9.339525,58.137510,176.094963,7.043799,238.0,17.0,66.0,77.0,4.194444,2.916667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152,Clc1cccc(Cl)c1NC1=NCCN1,95,2,10.744501,9.870405,0.0,3.0,17.881594,2.335590,4.651027,...,9.342333,58.496842,229.017353,9.957276,301.0,17.0,70.0,80.0,4.444444,3.138889
1153,N=C(N)N=C(O)Cc1c(Cl)cccc1Cl,81,2,11.072670,10.311148,0.0,3.0,17.507310,2.318438,4.636876,...,9.306650,46.323687,245.012267,10.208844,384.0,19.0,70.0,77.0,6.805556,3.388889
1154,CN(C(=O)C(Cl)Cl)c1ccc(O)cc1,90,2,10.325124,9.704472,0.0,0.0,16.814625,2.307250,4.614501,...,9.282568,45.126593,233.001034,10.130480,312.0,19.0,66.0,74.0,6.555556,3.166667
1155,O=P([O-])([O-])C(Cl)(Cl)P(=O)(O)O,1,0,8.152948,8.723360,4.0,0.0,10.363081,2.449490,4.898979,...,9.682903,42.374286,241.871464,18.605497,136.0,18.0,56.0,64.0,8.187500,2.125000


## Definitive dataset

In [None]:
# Stack the two training tables row-wise (original row labels are kept)
df_train = pd.concat((train_desc, train_desc_add), axis=0)
df_train.shape

(1957, 1829)

In [None]:
# Number of rows whose SMILES already occurred in an earlier row
int(df_train.duplicated(subset=['SMILES']).sum())

140

In [None]:
# Keep only the first row of every SMILES (keep='first' is the default),
# then confirm that no repeated SMILES remains
df_train = df_train.drop_duplicates(subset='SMILES')
int(df_train.duplicated(subset='SMILES').sum())

0

In [None]:
# Discard rows containing any missing value and renumber the rows from 0
df_train = df_train.dropna(how='any').reset_index(drop=True)
df_train

Unnamed: 0,SMILES,Bioavailability,Bioavailability_class,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,CCOC(=O)O[C@]1(C(=O)OCCl)CC[C@H]2[C@@H]3CCC4=C...,0,0,25.041641,20.572813,0.0,0.0,39.956644,2.682354,5.300749,...,10.897128,83.824745,466.175831,7.399616,2580.0,68.0,180.0,227.0,12.326389,6.979167
1,Clc1ccc([C@H](Cn2ccnc2)OCc2csc3c(Cl)cccc23)c(C...,0,0,21.339172,17.474154,0.0,0.0,35.116773,2.437888,4.795308,...,10.178882,77.812555,435.997067,10.380883,1886.0,40.0,144.0,170.0,7.750000,5.888889
2,CS(=O)(=O)CCNCc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5ccc...,0,0,31.646473,21.395324,0.0,1.0,51.186875,2.446018,4.875514,...,10.537919,90.654153,580.134732,8.789920,6645.0,59.0,212.0,244.0,12.034722,8.569444
3,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,0,0,19.621501,16.811403,0.0,0.0,33.831740,2.579150,5.142419,...,10.369640,73.476522,344.108026,8.193048,1225.0,44.0,134.0,164.0,6.118056,5.611111
4,CCCCC(F)(F)[C@@]1(O)CC[C@H]2[C@@H](CC(=O)[C@@H...,0,0,20.387860,16.599062,1.0,0.0,32.777540,2.533516,5.006933,...,10.251571,76.398181,390.221781,6.613928,2140.0,42.0,136.0,158.0,10.930556,6.020833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1812,Cn1cnc([N+](=O)[O-])c1Sc1ncnc2[nH]cnc12,60,1,14.975211,13.540325,0.0,0.0,24.469012,2.441186,4.779117,...,9.878836,68.367678,277.038193,10.655315,676.0,28.0,102.0,122.0,6.027778,4.138889
1813,Clc1cccc(Cl)c1NC1=NCCN1,95,2,10.744501,9.870405,0.0,3.0,17.881594,2.335590,4.651027,...,9.342333,58.496842,229.017353,9.957276,301.0,17.0,70.0,80.0,4.444444,3.138889
1814,N=C(N)N=C(O)Cc1c(Cl)cccc1Cl,81,2,11.072670,10.311148,0.0,3.0,17.507310,2.318438,4.636876,...,9.306650,46.323687,245.012267,10.208844,384.0,19.0,70.0,77.0,6.805556,3.388889
1815,O=P([O-])([O-])C(Cl)(Cl)P(=O)(O)O,1,0,8.152948,8.723360,4.0,0.0,10.363081,2.449490,4.898979,...,9.682903,42.374286,241.871464,18.605497,136.0,18.0,56.0,64.0,8.187500,2.125000


In [None]:
# Move 100 randomly chosen training compounds to the test set
# (random_state is fixed so the same compounds are drawn on every run)
sampled_compounds = df_train.sample(n=100, random_state=42)

# Add the sampled compounds to df_test (rows are renumbered from 0)
df_test = pd.concat([test_desc, sampled_compounds], ignore_index=True)

# Delete the sampled compounds from df_train
df_train = df_train.drop(sampled_compounds.index)

# test_desc has never been compared with df_train, so the same molecule can be
# present in both. Remove every training row whose SMILES occurs in the test
# set, otherwise test compounds would also be seen during training.
df_train = df_train[~df_train['SMILES'].isin(df_test['SMILES'])]

df_train = df_train.reset_index(drop=True)

In [None]:
# Number of rows whose SMILES already occurred in an earlier row
int(df_test.duplicated(subset=['SMILES']).sum())

5

In [None]:
# Keep only the first row of every SMILES (keep='first' is the default),
# then confirm that no repeated SMILES remains
df_test = df_test.drop_duplicates(subset='SMILES')
int(df_test.duplicated(subset='SMILES').sum())

0

In [None]:
# Discard rows containing any missing value and renumber the rows from 0
df_test = df_test.dropna(how='any').reset_index(drop=True)
df_test

Unnamed: 0,SMILES,Bioavailability,Bioavailability_class,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,NCC[C@H](O)C(=O)N[C@@H]1C[C@H](N)[C@@H](O[C@H]...,0,0,28.754989,23.660827,0.0,5.0,47.373524,2.474883,4.949765,...,10.582561,74.881895,552.311892,6.735511,4560.0,67.0,194.0,230.0,15.666667,8.611111
1,C=CC[N@@+]12CC[C@]34c5ccccc5N5/C=C6/[C@@H]7C[C...,0,0,41.448249,28.387504,0.0,2.0,68.136959,2.790534,5.415390,...,11.562867,109.064808,666.392280,6.663923,7363.0,126.0,316.0,420.0,12.527778,10.666667
2,CC(=O)OCC1=C(C(=O)O)N2C(=O)[C@@H](NC(=O)CSc3cc...,0,0,21.586461,16.990294,1.0,0.0,35.446415,2.571426,5.142851,...,10.453370,63.776322,423.055877,9.401242,2292.0,41.0,144.0,169.0,10.111111,6.194444
3,CO/N=C(/C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CSc3nc...,0,0,28.754989,22.911571,2.0,0.0,46.360396,2.575082,5.149065,...,10.721967,89.984830,584.027631,10.246099,4854.0,57.0,196.0,234.0,13.916667,8.111111
4,CN1C[C@@H](C(=O)N[C@@]2(C)O[C@]3(O)[C@H]4CCCN4...,1,0,35.440625,26.286287,0.0,1.0,56.940377,2.669553,5.185190,...,11.147080,99.273521,583.279469,7.290993,6191.0,88.0,258.0,326.0,12.652778,8.875000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,[H]/N=C(/N)N/C(=N/[H])N(C)C,52,1,7.358797,7.884672,0.0,5.0,12.617971,2.169236,4.338473,...,8.548886,39.497463,129.101445,6.455072,168.0,12.0,44.0,46.0,6.083333,2.777778
171,COC1C=COC2(C)Oc3c(C)c(O)c4c(O)c(c5c(c4c3C2=O)N...,20,0,47.707791,33.594676,0.0,1.0,76.496594,2.670505,5.236782,...,11.262835,117.325136,846.441509,6.881638,15031.0,121.0,334.0,408.0,25.041667,13.138889
172,CC(C)C(=O)Nc1ccc([N+](=O)[O-])c(C(F)(F)F)c1,90,2,14.275804,13.128940,0.0,0.0,21.566428,2.416610,4.833220,...,9.798071,52.165203,276.072177,9.202406,706.0,28.0,94.0,106.0,9.729167,4.083333
173,CN1C2CCC1CC(OC(=O)c1cn(C)c3ccccc13)C2,66,1,17.763198,14.701784,0.0,1.0,29.215654,2.478209,4.811903,...,10.166659,72.992174,298.168128,6.776548,1058.0,36.0,124.0,151.0,6.500000,4.666667
