In [49]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [50]:
# training dataset: with LogBB values
dataset = pd.read_csv('dataset.csv')

# test dataset: without logBB values
zinc20 = pd.read_csv('world.csv')

In [51]:
dataset.tail()

Unnamed: 0,NAME,AATS0Z,AATS0are,AATS0d,AATS0dv,AATS0i,AATS0m,AATS0p,AATS0pe,AATS0se,...,piPC10,piPC2,piPC3,piPC4,piPC5,piPC6,piPC7,piPC8,piPC9,logBB
995,Nc1c(Cl)cc(C2=NCCn3nc4cc(C(F)(F)F)ccc4c32)cc1Cl,44.324324,7.125335,4.297297,12.032699,169.77133,184.991187,1.826421,7.249765,8.514856,...,8.285724,4.378897,5.036141,5.691415,6.293535,6.729955,7.222401,7.667646,7.931039,-0.4
996,CCOC(=O)C1=CCCC[C@H]1S(=O)(=O)Nc1ccc(F)cc1Cl,37.2,6.630685,3.325,8.801235,164.440096,151.16971,1.666185,6.721897,8.19374,...,5.557552,4.051785,4.421848,4.874243,5.264954,5.166214,5.345469,5.394905,5.534747,-0.2
997,CCC(=O)Nc1cc(O)nc2ncnn12,27.916667,6.882688,3.541667,11.0,171.071525,110.673908,1.402047,6.893967,8.270913,...,5.645157,3.766997,4.36469,5.010219,5.469904,5.813244,6.045912,6.343791,5.837833,-0.3
998,N#Cc1ccc2c(c1)CCN(CCC1CCC(NC(=O)/C=C/c3cccc4cc...,19.913043,5.826226,3.15942,5.710145,161.180325,78.307565,1.511117,5.919348,7.373935,...,7.000911,4.474492,5.045842,5.574765,6.046337,6.314595,6.612723,6.833861,6.845244,-0.2
999,CCCc1nnnn1-c1ccc(OC)c(CN[C@H]2CCCN[C@H]2c2cccc...,20.266667,5.96249,3.066667,6.066667,165.27819,79.695017,1.423105,6.034012,7.492455,...,6.992491,4.280132,4.847135,5.385928,5.803251,5.995519,6.351049,6.631343,6.848785,-0.17


In [52]:
zinc20.tail()

Unnamed: 0,zinc_id,smiles
5898,ZINC000000057490,Nc1ccc(S(=O)(=O)Nc2ccnn2-c2ccccc2)cc1
5899,ZINC000095618608,C=CC[C@@H]1/C=C(/C)C[C@H](C)C[C@H](OC)[C@H]2O[...
5900,ZINC000334138310,CO[C@H]1C=CO[C@@]2(C)Oc3c(C)c(O)c4c(O)c(c5c(nc...
5901,ZINC000005352878,CC(C)(C)NC[C@H](O)COc1ccc(O)c2c1CCC(O)=N2
5902,ZINC000003871880,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](C(=O)O)c3ccccc...


In [53]:
# function to calculate molecular descriptors
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    descriptors = {}
    descriptors['MolWt'] = Descriptors.MolWt(mol)
    descriptors['MolLogP'] = Descriptors.MolLogP(mol)
    descriptors['TPSA'] = Descriptors.TPSA(mol)
    descriptors['NumHDonors'] = Descriptors.NumHDonors(mol)
    descriptors['NumHAcceptors'] = Descriptors.NumHAcceptors(mol)
    descriptors['NumRotatableBonds'] = Descriptors.NumRotatableBonds(mol)
    return descriptors

In [54]:
# Calculate descriptors for molecules in the test dataset
test_descriptor_list = []
for index, row in zinc20.iterrows():
    descriptors_test = calculate_descriptors(row['smiles'])
    if descriptors_test is not None:
        test_descriptor_list.append(descriptors_test)

# Convert test descriptor list into DataFrame
test_descriptor_df = pd.DataFrame(test_descriptor_list)

In [55]:
# Calculate descriptors for molecules in the training dataset
train_descriptor_list = []
for index, row in dataset.iterrows():
    descriptors_training = calculate_descriptors(row['NAME'])
    if descriptors_training is not None:
        train_descriptor_list.append(descriptors_training)

# Convert training descriptor data into DataFrame
train_descriptor_df = pd.DataFrame(train_descriptor_list)

In [56]:
# target logBB values
if 'logBB' in dataset.columns:
    target_list = dataset['logBB']
else:
    target_list = None
    print('logBB is not available')
    

In [57]:

# Train-test split: features are in train_descriptor_df and the target_list contains logBB values
if target_list is not None:
    X_train, X_test, y_train, y_test = train_test_split(train_descriptor_df, target_list, test_size=0.2, random_state=42)
    
    # Train a Random Forest regressor model
    RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Predict logBB values for the test set
    y_pred = model.predict(test_descriptor_df)
    
    # Create DataFrame with molecule IDs and predicted logBB values
    results_df = pd.DataFrame({'ID': zinc20['zinc_id'], 'Predicted_logBB': y_pred})
    print(results_df)

else:
    print("logBB values are not available in the training dataset. You'll need to ensure you have target values for training.")



                    ID  Predicted_logBB
0     ZINC000004215648        -0.728547
1     ZINC000195282482        -1.034816
2     ZINC000000601249        -0.596541
3     ZINC000001542906        -0.350069
4     ZINC000095618735        -0.858744
...                ...              ...
5898  ZINC000000057490        -0.546094
5899  ZINC000095618608         0.028716
5900  ZINC000334138310        -0.026223
5901  ZINC000005352878        -0.390440
5902  ZINC000003871880        -0.990526

[5903 rows x 2 columns]
