## Checking the accuracy of the models: Random Forest vs. gradient boosting (labelled "LightGBM" below; the code uses scikit-learn's GradientBoostingRegressor)

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

In [15]:
# Read the training data from dataset.csv (path is relative to the working directory)
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')

# calculate molecular descriptors
def calculate_descriptors(smiles):
    """Compute a small set of RDKit descriptors for one molecule.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A dict with the keys ``MolWt``, ``MolLogP``, ``TPSA``,
        ``NumHDonors``, ``NumHAcceptors`` and ``NumRotatableBonds``, or
        ``None`` when ``smiles`` is not a string (e.g. an empty CSV cell
        read as NaN) or when RDKit cannot parse it.
    """
    # Non-string input (None, NaN from a missing value) would make
    # MolFromSmiles raise instead of returning None, so reject it up front
    # and let callers skip the row the same way as an unparseable string.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    return {
        'MolWt': Descriptors.MolWt(mol),
        'MolLogP': Descriptors.MolLogP(mol),
        'TPSA': Descriptors.TPSA(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
    }

# Calculate descriptors for molecules in the training dataset.
# Descriptors and logBB targets are collected in the same pass so that a row
# whose structure cannot be parsed is dropped from BOTH; taking the whole
# 'logBB' column afterwards would leave features and targets with different
# lengths (or shifted rows) as soon as one molecule is skipped.
# NOTE(review): the 'NAME' column is passed to the SMILES parser - presumably
# it holds SMILES strings; confirm against the dataset.
descriptor_list = []
target_values = []
for smiles, log_bb in zip(dataset['NAME'], dataset['logBB']):
    descriptors = calculate_descriptors(smiles)
    if descriptors is not None:
        descriptor_list.append(descriptors)
        target_values.append(log_bb)

# Convert descriptor data into DataFrame
descriptor_df = pd.DataFrame(descriptor_list)

# target logBB values, one per retained molecule, in the same order as descriptor_df
target_list = pd.Series(target_values, name='logBB')

# Train-test split (20% held out, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    descriptor_df, target_list, test_size=0.2, random_state=42
)

# Gradient-boosting model.
# NOTE(review): the labels in this cell say "lightGBM", but the estimator is
# scikit-learn's GradientBoostingRegressor - confirm which one is intended.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Prediction of logBB values for the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error on the held-out test set
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error for lightGBM algorithm:", mse)


Mean Squared Error for lightGBM algorithm: 0.4254033992432911


In [16]:

# Read the training data from dataset.csv (path is relative to the working directory)
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')

# Calculate molecular descriptors
def calculate_descriptors(smiles):
    """Compute a small set of RDKit descriptors for one molecule.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A dict with the keys ``MolWt``, ``MolLogP``, ``TPSA``,
        ``NumHDonors``, ``NumHAcceptors`` and ``NumRotatableBonds``, or
        ``None`` when ``smiles`` is not a string (e.g. an empty CSV cell
        read as NaN) or when RDKit cannot parse it.
    """
    # Non-string input (None, NaN from a missing value) would make
    # MolFromSmiles raise instead of returning None, so reject it up front
    # and let callers skip the row the same way as an unparseable string.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    return {
        'MolWt': Descriptors.MolWt(mol),
        'MolLogP': Descriptors.MolLogP(mol),
        'TPSA': Descriptors.TPSA(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
    }

# Calculate descriptors for molecules in the training dataset.
# Descriptors and logBB targets are collected in the same pass so that a row
# whose structure cannot be parsed is dropped from BOTH; taking the whole
# 'logBB' column afterwards would leave features and targets with different
# lengths (or shifted rows) as soon as one molecule is skipped.
# NOTE(review): the 'NAME' column is passed to the SMILES parser - presumably
# it holds SMILES strings; confirm against the dataset.
descriptor_list = []
target_values = []
for smiles, log_bb in zip(dataset['NAME'], dataset['logBB']):
    descriptors = calculate_descriptors(smiles)
    if descriptors is not None:
        descriptor_list.append(descriptors)
        target_values.append(log_bb)

# Convert descriptor data into DataFrame
descriptor_df = pd.DataFrame(descriptor_list)

# target logBB values, one per retained molecule, in the same order as descriptor_df
target_list = pd.Series(target_values, name='logBB')

# Train-test split (20% held out, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    descriptor_df, target_list, test_size=0.2, random_state=42
)

# Random forest model with 100 trees and a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict logBB values for the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error on the held-out test set
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error for random forest algorithm:", mse)


Mean Squared Error for random forest algorithm: 0.40871410937508934


### MSE measures the average squared difference between the actual and predicted values, so lower is better. The random forest model achieved the lower MSE (≈0.41 vs. ≈0.43), so the random forest algorithm is used to predict logBB for the ZINC20 dataset.