In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from rdkit import Chem

from rdkit import Chem
from rdkit.Chem import Descriptors, Draw

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the PubChem property export; low_memory=False makes pandas infer each column's dtype from the whole file.
pubchem_properties_csv = 'datasets/pubchem_chemical_and_physical_properties.csv'
dataset_chemical_physical = pd.read_csv(pubchem_properties_csv, low_memory=False)

In [None]:
# Keep only the identifier columns and the six physico-chemical properties the rule checks below rely on.
working_columns = ['cmpdname', 'smiles', 'mw', 'xlogp', 'hbonddonor', 'hbondacc', 'polararea', 'rotbonds']
dataset_chemical_physical_for_work = dataset_chemical_physical[working_columns]

In [None]:
# Count, per compound, how many Lipinski-style thresholds are exceeded.
# Vectorised and addressed by column name instead of a Python loop over positional `.iloc` lookups:
# same counts, but far faster on a large table and robust to column reordering.
lipinski_properties = dataset_chemical_physical_for_work[['mw', 'xlogp', 'hbonddonor', 'hbondacc']].astype(float)

# One boolean column per criterion. Comparisons against NaN are False, so a missing value
# contributes no violation (rows with missing values are removed later by dropna()).
lipinski_flags = pd.concat(
    [
        lipinski_properties['mw'] > 500,
        lipinski_properties['xlogp'] > 5,
        # NOTE(review): the classical rule of five has no lower logP bound; this check is kept
        # so the existing labels do not change - confirm it is intended.
        lipinski_properties['xlogp'] < 0,
        lipinski_properties['hbonddonor'] > 5,
        lipinski_properties['hbondacc'] > 10,
    ],
    axis=1,
)

# Plain list of Python ints, one entry per row, as the following cells expect.
violations_lipinski = lipinski_flags.sum(axis=1).astype(int).tolist()

In [None]:
# Count, per compound, how many of the "beyond rule of five" (bRo5) thresholds used here are exceeded.
# Vectorised and addressed by column name instead of a Python loop over positional `.iloc` lookups:
# same counts, but far faster on a large table and robust to column reordering.
bro_properties = dataset_chemical_physical_for_work[
    ['mw', 'xlogp', 'hbonddonor', 'hbondacc', 'polararea', 'rotbonds']
].astype(float)

# One boolean column per criterion. Comparisons against NaN are False, so a missing value
# contributes no violation (rows with missing values are removed later by dropna()).
bro_flags = pd.concat(
    [
        bro_properties['mw'] > 1000,
        bro_properties['xlogp'] > 10,
        bro_properties['xlogp'] < -2,
        bro_properties['hbonddonor'] > 6,
        bro_properties['hbondacc'] > 15,
        bro_properties['polararea'] > 250,
        bro_properties['rotbonds'] > 20,
    ],
    axis=1,
)

# Plain list of Python ints, one entry per row, as the following cells expect.
violations_bro = bro_flags.sum(axis=1).astype(int).tolist()

In [None]:
def _muegge_structure_violations(smiles):
    """Return how many structural Muegge criteria a SMILES string violates (0-3).

    The criteria are: fewer than 5 carbon atoms, fewer than 2 heteroatoms, more than 7 rings.
    Atoms and rings are counted on the parsed RDKit molecule. The previous character scan
    counted digits, '@', '/' and the second letter of 'Cl'/'Br' as heteroatoms, ignored
    aromatic carbons ('c'), and never reached its ring-counting branch.

    A missing (non-string) or unparseable SMILES yields 0 because the structure cannot be
    evaluated; rows with a missing SMILES are removed later by dropna() anyway.
    """
    if not isinstance(smiles, str):
        return 0

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # RDKit could not parse the string; the structural criteria are left unevaluated.
        return 0

    atomic_numbers = [atom.GetAtomicNum() for atom in molecule.GetAtoms()]
    count_carbon = sum(1 for number in atomic_numbers if number == 6)
    # Heteroatom = anything that is neither carbon nor hydrogen (dummy atoms, Z == 0, are ignored).
    count_heteroatoms = sum(1 for number in atomic_numbers if number not in (0, 1, 6))
    count_rings = molecule.GetRingInfo().NumRings()

    return int(count_carbon < 5) + int(count_heteroatoms < 2) + int(count_rings > 7)


# Property-based Muegge criteria, evaluated for the whole table at once.
muegge_properties = dataset_chemical_physical_for_work[
    ['mw', 'xlogp', 'hbonddonor', 'hbondacc', 'polararea', 'rotbonds']
].astype(float)

# Comparisons against NaN are False, so a missing value contributes no violation.
muegge_property_flags = pd.concat(
    [
        muegge_properties['mw'] > 600,
        muegge_properties['mw'] < 200,
        muegge_properties['xlogp'] > 5,
        muegge_properties['xlogp'] < -2,
        muegge_properties['hbonddonor'] > 5,
        muegge_properties['hbondacc'] > 10,
        muegge_properties['polararea'] > 150,
        muegge_properties['rotbonds'] > 15,
    ],
    axis=1,
)
muegge_property_violations = muegge_property_flags.sum(axis=1).astype(int)

# Structure-based criteria need one RDKit parse per compound, so this is the slow step of the cell.
muegge_structure_violations = dataset_chemical_physical_for_work['smiles'].map(_muegge_structure_violations)

# Plain list of Python ints, one entry per row, as the following cells expect.
violations_muegge = (muegge_property_violations + muegge_structure_violations).astype(int).tolist()

In [None]:
# Wrap each list of violation counts under the column name it will carry in the combined table.
violations_lipinski_dict = dict(nviolations_lipinski_rule=violations_lipinski)
violations_bro_dict = dict(nviolations_bro_rule=violations_bro)
violations_muegge_dict = dict(nviolations_muegge_rule=violations_muegge)

In [None]:
# Append the three violation-count columns to the working table (column-wise, aligned on the row index).
violation_count_frames = [
    pd.DataFrame(counts)
    for counts in (violations_lipinski_dict, violations_bro_dict, violations_muegge_dict)
]
dataset_with_nviolations_con = pd.concat(
    [dataset_chemical_physical_for_work, *violation_count_frames],
    axis=1,
)

In [None]:
# Discard every row that has a missing value in any column.
dataset_with_nviolations = dataset_with_nviolations_con.dropna(axis=0, how='any')

In [None]:
# SMILES strings of the compounds that survived the missing-value filter, as a plain list.
list_of_smiles = dataset_with_nviolations['smiles'].tolist()

In [None]:
# Parse 26 hand-entered SMILES strings into RDKit molecule objects, one variable per molecule
# (molecule_ml1 ... molecule_ml26); later cells compute descriptors from these objects.
# Chem.MolFromSmiles returns None rather than raising when a string cannot be parsed, so a typo
# in one of these literals would not fail in this cell.
molecule_ml1 = Chem.MolFromSmiles('N[C@@H](CCCNC(=N)N)C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)O')
molecule_ml2 = Chem.MolFromSmiles('NC(CCNC(=N)N)C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)O')
molecule_ml3 = Chem.MolFromSmiles('NC(CNC(=N)N)C(=O)NCC(=O)N[C@@H](CC(=O)O)C(=O)O')
molecule_ml4 = Chem.MolFromSmiles('N[C@@H](CCONC(=N)N)C(=O)OCC(=O)N[C@@H](CC(=O)O)C(=O)O')
molecule_ml5 = Chem.MolFromSmiles('NCCNC(=O)[C@H](CC(=O)O)NC(=O)CNC(=O)[C@@H](N)CCCNC(=N)N')
molecule_ml6 = Chem.MolFromSmiles('NCCCNC(=O)[C@H](CC(=O)O)NC(=O)CNC(=O)[C@@H](N)CCCNC(=N)N')
molecule_ml7 = Chem.MolFromSmiles('NCCNC(=O)[C@H](CC(=O)O)NC(=O)CNC(=O)C(N)CCNC(=N)N')
molecule_ml8 = Chem.MolFromSmiles('NCCCNC(=O)[C@H](CC(=O)O)NC(=O)CNC(=O)C(N)CCNC(=N)N')
molecule_ml9 = Chem.MolFromSmiles('NCCNC(=O)[C@H](CC(=O)O)NC(=O)CNC(=O)C(N)CNC(=N)N')
molecule_ml10 = Chem.MolFromSmiles('NCCCNC(=O)[C@H](CC(=O)O)NC(=O)CNC(=O)C(N)CNC(=N)N')
molecule_ml11 = Chem.MolFromSmiles('NCCNC(=O)[C@H](CC(=O)O)NC(=O)COC(=O)[C@@H](N)CCONC(=N)N')
molecule_ml12 = Chem.MolFromSmiles('NCCCNC(=O)[C@H](CC(=O)O)NC(=O)COC(=O)[C@@H](N)CCONC(=N)N')
molecule_ml13 = Chem.MolFromSmiles('CC[C@H](C)[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@@H](NC(=O)[C@H](C)N)C(C)C)C(=O)O')
molecule_ml14 = Chem.MolFromSmiles('CC[C@H](C)[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@H](Cc2ccc(F)cc2)NC(=O)[C@H](C)N)C(=O)O')
molecule_ml15 = Chem.MolFromSmiles('C[C@H](N)C(=O)N[C@@H](Cc1ccc(F)cc1)C(=O)N2CCC[C@H]2C(=O)N[C@H](Cc3ccc(F)cc3)C(=O)O')
molecule_ml16 = Chem.MolFromSmiles('CC(C)[C@H](NC(=O)[C@H](C)N)C(=O)N1CCC[C@H]1C(=O)N[C@H](Cc2ccc(F)cc2)C(=O)O')
molecule_ml17 = Chem.MolFromSmiles('CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)C2CCCN2C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](N)CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O')
molecule_ml18 = Chem.MolFromSmiles('CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)C2CCCN2C(=O)[C@H](CCONC(=N)N)NC(=O)[C@@H](N)CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)O')
molecule_ml19 = Chem.MolFromSmiles('CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)C2CCCN2C(=O)[C@H](CCCCN)NC(=O)[C@@H](N)CCONC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O')
molecule_ml20 = Chem.MolFromSmiles('CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)C2CCCN2C(=O)[C@H](CCONC(=N)N)NC(=O)[C@@H](N)CCONC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O')
molecule_ml21 = Chem.MolFromSmiles('CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)C2CCCN2C(=O)[C@H](CCONC(=N)N)NC(=O)[C@@H](N)CCCNC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O')
molecule_ml22 = Chem.MolFromSmiles('CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)C2CCCN2C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](N)CCONC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O')
molecule_ml23 = Chem.MolFromSmiles('CC[C@@H](C)[C@H](NC(=O)[C@H](Cc1ccc(F)cc1)NC(=O)C2CCCN2C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](N)CCONC(=N)N)C(=O)N[C@@H](CC(C)C)C(=O)O')
molecule_ml24 = Chem.MolFromSmiles('CCC(C)[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@@H](NC(=O)[C@H](C)N)C(C)C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)O')
molecule_ml25 = Chem.MolFromSmiles('CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)C2CCCN2C(=O)[C@H](CCONC(=N)N)NC(=O)[C@H](CCONC(=N)N)NC(=O)[C@@H](NC(=O)[C@@H]3CCCN3C(=O)[C@@H](NC(=O)[C@H](C)N)C(C)C)C(C)CC)C(=O)N[C@@H](CC(C)C)C(=O)O')
molecule_ml26 = Chem.MolFromSmiles('CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)C2CCCN2C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H](NC(=O)[C@@H]3CCCN3C(=O)[C@@H](NC(=O)[C@H](C)N)C(C)C)C(C)CC)C(=O)N[C@@H](CC(C)C)C(=O)O')

In [None]:
# Compute the full RDKit descriptor dictionary for each of the 26 molecules, in ML1..ML26 order,
# and bind every result to its own descriptors_mlN name.
(
    descriptors_ml1, descriptors_ml2, descriptors_ml3, descriptors_ml4, descriptors_ml5,
    descriptors_ml6, descriptors_ml7, descriptors_ml8, descriptors_ml9, descriptors_ml10,
    descriptors_ml11, descriptors_ml12, descriptors_ml13, descriptors_ml14, descriptors_ml15,
    descriptors_ml16, descriptors_ml17, descriptors_ml18, descriptors_ml19, descriptors_ml20,
    descriptors_ml21, descriptors_ml22, descriptors_ml23, descriptors_ml24, descriptors_ml25,
    descriptors_ml26,
) = [
    Descriptors.CalcMolDescriptors(parsed_molecule)
    for parsed_molecule in (
        molecule_ml1, molecule_ml2, molecule_ml3, molecule_ml4, molecule_ml5,
        molecule_ml6, molecule_ml7, molecule_ml8, molecule_ml9, molecule_ml10,
        molecule_ml11, molecule_ml12, molecule_ml13, molecule_ml14, molecule_ml15,
        molecule_ml16, molecule_ml17, molecule_ml18, molecule_ml19, molecule_ml20,
        molecule_ml21, molecule_ml22, molecule_ml23, molecule_ml24, molecule_ml25,
        molecule_ml26,
    )
]

In [None]:
# Accumulator: one list per descriptor of interest, filled one molecule at a time by the function below.
molecular_descriptors_test_peptides = {
    descriptor_name: []
    for descriptor_name in (
        'MolWt',
        'NHOHCount',
        'NOCount',
        'NumHAcceptors',
        'NumHDonors',
        'NumRotatableBonds',
        'MolLogP',
        'TPSA',
    )
}

def molecular_descriptors_calculation_peptides(descriptors_dict):
    """Append one molecule's selected descriptor values to the module-level accumulator.

    Parameters
    ----------
    descriptors_dict : dict
        Mapping of descriptor name to value for a single molecule. Only the names that are
        keys of ``molecular_descriptors_test_peptides`` are used; every other entry is ignored.

    Returns
    -------
    None
        The function works by side effect: each matching value is appended to the
        corresponding list in ``molecular_descriptors_test_peptides``.
    """
    for name, computed_value in descriptors_dict.items():
        collected_values = molecular_descriptors_test_peptides.get(name)
        if collected_values is not None:
            collected_values.append(computed_value)

In [None]:
# Feed the 26 descriptor dictionaries to the collector in ML1..ML26 order.
# Running this cell again without re-creating the accumulator appends every value a second time.
for molecule_descriptors in (
    descriptors_ml1, descriptors_ml2, descriptors_ml3, descriptors_ml4, descriptors_ml5,
    descriptors_ml6, descriptors_ml7, descriptors_ml8, descriptors_ml9, descriptors_ml10,
    descriptors_ml11, descriptors_ml12, descriptors_ml13, descriptors_ml14, descriptors_ml15,
    descriptors_ml16, descriptors_ml17, descriptors_ml18, descriptors_ml19, descriptors_ml20,
    descriptors_ml21, descriptors_ml22, descriptors_ml23, descriptors_ml24, descriptors_ml25,
    descriptors_ml26,
):
    molecular_descriptors_calculation_peptides(molecule_descriptors)

In [None]:
# orient='index' builds one row per descriptor; transposing gives one row per molecule
# and one column per descriptor.
descriptors_by_name = pd.DataFrame.from_dict(molecular_descriptors_test_peptides, orient='index')
test_dataset = descriptors_by_name.transpose()

In [None]:
# Rename the RDKit descriptor columns to the column names used by the training table,
# then take the six-feature and four-feature views the models are fed with.
rdkit_to_training_names = {
    'MolWt': 'mw',
    'MolLogP': 'xlogp',
    'NumHDonors': 'hbonddonor',
    'NumHAcceptors': 'hbondacc',
    'TPSA': 'polararea',
    'NumRotatableBonds': 'rotbonds',
}
test_dataset = test_dataset.rename(columns=rdkit_to_training_names)

four_feature_names = ['mw', 'xlogp', 'hbonddonor', 'hbondacc']
test_dataset_for_work = test_dataset[four_feature_names + ['polararea', 'rotbonds']]
test_dataset_for_work_lipinski = test_dataset[four_feature_names]

In [None]:
# Display labels ML1 ... ML26, in the same order as the molecule_mlN variables.
peptide_names = ['ML' + str(number) for number in range(1, 27)]

In [None]:
# Single-column mapping used to label the rows of the prediction tables.
tested_molecules = dict(tested_molecules=peptide_names)

In [None]:
# Train/test split for the Lipinski model.
# The target must be the Lipinski violation count; this split previously used
# 'nviolations_bro_rule', which trained the "Lipinski" models on bRo5 labels.
# The target is selected as a 1-D Series (single brackets), matching the other two splits.
attributes_lipinski_train, attributes_lipinski_test, target_lipinski_train, target_lipinski_test = train_test_split(
    dataset_with_nviolations[['mw', 'xlogp', 'hbonddonor', 'hbondacc']],
    dataset_with_nviolations['nviolations_lipinski_rule'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# 80/20 split for the bRo5 model: six property features, bRo5 violation count as target.
bro_feature_columns = ['mw', 'xlogp', 'hbonddonor', 'hbondacc', 'polararea', 'rotbonds']
bro_split = train_test_split(
    dataset_with_nviolations[bro_feature_columns],
    dataset_with_nviolations['nviolations_bro_rule'],
    test_size=0.2,
    random_state=42,
)
attributes_bro_train, attributes_bro_test, target_bro_train, target_bro_test = bro_split

In [None]:
# 80/20 split for the Muegge model: six property features, Muegge violation count as target.
muegge_feature_columns = ['mw', 'xlogp', 'hbonddonor', 'hbondacc', 'polararea', 'rotbonds']
muegge_split = train_test_split(
    dataset_with_nviolations[muegge_feature_columns],
    dataset_with_nviolations['nviolations_muegge_rule'],
    test_size=0.2,
    random_state=42,
)
attributes_muegge_train, attributes_muegge_test, target_muegge_train, target_muegge_test = muegge_split


In [None]:
# Hyper-parameter search for the Lipinski model, scored by mean squared error (5-fold CV).
# The forest is seeded (42, as in the train/test splits) so the search is reproducible.
rf_lipinski = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=rf_lipinski, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=1, verbose=2)

# np.ravel gives a 1-D target whether the split produced a Series or a one-column DataFrame.
grid_search.fit(attributes_lipinski_train, np.ravel(target_lipinski_train))

print("Best parameters:", grid_search.best_params_)
# The scorer is the negated MSE (scikit-learn maximises scores); flip the sign to report the MSE itself.
print("Best Value MSE:", -grid_search.best_score_)

In [None]:
# Hyper-parameter search for the Lipinski model, scored by mean absolute error (5-fold CV).
# The forest is seeded (42, as in the train/test splits) so the search is reproducible.
rf_lipinski = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=rf_lipinski, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=1, verbose=2)

# np.ravel gives a 1-D target whether the split produced a Series or a one-column DataFrame.
grid_search.fit(attributes_lipinski_train, np.ravel(target_lipinski_train))

print("Best parameters:", grid_search.best_params_)
# The scorer is the negated MAE (scikit-learn maximises scores); flip the sign to report the MAE itself.
print("Best Value MAE:", -grid_search.best_score_)

In [None]:
# Hyper-parameter search for the Lipinski model, scored by R² (5-fold CV).
# The forest is seeded (42, as in the train/test splits) so the search is reproducible.
rf_lipinski = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=rf_lipinski, param_grid=param_grid, cv=5, scoring='r2', n_jobs=1, verbose=2)

# np.ravel gives a 1-D target whether the split produced a Series or a one-column DataFrame.
grid_search.fit(attributes_lipinski_train, np.ravel(target_lipinski_train))

print("Best parameters:", grid_search.best_params_)
print("Best Value R2:", grid_search.best_score_)

In [None]:
# Hyper-parameter search for the Muegge model, scored by mean squared error (5-fold CV).
# The forest is seeded (42, as in the train/test splits) so the search is reproducible.
rf_muegge = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=rf_muegge, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=1, verbose=2)

grid_search.fit(attributes_muegge_train, target_muegge_train)

print("Best parameters:", grid_search.best_params_)
# The scorer is the negated MSE (scikit-learn maximises scores); flip the sign to report the MSE itself.
print("Best Value MSE:", -grid_search.best_score_)

In [None]:
# Hyper-parameter search for the Muegge model, scored by mean absolute error (5-fold CV).
# The forest is seeded (42, as in the train/test splits) so the search is reproducible.
rf_muegge = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=rf_muegge, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=1, verbose=2)

grid_search.fit(attributes_muegge_train, target_muegge_train)

print("Best parameters:", grid_search.best_params_)
# The scorer is the negated MAE (scikit-learn maximises scores); flip the sign to report the MAE itself.
print("Best Value MAE:", -grid_search.best_score_)

In [None]:
# Hyper-parameter search for the Muegge model, scored by R² (5-fold CV).
# The forest is seeded (42, as in the train/test splits) so the search is reproducible.
rf_muegge = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=rf_muegge, param_grid=param_grid, cv=5, scoring='r2', n_jobs=1, verbose=2)

grid_search.fit(attributes_muegge_train, target_muegge_train)

print("Best parameters:", grid_search.best_params_)
print("Best Value R2:", grid_search.best_score_)

In [None]:
# Hyper-parameter search for the bRo5 model, scored by mean squared error (5-fold CV).
# The forest is seeded (42, as in the train/test splits) so the search is reproducible.
rf_bro = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=rf_bro, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=1, verbose=2)

grid_search.fit(attributes_bro_train, target_bro_train)

print("Best parameters:", grid_search.best_params_)
# The scorer is the negated MSE (scikit-learn maximises scores); flip the sign to report the MSE itself.
print("Best Value MSE:", -grid_search.best_score_)

In [None]:
# Hyper-parameter search for the bRo5 model, scored by mean absolute error (5-fold CV).
# The forest is seeded (42, as in the train/test splits) so the search is reproducible.
rf_bro = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=rf_bro, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=1, verbose=2)

grid_search.fit(attributes_bro_train, target_bro_train)

print("Best parameters:", grid_search.best_params_)
# The scorer is the negated MAE (scikit-learn maximises scores); flip the sign to report the MAE itself.
print("Best Value MAE:", -grid_search.best_score_)

In [None]:
# Hyper-parameter search for the bRo5 model, scored by R² (5-fold CV).
# The forest is seeded (42, as in the train/test splits) so the search is reproducible.
rf_bro = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=rf_bro, param_grid=param_grid, cv=5, scoring='r2', n_jobs=1, verbose=2)

grid_search.fit(attributes_bro_train, target_bro_train)

print("Best parameters:", grid_search.best_params_)
print("Best Value R2:", grid_search.best_score_)

In [None]:
# 10-tree forest for the Lipinski target; seeded (42, as in the splits) so reruns give the same model.
model_lipinski_10 = RandomForestRegressor(n_estimators=10, random_state=42)

In [None]:
# 20-tree forest for the Lipinski target; seeded (42, as in the splits) so reruns give the same model.
model_lipinski_20 = RandomForestRegressor(n_estimators=20, random_state=42)

In [None]:
# 30-tree forest for the Lipinski target; seeded (42, as in the splits) so reruns give the same model.
model_lipinski_30 = RandomForestRegressor(n_estimators=30, random_state=42)

In [None]:
# 10-tree forest for the bRo5 target; seeded (42, as in the splits) so reruns give the same model.
model_bro_10 = RandomForestRegressor(n_estimators=10, random_state=42)

In [None]:
# 20-tree forest for the bRo5 target; seeded (42, as in the splits) so reruns give the same model.
model_bro_20 = RandomForestRegressor(n_estimators=20, random_state=42)

In [None]:
# 30-tree forest for the bRo5 target; seeded (42, as in the splits) so reruns give the same model.
model_bro_30 = RandomForestRegressor(n_estimators=30, random_state=42)

In [None]:
# 10-, 20- and 30-tree forests for the Muegge target; seeded (42, as in the splits) so reruns give the same models.
model_muegge_10 = RandomForestRegressor(n_estimators=10, random_state=42)
model_muegge_20 = RandomForestRegressor(n_estimators=20, random_state=42)
model_muegge_30 = RandomForestRegressor(n_estimators=30, random_state=42)

In [None]:
# Fit the 10-tree Lipinski forest. np.ravel hands scikit-learn a 1-D target whether the split
# produced a Series or a one-column DataFrame (the latter triggers a column-vector warning).
model_lipinski_10.fit(attributes_lipinski_train, np.ravel(target_lipinski_train))

In [None]:
# Fit the 20-tree Lipinski forest. np.ravel hands scikit-learn a 1-D target whether the split
# produced a Series or a one-column DataFrame (the latter triggers a column-vector warning).
model_lipinski_20.fit(attributes_lipinski_train, np.ravel(target_lipinski_train))

In [None]:
# Fit the 30-tree Lipinski forest. np.ravel hands scikit-learn a 1-D target whether the split
# produced a Series or a one-column DataFrame (the latter triggers a column-vector warning).
model_lipinski_30.fit(attributes_lipinski_train, np.ravel(target_lipinski_train))

In [None]:
# Train the 10-tree bRo5 forest on the training split.
model_bro_10.fit(X=attributes_bro_train, y=target_bro_train)

In [None]:
# Train the 20-tree bRo5 forest on the training split.
model_bro_20.fit(X=attributes_bro_train, y=target_bro_train)

In [None]:
# Train the 30-tree bRo5 forest on the training split.
model_bro_30.fit(X=attributes_bro_train, y=target_bro_train)

In [None]:
# Train the 10-tree Muegge forest on the training split.
model_muegge_10.fit(X=attributes_muegge_train, y=target_muegge_train)

In [None]:
# Train the 20-tree Muegge forest on the training split.
model_muegge_20.fit(X=attributes_muegge_train, y=target_muegge_train)

In [None]:
# Train the 30-tree Muegge forest on the training split.
model_muegge_30.fit(X=attributes_muegge_train, y=target_muegge_train)

In [None]:
# Held-out predictions of the three Lipinski forests (10, 20 and 30 trees).
test_prediction_lipinski_10, test_prediction_lipinski_20, test_prediction_lipinski_30 = (
    fitted_forest.predict(attributes_lipinski_test)
    for fitted_forest in (model_lipinski_10, model_lipinski_20, model_lipinski_30)
)

In [None]:
# Held-out predictions of the three bRo5 forests (10, 20 and 30 trees).
test_prediction_bro_10, test_prediction_bro_20, test_prediction_bro_30 = (
    fitted_forest.predict(attributes_bro_test)
    for fitted_forest in (model_bro_10, model_bro_20, model_bro_30)
)

In [None]:
# Held-out predictions of the three Muegge forests (10, 20 and 30 trees).
# The "predicton" spelling is kept because later cells refer to these exact names.
test_predicton_muegge_10, test_predicton_muegge_20, test_predicton_muegge_30 = (
    fitted_forest.predict(attributes_muegge_test)
    for fitted_forest in (model_muegge_10, model_muegge_20, model_muegge_30)
)

In [None]:
# Actual vs. predicted violation counts on the held-out split (Lipinski, 10 trees).
fig, ax = plt.subplots()
ax.scatter(target_lipinski_test, test_prediction_lipinski_10, alpha=0.5, color='black')
ax.set(
    xlabel="Test Values",
    ylabel="Predicted Values",
    title="Scatter plot: Random Forest Regressor (Lipinski 10 Trees)",
)
ax.grid(True)
plt.show()

In [None]:
# Actual vs. predicted violation counts on the held-out split (Lipinski, 20 trees).
fig, ax = plt.subplots()
ax.scatter(target_lipinski_test, test_prediction_lipinski_20, alpha=0.5, color='black')
ax.set(
    xlabel="Test Values",
    ylabel="Predicted Values",
    title="Scatter plot: Random Forest Regressor (Lipinski 20 Trees)",
)
ax.grid(True)
plt.show()

In [None]:
# Actual vs. predicted violation counts on the held-out split (Lipinski, 30 trees).
fig, ax = plt.subplots()
ax.scatter(target_lipinski_test, test_prediction_lipinski_30, alpha=0.5, color='black')
ax.set(
    xlabel="Test Values",
    ylabel="Predicted Values",
    title="Scatter plot: Random Forest Regressor (Lipinski 30 Trees)",
)
ax.grid(True)
plt.show()

In [None]:
# Actual vs. predicted violation counts on the held-out split (Muegge, 10 trees).
fig, ax = plt.subplots()
ax.scatter(target_muegge_test, test_predicton_muegge_10, alpha=0.5, color='black')
ax.set(
    xlabel="Test Values",
    ylabel="Predicted Values",
    title="Scatter plot: Random Forest Regressor (Muegge 10 Trees)",
)
ax.grid(True)
plt.show()

In [None]:
# Actual vs. predicted violation counts on the held-out split (Muegge, 20 trees).
fig, ax = plt.subplots()
ax.scatter(target_muegge_test, test_predicton_muegge_20, alpha=0.5, color='black')
ax.set(
    xlabel="Test Values",
    ylabel="Predicted Values",
    title="Scatter plot: Random Forest Regressor (Muegge 20 Trees)",
)
ax.grid(True)
plt.show()

In [None]:
# Actual vs. predicted violation counts on the held-out split (Muegge, 30 trees).
fig, ax = plt.subplots()
ax.scatter(target_muegge_test, test_predicton_muegge_30, alpha=0.5, color='black')
ax.set(
    xlabel="Test Values",
    ylabel="Predicted Values",
    title="Scatter plot: Random Forest Regressor (Muegge 30 Trees)",
)
ax.grid(True)
plt.show()

In [None]:
# Actual vs. predicted violation counts on the held-out split (bRo5, 10 trees).
fig, ax = plt.subplots()
ax.scatter(target_bro_test, test_prediction_bro_10, alpha=0.5, color='black')
ax.set(
    xlabel="Test Values",
    ylabel="Predicted Values",
    title="Scatter plot: Random Forest Regressor (bRo5 10 Trees)",
)
ax.grid(True)
plt.show()

In [None]:
# Actual vs. predicted violation counts on the held-out split (bRo5, 20 trees).
fig, ax = plt.subplots()
ax.scatter(target_bro_test, test_prediction_bro_20, alpha=0.5, color='black')
ax.set(
    xlabel="Test Values",
    ylabel="Predicted Values",
    title="Scatter plot: Random Forest Regressor (bRo5 20 Trees)",
)
ax.grid(True)
plt.show()

In [None]:
# Actual vs. predicted violation counts on the held-out split (bRo5, 30 trees).
fig, ax = plt.subplots()
ax.scatter(target_bro_test, test_prediction_bro_30, alpha=0.5, color='black')
ax.set(
    xlabel="Test Values",
    ylabel="Predicted Values",
    title="Scatter plot: Random Forest Regressor (bRo5 30 Trees)",
)
ax.grid(True)
plt.show()

In [None]:
# Held-out mean squared error of the 10-, 20- and 30-tree Lipinski forests.
mse_lipinski_10, mse_lipinski_20, mse_lipinski_30 = (
    mean_squared_error(target_lipinski_test, held_out_prediction)
    for held_out_prediction in (test_prediction_lipinski_10, test_prediction_lipinski_20, test_prediction_lipinski_30)
)

In [None]:
# Report the Lipinski MSE values, one line per forest size.
for label, score in (
    ("MSE Lipinski 10:", mse_lipinski_10),
    ("MSE Lipinski 20:", mse_lipinski_20),
    ("MSE Lipinski 30:", mse_lipinski_30),
):
    print(label, score)

In [None]:
# Held-out mean squared error of the 10-, 20- and 30-tree bRo5 forests.
mse_bro_10, mse_bro_20, mse_bro_30 = (
    mean_squared_error(target_bro_test, held_out_prediction)
    for held_out_prediction in (test_prediction_bro_10, test_prediction_bro_20, test_prediction_bro_30)
)

In [None]:
# Report the bRo5 MSE values, one line per forest size.
for label, score in (
    ("MSE Bro 10:", mse_bro_10),
    ("MSE Bro 20:", mse_bro_20),
    ("MSE Bro 30:", mse_bro_30),
):
    print(label, score)

In [None]:
# Held-out mean squared error of the 10-, 20- and 30-tree Muegge forests.
mse_muegge_10, mse_muegge_20, mse_muegge_30 = (
    mean_squared_error(target_muegge_test, held_out_prediction)
    for held_out_prediction in (test_predicton_muegge_10, test_predicton_muegge_20, test_predicton_muegge_30)
)

In [None]:
# Report the Muegge MSE values, one line per forest size.
for label, score in (
    ("MSE Muegge 10:", mse_muegge_10),
    ("MSE Muegge 20:", mse_muegge_20),
    ("MSE Muegge 30:", mse_muegge_30),
):
    print(label, score)

In [None]:
# Held-out mean absolute error of the 10-, 20- and 30-tree Lipinski forests.
mae_lipinski_10, mae_lipinski_20, mae_lipinski_30 = (
    mean_absolute_error(target_lipinski_test, held_out_prediction)
    for held_out_prediction in (test_prediction_lipinski_10, test_prediction_lipinski_20, test_prediction_lipinski_30)
)

In [None]:
# Held-out mean absolute error of the 10-, 20- and 30-tree bRo5 forests.
mae_bro_10, mae_bro_20, mae_bro_30 = (
    mean_absolute_error(target_bro_test, held_out_prediction)
    for held_out_prediction in (test_prediction_bro_10, test_prediction_bro_20, test_prediction_bro_30)
)

In [None]:
# Held-out mean absolute error of the 10-, 20- and 30-tree Muegge forests.
mae_muegge_10, mae_muegge_20, mae_muegge_30 = (
    mean_absolute_error(target_muegge_test, held_out_prediction)
    for held_out_prediction in (test_predicton_muegge_10, test_predicton_muegge_20, test_predicton_muegge_30)
)

In [None]:
# Report the Lipinski MAE values, one line per forest size.
for label, score in (
    ("MAE Lipinski 10:", mae_lipinski_10),
    ("MAE Lipinski 20:", mae_lipinski_20),
    ("MAE Lipinski 30:", mae_lipinski_30),
):
    print(label, score)

In [None]:
# Report the bRo5 MAE values, one line per forest size.
for label, score in (
    ("MAE Bro 10:", mae_bro_10),
    ("MAE Bro 20:", mae_bro_20),
    ("MAE Bro 30:", mae_bro_30),
):
    print(label, score)

In [None]:
# Report the Muegge MAE values, one line per forest size.
for label, score in (
    ("MAE Muegge 10:", mae_muegge_10),
    ("MAE Muegge 20:", mae_muegge_20),
    ("MAE Muegge 30:", mae_muegge_30),
):
    print(label, score)

In [None]:
# Held-out R² of the 10-, 20- and 30-tree Lipinski forests.
r2_lipinski_10, r2_lipinski_20, r2_lipinski_30 = (
    r2_score(target_lipinski_test, held_out_prediction)
    for held_out_prediction in (test_prediction_lipinski_10, test_prediction_lipinski_20, test_prediction_lipinski_30)
)

In [None]:
# Held-out R² of the 10-, 20- and 30-tree bRo5 forests.
r2_bro_10, r2_bro_20, r2_bro_30 = (
    r2_score(target_bro_test, held_out_prediction)
    for held_out_prediction in (test_prediction_bro_10, test_prediction_bro_20, test_prediction_bro_30)
)

In [None]:
# Held-out R² of the 10-, 20- and 30-tree Muegge forests.
r2_muegge_10, r2_muegge_20, r2_muegge_30 = (
    r2_score(target_muegge_test, held_out_prediction)
    for held_out_prediction in (test_predicton_muegge_10, test_predicton_muegge_20, test_predicton_muegge_30)
)

In [None]:
# Report the Lipinski R² values, one line per forest size.
for label, score in (
    ("R² score Lipinski 10:", r2_lipinski_10),
    ("R² score Lipinski 20:", r2_lipinski_20),
    ("R² score Lipinski 30:", r2_lipinski_30),
):
    print(label, score)

In [None]:
# Report the bRo5 R² values, one line per forest size.
for label, score in (
    ("R² score Bro 10:", r2_bro_10),
    ("R² score Bro 20:", r2_bro_20),
    ("R² score Bro 30:", r2_bro_30),
):
    print(label, score)

In [None]:
# Report the Muegge R² values, one line per forest size.
for label, score in (
    ("R² score Muegge 10:", r2_muegge_10),
    ("R² score Muegge 20:", r2_muegge_20),
    ("R² score Muegge 30:", r2_muegge_30),
):
    print(label, score)

In [None]:
# Predicted Lipinski violation counts for the 26 test molecules (10-tree forest, four features).
prediction_lipinski_10 = model_lipinski_10.predict(X=test_dataset_for_work_lipinski)

In [None]:
# Predicted Lipinski violation counts for the 26 test molecules (20-tree forest, four features).
prediction_lipinski_20 = model_lipinski_20.predict(X=test_dataset_for_work_lipinski)

In [None]:
# Predicted Lipinski violation counts for the 26 test molecules (30-tree forest, four features).
prediction_lipinski_30 = model_lipinski_30.predict(X=test_dataset_for_work_lipinski)

In [None]:
# Predicted bRo5 violation counts for the 26 test molecules (10-tree forest, six features).
prediction_bro_10 = model_bro_10.predict(X=test_dataset_for_work)

In [None]:
# Predicted bRo5 violation counts for the 26 test molecules (20-tree forest, six features).
prediction_bro_20 = model_bro_20.predict(X=test_dataset_for_work)

In [None]:
# Predicted bRo5 violation counts for the 26 test molecules (30-tree forest, six features).
prediction_bro_30 = model_bro_30.predict(X=test_dataset_for_work)

In [None]:
# Predicted Muegge violation counts for the 26 test molecules (10-tree forest, six features).
prediction_muegge_10 = model_muegge_10.predict(X=test_dataset_for_work)

In [None]:
# Predicted Muegge violation counts for the 26 test molecules, from the 20-tree forest.
# This previously called model_muegge_10, so the "20" column just repeated the 10-tree predictions.
prediction_muegge_20 = model_muegge_20.predict(test_dataset_for_work)

In [None]:
# Predicted Muegge violation counts for the 26 test molecules, from the 30-tree forest.
# This previously called model_muegge_10, so the "30" column just repeated the 10-tree predictions.
prediction_muegge_30 = model_muegge_30.predict(test_dataset_for_work)

In [None]:
# Turn the continuous regressor output into whole violation counts.
# Rounded to the nearest integer: int() truncates toward zero, so a prediction of 2.9 was reported as 2.
predicted_violations_peptides_lipinski_10 = np.rint(prediction_lipinski_10).astype(int).tolist()

In [None]:
# Turn the continuous regressor output into whole violation counts.
# Rounded to the nearest integer: int() truncates toward zero, so a prediction of 2.9 was reported as 2.
predicted_violations_peptides_lipinski_20 = np.rint(prediction_lipinski_20).astype(int).tolist()

In [None]:
# Turn the continuous regressor output into whole violation counts.
# Rounded to the nearest integer: int() truncates toward zero, so a prediction of 2.9 was reported as 2.
predicted_violations_peptides_lipinski_30 = np.rint(prediction_lipinski_30).astype(int).tolist()

In [None]:
# Turn the continuous regressor output into whole violation counts.
# Rounded to the nearest integer: int() truncates toward zero, so a prediction of 2.9 was reported as 2.
predicted_violations_peptides_bro_10 = np.rint(prediction_bro_10).astype(int).tolist()

In [None]:
# Turn the continuous regressor output into whole violation counts.
# Rounded to the nearest integer: int() truncates toward zero, so a prediction of 2.9 was reported as 2.
predicted_violations_peptides_bro_20 = np.rint(prediction_bro_20).astype(int).tolist()

In [None]:
# Turn the continuous regressor output into whole violation counts.
# Rounded to the nearest integer: int() truncates toward zero, so a prediction of 2.9 was reported as 2.
predicted_violations_peptides_bro_30 = np.rint(prediction_bro_30).astype(int).tolist()

In [None]:
# Turn the continuous regressor output into whole violation counts.
# Rounded to the nearest integer: int() truncates toward zero, so a prediction of 2.9 was reported as 2.
predicted_violations_peptides_muegge_10 = np.rint(prediction_muegge_10).astype(int).tolist()

In [None]:
# Turn the continuous regressor output into whole violation counts.
# Rounded to the nearest integer: int() truncates toward zero, so a prediction of 2.9 was reported as 2.
predicted_violations_peptides_muegge_20 = np.rint(prediction_muegge_20).astype(int).tolist()

In [None]:
# Turn the continuous regressor output into whole violation counts.
# Rounded to the nearest integer: int() truncates toward zero, so a prediction of 2.9 was reported as 2.
predicted_violations_peptides_muegge_30 = np.rint(prediction_muegge_30).astype(int).tolist()

In [None]:
# Label each list of predicted Lipinski counts with the column name it gets in the results table.
lipinski_violations_dict_10 = dict(nviolations_lipinski_rule_10=predicted_violations_peptides_lipinski_10)
lipinski_violations_dict_20 = dict(nviolations_lipinski_rule_20=predicted_violations_peptides_lipinski_20)
lipinski_violations_dict_30 = dict(nviolations_lipinski_rule_30=predicted_violations_peptides_lipinski_30)

In [None]:
# Label each list of predicted bRo5 counts with the column name it gets in the results table.
bro_violations_dict_10 = dict(nviolations_bro_rule_10=predicted_violations_peptides_bro_10)
bro_violations_dict_20 = dict(nviolations_bro_rule_20=predicted_violations_peptides_bro_20)
bro_violations_dict_30 = dict(nviolations_bro_rule_30=predicted_violations_peptides_bro_30)

In [None]:
# Label each list of predicted Muegge counts with the column name it gets in the results table.
muegge_violations_10_dict = dict(nviolations_muegge_rule_10=predicted_violations_peptides_muegge_10)
muegge_violations_20_dict = dict(nviolations_muegge_rule_20=predicted_violations_peptides_muegge_20)
muegge_violations_30_dict = dict(nviolations_muegge_rule_30=predicted_violations_peptides_muegge_30)

In [None]:
# Results table for the Lipinski models: molecule label, the four input features,
# and the predicted counts of the 10-, 20- and 30-tree forests, side by side.
lipinski_prediction_frames = [
    pd.DataFrame(predicted_counts)
    for predicted_counts in (lipinski_violations_dict_10, lipinski_violations_dict_20, lipinski_violations_dict_30)
]
peptide_predictions_dataset_lipinski = pd.concat(
    [pd.DataFrame(tested_molecules), test_dataset_for_work_lipinski, *lipinski_prediction_frames],
    axis=1,
)

In [None]:
# Results table for the bRo5 models: molecule label, the six input features,
# and the predicted counts of the 10-, 20- and 30-tree forests, side by side.
bro_prediction_frames = [
    pd.DataFrame(predicted_counts)
    for predicted_counts in (bro_violations_dict_10, bro_violations_dict_20, bro_violations_dict_30)
]
peptide_predictions_dataset_bro = pd.concat(
    [pd.DataFrame(tested_molecules), test_dataset_for_work, *bro_prediction_frames],
    axis=1,
)

In [None]:
# Results table for the Muegge models: molecule label, the six input features,
# and the predicted counts of the 10-, 20- and 30-tree forests, side by side.
muegge_prediction_frames = [
    pd.DataFrame(predicted_counts)
    for predicted_counts in (muegge_violations_10_dict, muegge_violations_20_dict, muegge_violations_30_dict)
]
peptide_predictions_muegge_dataset = pd.concat(
    [pd.DataFrame(tested_molecules), test_dataset_for_work, *muegge_prediction_frames],
    axis=1,
)

In [None]:
peptide_predictions_dataset_lipinski

In [None]:
peptide_predictions_dataset_bro

In [None]:
peptide_predictions_muegge_dataset

In [None]:
# Load the SwissADME export used as the reference for the comparison tables below.
swissadme_csv = 'datasets/swissadme_peptides.csv'
swissadme_peptides = pd.read_csv(swissadme_csv)

In [None]:
# Put the predicted Lipinski counts next to SwissADME's 'Lipinski #violations' column.
# The two tables are joined purely by row position/index.
# NOTE(review): this assumes the SwissADME file lists the molecules in ML1..ML26 order - confirm.
lipinski_comparison_columns = [
    'tested_molecules',
    'nviolations_lipinski_rule_10',
    'nviolations_lipinski_rule_20',
    'nviolations_lipinski_rule_30',
]
comparison_table_peptides_lipinski = pd.concat(
    [
        peptide_predictions_dataset_lipinski[lipinski_comparison_columns],
        swissadme_peptides[['Lipinski #violations']],
    ],
    axis=1,
)

In [None]:
comparison_table_peptides_lipinski

In [None]:
comparison_table_peptides_lipinski

In [None]:
# Put the predicted Muegge counts next to SwissADME's 'Muegge #violations' column.
# The two tables are joined purely by row position/index.
# NOTE(review): this assumes the SwissADME file lists the molecules in ML1..ML26 order - confirm.
muegge_comparison_columns = [
    'tested_molecules',
    'nviolations_muegge_rule_10',
    'nviolations_muegge_rule_20',
    'nviolations_muegge_rule_30',
]
comparison_table_peptides_muegge = pd.concat(
    [
        peptide_predictions_muegge_dataset[muegge_comparison_columns],
        swissadme_peptides[['Muegge #violations']],
    ],
    axis=1,
)

In [None]:
comparison_table_peptides_muegge