## Traditional machine learning
Feature(s): 'SMILES' column

- 'SMILES' column

Target: E_bin

- Binary class

In [1]:
import sys
import os
from sklearn.model_selection import cross_val_score
from sklearn.base import clone
from loguru import logger

# Remove every loguru handler registered so far, then log only WARNING and
# above to stderr.
logger.remove()
logger.add(sys.stderr, level="WARNING")
# Make the directory four levels above the current working directory
# importable (presumably where the MLPipeline package lives - confirm).
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate sys.path entries.
package_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..', '..'))
if package_root not in sys.path:
    sys.path.append(package_root)

In [2]:
import pandas as pd
import numpy as np
import zipfile

# Location of the dataset. When `path_to_dataset` is a ZIP archive,
# `csv_filename` names the CSV member to read from inside it.
path_to_dataset = 'train_denis.csv'
csv_filename = 'train_denis.csv'

# Parser options shared by both loading paths; adjust the separator (and
# add an encoding) here if the file needs it.
read_kwargs = dict(sep=',', on_bad_lines='warn', index_col=0)

if not path_to_dataset.endswith('.zip'):
    # Plain CSV on disk.
    df = pd.read_csv(path_to_dataset, **read_kwargs)
else:
    # CSV stored as a member of a ZIP archive.
    with zipfile.ZipFile(path_to_dataset, 'r') as archive:
        with archive.open(csv_filename) as member:
            df = pd.read_csv(member, **read_kwargs)


# Sanity check: compare the number of distinct SMILES with the row count.
print('Count of unique smiles:', len(df['SMILES'].unique()))
print('Count of all of the smiles:', len(df))


Count of unique smiles: 966
Count of all of the smiles: 966


In [3]:
from MLPipeline import MLmodel, BinTheTarget

# Column roles handed to MLmodel further down in the notebook.
Features = ['SMILES']        # input column(s)
Feature_types = ['SMILES']   # one type label per entry in Features
Target = ['E_bin']           # column(s) to predict
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# a later cell reads it when constructing MLmodel.
input = df

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def objectiveRandomForestClassifier(trial, model_instance, scoring='accuracy'):
    """
    Optuna objective for tuning a RandomForestClassifier.

    Samples one hyperparameter configuration from ``trial``, applies it to a
    fresh clone of ``model_instance.model`` and returns the mean
    cross-validated score on the training data.

    With the default ``'accuracy'`` metric a higher value is better, so the
    study that drives this objective should maximize it, not minimize it.

    Parameters
    ----------
    trial : optuna trial
        Object providing ``suggest_int`` and ``suggest_categorical``.
    model_instance : object
        Must expose ``model`` (the estimator to clone), ``X_train``,
        ``y_train`` and ``cv``; the last three are forwarded to
        ``cross_val_score``.
    scoring : str or callable, default 'accuracy'
        Forwarded unchanged as the ``scoring`` argument of
        ``cross_val_score``.

    Returns
    -------
    The mean of the per-fold scores returned by ``cross_val_score``.
    """
    # Search space. The order of the suggest_* calls is kept stable so a
    # seeded sampler proposes the same values from run to run.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_categorical('max_depth', [None, 10, 20, 30, 40]),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 6),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }

    # Work on a clone so the estimator held by model_instance is never
    # mutated by a trial.
    model_clone = clone(model_instance.model)
    model_clone.set_params(**params)

    # Score the candidate configuration by cross-validation on the training data.
    scores = cross_val_score(
        model_clone,
        model_instance.X_train,
        model_instance.y_train,
        cv=model_instance.cv,
        scoring=scoring,
    )

    # Average across folds.
    return scores.mean()


def objectiveXGBClassifier(trial, model_instance, scoring='accuracy'):
    """
    Optuna objective for tuning an XGBClassifier.

    Samples one hyperparameter configuration from ``trial``, applies it to a
    fresh clone of ``model_instance.model`` and returns the mean
    cross-validated score on the training data.

    With the default ``'accuracy'`` metric a higher value is better, so the
    study that drives this objective should maximize it, not minimize it.

    Parameters
    ----------
    trial : optuna trial
        Object providing ``suggest_int`` and ``suggest_float``.
    model_instance : object
        Must expose ``model`` (the estimator to clone), ``X_train``,
        ``y_train`` and ``cv``; the last three are forwarded to
        ``cross_val_score``.
    scoring : str or callable, default 'accuracy'
        Forwarded unchanged as the ``scoring`` argument of
        ``cross_val_score``.

    Returns
    -------
    The mean of the per-fold scores returned by ``cross_val_score``.
    """
    # Search space. The order of the suggest_* calls is kept stable so a
    # seeded sampler proposes the same values from run to run.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),  # L1 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),  # L2 regularization
    }

    # Work on a clone so the estimator held by model_instance is never
    # mutated by a trial.
    model_clone = clone(model_instance.model)
    model_clone.set_params(**params)

    # Score the candidate configuration by cross-validation on the training data.
    scores = cross_val_score(
        model_clone,
        model_instance.X_train,
        model_instance.y_train,
        cv=model_instance.cv,
        scoring=scoring,
    )

    # Average across folds.
    return scores.mean()


In [5]:
# Experiment grid: one run per (seed, model, target, training-set size).
MODEL_NAMES = ['RandomForestClassifier', 'XGBClassifier']
TARGETS = [[i] for i in Target ]
TRAIN_SIZES = [25, 50, 100, 200]
SEEDS = [1, 2, 3]
TEST_COUNT = 50            # forwarded as MLmodel(test_count=...)
OPTIMIZATION_TRIALS = 10   # forwarded as MLmodel(optimization_trials=...)

# Model name -> Optuna objective. A name missing from this table raises
# KeyError instead of silently reusing the objective of the previous model.
OBJECTIVES = {
    'RandomForestClassifier': objectiveRandomForestClassifier,
    'XGBClassifier': objectiveXGBClassifier,
}

# One dict per completed run: the summary and evaluation dicts returned by
# model.evaluate(), plus the seed.
result = []
for seed in SEEDS:
    for model_name in MODEL_NAMES:
        objective = OBJECTIVES[model_name]
        for target in TARGETS:
            for train_size in TRAIN_SIZES:
                print(f'RUN: Model:{model_name} / Target:{target} / Train size:{train_size} / Seed:{seed}')

                # `model` inside the lambda is looked up when the lambda is
                # called, not when it is created.
                # NOTE(review): this relies on MLmodel invoking the objective
                # only after the constructor has returned (e.g. from
                # train()) - confirm against MLPipeline.
                model = MLmodel(
                    modelType=model_name,
                    df=input,  # DataFrame bound in an earlier cell (name shadows the builtin)
                    randomSeed=seed,
                    train_count=train_size,
                    test_count=TEST_COUNT,
                    target=target,
                    features=Features,
                    hyperparameter_tuning=True,
                    feature_types=Feature_types,
                    optimization_method='optuna',
                    optimization_trials=OPTIMIZATION_TRIALS,
                    objective=lambda trial: objective(trial, model),
                )

                model.train()
                # Named `metrics` rather than `eval` so the builtin eval() is
                # not shadowed for the rest of the kernel session.
                metrics, summary = model.evaluate()
                result.append({**summary, **metrics, 'seed': seed})



[I 2024-09-17 10:59:49,095] A new study created in memory with name: no-name-28d8d670-1606-4dc5-bde0-a84c4a7acacb


RUN: Model:RandomForestClassifier / Target:['E_bin'] / Train size:25 / Seed:1


[I 2024-09-17 10:59:50,197] Trial 0 finished with value: 0.72 and parameters: {'n_estimators': 250, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.72.
[I 2024-09-17 10:59:51,615] Trial 1 finished with value: 0.64 and parameters: {'n_estimators': 211, 'max_depth': 20, 'min_samples_split': 11, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 0.72.
[I 2024-09-17 10:59:51,982] Trial 2 finished with value: 0.64 and parameters: {'n_estimators': 63, 'max_depth': None, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 0.72.
[I 2024-09-17 10:59:53,021] Trial 3 finished with value: 0.64 and parameters: {'n_estimators': 166, 'max_depth': 30, 'min_samples_split': 14, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 0.72.
[I 2024-09-17 10:59:53,935] 

RUN: Model:RandomForestClassifier / Target:['E_bin'] / Train size:50 / Seed:1


[I 2024-09-17 11:00:00,377] Trial 0 finished with value: 0.7000000000000001 and parameters: {'n_estimators': 237, 'max_depth': 30, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.7000000000000001.
[I 2024-09-17 11:00:01,894] Trial 1 finished with value: 0.74 and parameters: {'n_estimators': 245, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 1 with value: 0.74.
[I 2024-09-17 11:00:02,863] Trial 2 finished with value: 0.72 and parameters: {'n_estimators': 220, 'max_depth': 10, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 1 with value: 0.74.
[I 2024-09-17 11:00:03,635] Trial 3 finished with value: 0.64 and parameters: {'n_estimators': 179, 'max_depth': None, 'min_samples_split': 12, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': False}. Best is trial 1 with value: 0.

RUN: Model:RandomForestClassifier / Target:['E_bin'] / Train size:100 / Seed:1


[I 2024-09-17 11:00:08,252] A new study created in memory with name: no-name-b3c041e3-b489-4d0d-9ed1-67bf55ed2560
[I 2024-09-17 11:00:09,903] Trial 0 finished with value: 0.67 and parameters: {'n_estimators': 273, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 0.67.
[I 2024-09-17 11:00:11,685] Trial 1 finished with value: 0.7 and parameters: {'n_estimators': 246, 'max_depth': 30, 'min_samples_split': 8, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True}. Best is trial 1 with value: 0.7.
[I 2024-09-17 11:00:11,949] Trial 2 finished with value: 0.7699999999999999 and parameters: {'n_estimators': 59, 'max_depth': 40, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': False}. Best is trial 2 with value: 0.7699999999999999.
[I 2024-09-17 11:00:13,004] Trial 3 finished with value: 0.72 and parameters: {'n_estimators': 221, 'max_depth': 40, 'min_samples_spli

RUN: Model:RandomForestClassifier / Target:['E_bin'] / Train size:200 / Seed:1


[I 2024-09-17 11:00:17,908] A new study created in memory with name: no-name-680dedce-b664-4384-add6-3f02d906b65e
[I 2024-09-17 11:00:18,647] Trial 0 finished with value: 0.8150000000000001 and parameters: {'n_estimators': 152, 'max_depth': 40, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.8150000000000001.
[I 2024-09-17 11:00:20,486] Trial 1 finished with value: 0.7699999999999999 and parameters: {'n_estimators': 300, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 0.8150000000000001.
[I 2024-09-17 11:00:21,758] Trial 2 finished with value: 0.835 and parameters: {'n_estimators': 193, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 2 with value: 0.835.
[I 2024-09-17 11:00:23,479] Trial 3 finished with value: 0.8099999999999999 and parameters: {'n_estim

RUN: Model:XGBClassifier / Target:['E_bin'] / Train size:25 / Seed:1


[I 2024-09-17 11:01:02,670] Trial 0 finished with value: 0.8 and parameters: {'n_estimators': 271, 'max_depth': 9, 'learning_rate': 0.10118370066597199, 'subsample': 0.60546187040602, 'colsample_bytree': 0.5840350017676692, 'gamma': 0.6954148586452794, 'reg_alpha': 0.0003920353582371209, 'reg_lambda': 0.0003123958149347385}. Best is trial 0 with value: 0.8.
[I 2024-09-17 11:01:22,343] Trial 1 finished with value: 0.8400000000000001 and parameters: {'n_estimators': 161, 'max_depth': 13, 'learning_rate': 0.11630523602185806, 'subsample': 0.5960048577648127, 'colsample_bytree': 0.7482304835642122, 'gamma': 0.0002974028882593881, 'reg_alpha': 0.4909918718936664, 'reg_lambda': 2.2815255803913534e-05}. Best is trial 1 with value: 0.8400000000000001.
[I 2024-09-17 11:01:49,472] Trial 2 finished with value: 0.76 and parameters: {'n_estimators': 153, 'max_depth': 4, 'learning_rate': 0.06789230172304099, 'subsample': 0.7167660915316689, 'colsample_bytree': 0.8002850860508115, 'gamma': 0.00050704

RUN: Model:XGBClassifier / Target:['E_bin'] / Train size:50 / Seed:1


[I 2024-09-17 11:05:30,890] Trial 0 finished with value: 0.8400000000000001 and parameters: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05078224679429391, 'subsample': 0.6232213242057594, 'colsample_bytree': 0.9645395995016721, 'gamma': 0.04479475534115711, 'reg_alpha': 1.0485707591112221e-06, 'reg_lambda': 0.005516185205685083}. Best is trial 0 with value: 0.8400000000000001.
[I 2024-09-17 11:05:37,764] Trial 1 finished with value: 0.8400000000000001 and parameters: {'n_estimators': 150, 'max_depth': 14, 'learning_rate': 0.064416308784722, 'subsample': 0.9810493825737376, 'colsample_bytree': 0.9938474138177816, 'gamma': 0.024471379041587235, 'reg_alpha': 6.887768530693431e-05, 'reg_lambda': 5.3484172043224405e-06}. Best is trial 0 with value: 0.8400000000000001.
[I 2024-09-17 11:06:06,396] Trial 2 finished with value: 0.8400000000000001 and parameters: {'n_estimators': 278, 'max_depth': 7, 'learning_rate': 0.021206790586638886, 'subsample': 0.6498342856323154, 'colsample_

KeyboardInterrupt: 

In [None]:
# Turn the list of per-run result dicts into a table, show it, and persist
# it next to the notebook.
# NOTE(review): this rebinds `df`, which held the dataset loaded earlier in
# the notebook; re-run the loading cell before re-running cells that need it.
df = pd.DataFrame(data=result)
display(df)
df.to_csv(path_or_buf='ActivationEnergy_cycloadditions_traditional_ml.csv')

In [None]:
import matplotlib.pyplot as plt

# Metric columns of `df` to plot, one stacked panel per metric.
METRICS = ['accuracy','f1_micro','f1_macro', 'kappa']
# squeeze=False makes plt.subplots always return a 2-D array of Axes, so
# flatten() also works when METRICS contains a single entry.
fig, ax = plt.subplots(len(METRICS), 1, sharex=True, layout = 'constrained', squeeze=False)
ax = ax.flatten()

# NOTE(review): `df` holds one row per run (all seeds and both models), so
# each panel draws a single line through every run in row order. Consider
# grouping by model and aggregating over seeds; the name of the model column
# is not visible in this notebook.
for i, metric in enumerate(METRICS):
    ax[i].plot(df['train_size'], df[metric], marker = 'o')
    ax[i].set_title(metric)
    ax[i].set_ylim(0,1)

# The panels share the x-axis, so label it once on the bottom panel.
ax[-1].set_xlabel('train_size')
# Trailing semicolon suppresses the Text(...) repr in the cell output.
fig.suptitle('Random Forest & XGB - ActivationEnergy_cycloadditions');