## Traditional machine learning
Feature(s): 'SMILES' column

- converted to a Morgan fingerprint

Targets: E_coh_bin, T_g_bin, R_gyr_bin, Densities_bin

- 4 targets
- each is a binary class label for a monomer property

In [41]:
import sys
import os

# Make the package importable: register the directory four levels above the
# current working directory on the module search path.
parent_hops = [os.pardir] * 4
package_parent_dir = os.path.abspath(os.path.join(os.getcwd(), *parent_hops))
sys.path.append(package_parent_dir)

In [42]:
import pandas as pd
import numpy as np
import zipfile

path_to_dataset = 'train_ben.csv'
csv_filename = 'train_ben.csv'

# Parsing options shared by both loading paths; correct the separator (or add
# an encoding) here if the file format requires it.
read_csv_options = {'sep': ',', 'on_bad_lines': 'warn', 'index_col': 0}

if not path_to_dataset.endswith('.zip'):
    # Plain CSV on disk: read it directly into a DataFrame.
    df = pd.read_csv(path_to_dataset, **read_csv_options)
else:
    # Zipped dataset: open the named CSV member inside the archive and parse it.
    with zipfile.ZipFile(path_to_dataset, 'r') as archive:
        with archive.open(csv_filename) as csv_member:
            df = pd.read_csv(csv_member, **read_csv_options)

# Sanity check: equal counts mean no SMILES string appears more than once.
print('Count of unique smiles:', len(df.SMILES.unique()))
print('Count of all of the smiles:', len(df))

Count of unique smiles: 410
Count of all of the smiles: 410


In [43]:
from MLPipeline import MLmodel, BinTheTarget

# Names of the four label columns used as prediction targets.
Target = ['E_coh_bin','T_g_bin','R_gyr_bin','Densities_bin']
# Input column(s) handed to MLmodel as `features`.
Features = ['SMILES']
# Passed to MLmodel as `feature_types`, one entry per feature.
# NOTE(review): presumably tells the pipeline how to featurize each column — confirm in MLPipeline.
Feature_types = ['SMILES']
# Second reference to the loaded dataset (no copy is made); later cells pass it as df=input.
# NOTE(review): this name shadows Python's built-in input() for the rest of the notebook.
input = df

In [44]:
# Build a RandomForestClassifier pipeline over the full dataset, with all four
# target columns at once and the SMILES column as the only feature.
model = MLmodel(modelType='RandomForestClassifier',
                    df=input,
                    target=Target,
                    features=Features,
                    feature_types=Feature_types)

# Retrieve the model's input and output arrays, unpacked as
# (train inputs, test inputs, train labels, test labels).
X_train, X_test, y_train, y_test = model.getValues()

2024-09-16 17:03:57.251 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 2
2024-09-16 17:03:57.252 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:57.252 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50, 4)
2024-09-16 17:03:57.253 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


In [45]:
# Fit the model built above.
model.train()
# evaluate() is the cell's last expression, so its return value (the list of
# per-target accuracies shown in the cell output) is displayed as well as printed.
model.evaluate()

RandomForestClassifier model trained successfully.
Accuracies for each target in RandomForestClassifier: [0.68, 0.64, 0.76, 0.62]


[0.68, 0.64, 0.76, 0.62]

In [46]:
from sklearn.model_selection import cross_val_score
from sklearn.base import clone

def objective(trial, model_instance, scoring='accuracy'):
    """
    Optuna objective: mean cross-validated score of a re-parameterised estimator.

    The returned value is a score (accuracy by default), so higher is better
    and the study consuming this objective must *maximize* it.

    Parameters
    ----------
    trial
        Optuna trial object; only ``suggest_int`` and ``suggest_categorical``
        are called on it.
    model_instance
        Object exposing ``model.estimator`` (the estimator to clone) as well as
        ``X_train``, ``y_train`` and ``cv`` (forwarded to ``cross_val_score``).
    scoring
        Scoring argument forwarded to ``cross_val_score``. Defaults to
        ``'accuracy'``, the value that used to be hard-coded here.
        NOTE(review): with a 2-D ``y_train`` (several targets at once),
        scikit-learn's accuracy is presumably the exact-match (subset) accuracy,
        which would explain very low trial values — confirm before comparing
        these numbers with per-target accuracies.

    Returns
    -------
    float
        Mean of the per-fold cross-validation scores.
    """
    # Search space. The names are random-forest style hyperparameters.
    # NOTE(review): confirm they are valid before using this objective with
    # another estimator type.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_categorical('max_depth', [None, 10, 20, 30, 40]),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 6),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }

    # Work on a fresh, unfitted copy so trials never share fitted state.
    model_clone = clone(model_instance.model.estimator)
    model_clone.set_params(**params)

    # Score the candidate configuration on the training split only.
    scores = cross_val_score(
        model_clone,
        model_instance.X_train,
        model_instance.y_train,
        cv=model_instance.cv,
        scoring=scoring,
    )

    # Average over folds to get a single value per trial.
    return scores.mean()

In [47]:
from sklearn.metrics import f1_score, cohen_kappa_score, accuracy_score

def train_test(modeltype = 'RandomForestClassifier', target = None, train_size = 50, random_seed = 42):
    """
    Train one model configuration and collect its test-set metrics.

    Reads the dataset ``df`` and the ``Features`` / ``Feature_types`` settings
    from notebook scope. Hyperparameter tuning is switched off for these runs;
    the optimisation settings are still handed to ``MLmodel``. The test split
    is fixed at 50 samples.

    Parameters
    ----------
    modeltype : str
        Model type name forwarded to ``MLmodel`` as ``modelType``.
    target
        Target specification forwarded to ``MLmodel``.
    train_size : int
        Forwarded to ``MLmodel`` as ``train_count``.
    random_seed : int
        Forwarded to ``MLmodel`` as ``randomSeed``.

    Returns
    -------
    dict
        The run settings, true and predicted test labels, the fitted model's
        parameters, and the accuracy, micro/macro F1 and Cohen's kappa scores.
    """
    model = MLmodel(
        modelType=modeltype,
        df=df,
        train_count=train_size,
        test_count=50,
        target=target,
        features=Features,
        hyperparameter_tuning=False,
        feature_types=Feature_types,
        optimization_method='optuna',
        # The lambda resolves `model` when it is called, i.e. after this assignment.
        objective=lambda trial: objective(trial, model),
        randomSeed=random_seed,
    )

    model.train()
    predictions = model.predict()
    model.evaluate()

    y_true = model.y_test

    # Key order is kept stable because it becomes the column order of the
    # results table built from these dicts.
    return {
        'modeltype': modeltype,
        'target': target,
        'train_size': train_size,
        'trues': y_true,
        'preds': predictions,
        'model_params': model.model.get_params(),
        'accuracy': accuracy_score(y_true, predictions),
        'f1_micro': f1_score(y_true, predictions, average='micro'),
        'f1_macro': f1_score(y_true, predictions, average='macro'),
        'kappa': cohen_kappa_score(y_true, predictions),
    }
    

   
MODELS = ['RandomForestClassifier', 'XGBClassifier']
# Each target is wrapped in a one-element list so that a single column is
# passed to MLmodel in the same list form as the four-column multi-target run.
TARGETS = [['E_coh_bin'],['T_g_bin'],['R_gyr_bin'],['Densities_bin']]
TRAIN_SIZES = [25, 50]
SEEDS = [1, 2, 3]

# Feature selection read by train_test() from notebook scope; it does not
# change between runs, so it is set once rather than on every iteration.
Features = ['SMILES']

# One summary dict per (seed, model, target, train size) combination.
all_res = []
for seed in SEEDS:
    # `model_name` (not `model`) so the loop does not overwrite the notebook-level
    # MLmodel instance with a plain string.
    for model_name in MODELS:
        for target in TARGETS:
            for train_size in TRAIN_SIZES:
                print(f'RUN: {model_name} / {target} / {train_size}')
                res = train_test(modeltype=model_name, target=target, train_size=train_size, random_seed=seed)
                res['seed'] = seed
                all_res.append(res)

2024-09-16 17:03:57.625 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:03:57.626 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:57.626 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:03:57.627 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


RUN: RandomForestClassifier / ['E_coh_bin'] / 25
RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.52
RUN: RandomForestClassifier / ['E_coh_bin'] / 50


2024-09-16 17:03:57.821 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:03:57.821 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:57.821 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:03:57.822 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.54


2024-09-16 17:03:58.018 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:03:58.018 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:58.019 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:03:58.019 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


RUN: RandomForestClassifier / ['T_g_bin'] / 25
RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.74
RUN: RandomForestClassifier / ['T_g_bin'] / 50


2024-09-16 17:03:58.210 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:03:58.210 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:58.210 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:03:58.211 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.72
RUN: RandomForestClassifier / ['R_gyr_bin'] / 25


2024-09-16 17:03:58.417 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:03:58.417 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:58.418 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:03:58.418 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)
2024-09-16 17:03:58.623 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:03:58.623 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:58.623 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:03:58.624 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.76
RUN: RandomForestClassifier / ['R_gyr_bin'] / 50


2024-09-16 17:03:58.820 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:03:58.821 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:58.821 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:03:58.822 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.78
RUN: RandomForestClassifier / ['Densities_bin'] / 25
RandomForestClassifier model trained successfully.


2024-09-16 17:03:59.021 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:03:59.021 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:59.022 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:03:59.022 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


Accuracy for RandomForestClassifier: 0.64
RUN: RandomForestClassifier / ['Densities_bin'] / 50
RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.70


2024-09-16 17:03:59.225 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:03:59.226 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:59.226 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:03:59.226 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


RUN: XGBClassifier / ['E_coh_bin'] / 25
XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.42
RUN: XGBClassifier / ['E_coh_bin'] / 50


2024-09-16 17:03:59.413 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:03:59.413 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:59.414 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:03:59.414 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)
2024-09-16 17:03:59.648 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:03:59.649 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:59.649 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:03:59.650 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.58
RUN: XGBClassifier / ['T_g_bin'] / 25


2024-09-16 17:03:59.856 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:03:59.857 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:03:59.857 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:03:59.857 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.68
RUN: XGBClassifier / ['T_g_bin'] / 50
XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.68
RUN: XGBClassifier / ['R_gyr_bin'] / 25


2024-09-16 17:04:00.050 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:00.050 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:00.051 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:00.051 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)
2024-09-16 17:04:00.242 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:00.242 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:00.243 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:00.243 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.80
RUN: XGBClassifier / ['R_gyr_bin'] / 50


2024-09-16 17:04:00.453 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:00.454 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:00.455 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:00.455 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.86
RUN: XGBClassifier / ['Densities_bin'] / 25


2024-09-16 17:04:00.650 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:00.651 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:00.651 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:00.652 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.70
RUN: XGBClassifier / ['Densities_bin'] / 50


2024-09-16 17:04:00.858 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:00.858 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:00.859 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:00.859 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.68
RUN: RandomForestClassifier / ['E_coh_bin'] / 25
RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.52
RUN: RandomForestClassifier / ['E_coh_bin'] / 50


2024-09-16 17:04:01.057 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:01.058 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:01.058 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:01.059 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)
2024-09-16 17:04:01.258 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:01.259 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:01.259 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:01.260 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.62
RUN: RandomForestClassifier / ['T_g_bin'] / 25
RandomForestClassifier model trained successfully.


2024-09-16 17:04:01.451 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:01.451 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:01.452 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:01.452 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


Accuracy for RandomForestClassifier: 0.64
RUN: RandomForestClassifier / ['T_g_bin'] / 50
RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.68
RUN: RandomForestClassifier / ['R_gyr_bin'] / 25


2024-09-16 17:04:01.646 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:01.647 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:01.647 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:01.647 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)
2024-09-16 17:04:01.843 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:01.843 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:01.844 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:01.844 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.60
RUN: RandomForestClassifier / ['R_gyr_bin'] / 50
RandomForestClassifier model trained successfully.


2024-09-16 17:04:02.036 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:02.037 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:02.037 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:02.037 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


Accuracy for RandomForestClassifier: 0.76
RUN: RandomForestClassifier / ['Densities_bin'] / 25
RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.54
RUN: RandomForestClassifier / ['Densities_bin'] / 50


2024-09-16 17:04:02.226 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:02.226 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:02.227 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:02.227 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)
2024-09-16 17:04:02.418 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:02.419 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:02.419 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:02.419 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.68
RUN: XGBClassifier / ['E_coh_bin'] / 25
XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.64
RUN: XGBClassifier / ['E_coh_bin'] / 50


2024-09-16 17:04:02.605 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:02.606 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:02.606 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:02.607 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)
2024-09-16 17:04:02.763 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:02.763 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:02.764 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:02.764 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.52
RUN: XGBClassifier / ['T_g_bin'] / 25


2024-09-16 17:04:02.997 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:02.997 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:02.997 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:02.998 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.66
RUN: XGBClassifier / ['T_g_bin'] / 50


2024-09-16 17:04:03.293 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:03.293 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:03.293 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:03.294 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.58
RUN: XGBClassifier / ['R_gyr_bin'] / 25
XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.68
RUN: XGBClassifier / ['R_gyr_bin'] / 50


2024-09-16 17:04:03.471 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:03.472 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:03.472 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:03.472 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)
2024-09-16 17:04:03.675 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:03.676 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:03.676 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:03.677 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.76
RUN: XGBClassifier / ['Densities_bin'] / 25


2024-09-16 17:04:03.915 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:03.916 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:03.916 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:03.917 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.48
RUN: XGBClassifier / ['Densities_bin'] / 50
XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.68
RUN: RandomForestClassifier / ['E_coh_bin'] / 25


2024-09-16 17:04:04.079 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:04.080 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:04.080 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:04.080 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)
2024-09-16 17:04:04.269 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:04.269 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:04.270 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:04.270 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.64
RUN: RandomForestClassifier / ['E_coh_bin'] / 50
RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.58
RUN: RandomForestClassifier / ['T_g_bin'] / 25


2024-09-16 17:04:04.463 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:04.464 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:04.464 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:04.464 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)
2024-09-16 17:04:04.654 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:04.654 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:04.655 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:04.655 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.52
RUN: RandomForestClassifier / ['T_g_bin'] / 50
RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.62
RUN: RandomForestClassifier / ['R_gyr_bin'] / 25


2024-09-16 17:04:04.846 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:04.847 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:04.847 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:04.847 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)
2024-09-16 17:04:05.035 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:05.036 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:05.036 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:05.037 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.50
RUN: RandomForestClassifier / ['R_gyr_bin'] / 50
RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.76
RUN: RandomForestClassifier / ['Densities_bin'] / 25


2024-09-16 17:04:05.232 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:05.233 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:05.234 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:05.234 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)
2024-09-16 17:04:05.427 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:05.428 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:05.428 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:05.429 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.68
RUN: RandomForestClassifier / ['Densities_bin'] / 50
RandomForestClassifier model trained successfully.


2024-09-16 17:04:05.624 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:05.625 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:05.625 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:05.626 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


Accuracy for RandomForestClassifier: 0.62
RUN: XGBClassifier / ['E_coh_bin'] / 25
XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.66
RUN: XGBClassifier / ['E_coh_bin'] / 50


2024-09-16 17:04:05.814 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:05.815 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:05.815 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:05.816 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)
2024-09-16 17:04:05.984 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:05.984 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:05.985 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:05.985 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.64
RUN: XGBClassifier / ['T_g_bin'] / 25
XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.62
RUN: XGBClassifier / ['T_g_bin'] / 50


2024-09-16 17:04:06.146 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:06.146 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:06.147 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:06.147 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)
2024-09-16 17:04:06.315 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:06.316 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:06.316 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:06.316 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.66
RUN: XGBClassifier / ['R_gyr_bin'] / 25
XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.60
RUN: XGBClassifier / ['R_gyr_bin'] / 50


2024-09-16 17:04:06.507 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:06.508 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:06.509 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:06.509 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)
2024-09-16 17:04:06.729 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:06.730 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:06.730 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (25,)
2024-09-16 17:04:06.731 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (25, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.72
RUN: XGBClassifier / ['Densities_bin'] / 25


2024-09-16 17:04:06.952 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 1
2024-09-16 17:04:06.953 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:06.953 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50,)
2024-09-16 17:04:06.953 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)


XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.78
RUN: XGBClassifier / ['Densities_bin'] / 50
XGBClassifier model trained successfully.
Accuracy for XGBClassifier: 0.62


In [48]:
# Collect the per-run summaries into their own table. A dedicated name is used
# so the dataset DataFrame `df`, which train_test() reads from notebook scope,
# is not overwritten and the sweep cell can be re-run after this one.
results_df = pd.DataFrame(all_res)
display(results_df)

Unnamed: 0,modeltype,target,train_size,trues,preds,model_params,accuracy,f1_micro,f1_macro,kappa,seed
0,RandomForestClassifier,[E_coh_bin],25,"[0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, ...","[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, ...","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.52,0.52,0.448529,0.013158,1
1,RandomForestClassifier,[E_coh_bin],50,"[1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, ...","[1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, ...","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.54,0.54,0.539816,0.097331,1
2,RandomForestClassifier,[T_g_bin],25,"[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.74,0.74,0.706015,0.442539,1
3,RandomForestClassifier,[T_g_bin],50,"[1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, ...","[1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, ...","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.72,0.72,0.715909,0.44,1
4,RandomForestClassifier,[R_gyr_bin],25,"[1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, ...","[0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.76,0.76,0.724265,0.464286,1
5,RandomForestClassifier,[R_gyr_bin],50,"[0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, ...","[0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, ...","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.78,0.78,0.775602,0.552846,1
6,RandomForestClassifier,[Densities_bin],25,"[0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, ...","[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.64,0.64,0.553571,0.186257,1
7,RandomForestClassifier,[Densities_bin],50,"[1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.7,0.7,0.660787,0.34555,1
8,XGBClassifier,[E_coh_bin],25,"[0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, ...","[1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, ...","{'objective': 'binary:logistic', 'use_label_en...",0.42,0.42,0.400579,-0.175041,1
9,XGBClassifier,[E_coh_bin],50,"[1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, ...","[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, ...","{'objective': 'binary:logistic', 'use_label_en...",0.58,0.58,0.579832,0.175824,1


In [49]:
# Multi-target RandomForestClassifier again, this time with Optuna-based
# hyperparameter tuning enabled. The lambda looks up the global name `model`
# only when it is called, so it resolves to this instance once the assignment
# below has completed.
model = MLmodel(modelType='RandomForestClassifier', df=input, target=Target, 
                features=['SMILES'], hyperparameter_tuning=True,
                feature_types=Feature_types,
                optimization_method='optuna', objective=lambda trial: objective(trial, model))

# Train (the log output shows the Optuna trials running during this step),
# then predict and score.
model.train()
predictions = model.predict()
# Last expression of the cell: the returned per-target accuracies are displayed.
model.evaluate()

2024-09-16 17:04:07.287 | INFO     | MLPipeline:__post_init__:134 - ndim y_train: 2
2024-09-16 17:04:07.288 | INFO     | MLPipeline:__post_init__:135 - ndim x_train: 2
2024-09-16 17:04:07.288 | INFO     | MLPipeline:__post_init__:136 - shape y_train: (50, 4)
2024-09-16 17:04:07.288 | INFO     | MLPipeline:__post_init__:137 - shape x_train: (50, 512)
[32m[I 2024-09-16 17:04:07,289][0m A new study created in memory with name: no-name-22c56ed3-2352-4c52-8d9b-40128a7425a2[0m
[32m[I 2024-09-16 17:04:07,679][0m Trial 0 finished with value: 0.13999999999999999 and parameters: {'n_estimators': 146, 'max_depth': 40, 'min_samples_split': 11, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 0.13999999999999999.[0m
[32m[I 2024-09-16 17:04:08,068][0m Trial 1 finished with value: 0.08 and parameters: {'n_estimators': 147, 'max_depth': 40, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 wi

Best RandomForestClassifier model trained successfully with hyperparameter tuning using Optuna.
Best hyperparameters: {'n_estimators': 213, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': True}
RandomForestClassifier model trained successfully.
Accuracies for each target in RandomForestClassifier: [0.62, 0.62, 0.72, 0.74]


[0.62, 0.62, 0.72, 0.74]