## Traditional machine learning
Feature(s): 'SMILES' column
    translated to a Morgan fingerprint

Target: mp_bin
    binary class of the melting point

In [1]:
import sys
import os

# Add the directory four levels above the current working directory to
# sys.path (presumably so the project package imported further down can be
# found — confirm against the repository layout).
# NOTE(review): the result depends on the directory the kernel was started in.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..', '..')))


Read the dataset from the zip file

In [2]:
import pandas as pd
import numpy as np
import zipfile

path_to_dataset = 'train_meltingPoint_noDuplicates.zip'
csv_filename = 'train_meltingPoint_noDuplicates.csv'

# Parser settings shared by both branches below; change the separator
# (or add an encoding) here if the file requires it.
read_csv_kwargs = dict(sep=',', on_bad_lines='warn', index_col=0)

if not path_to_dataset.endswith('.zip'):
    # Plain CSV on disk: read it directly.
    df = pd.read_csv(path_to_dataset, **read_csv_kwargs)
else:
    # ZIP archive: open the named CSV member and parse it from the stream.
    with zipfile.ZipFile(path_to_dataset, 'r') as archive, archive.open(csv_filename) as csv_file:
        df = pd.read_csv(csv_file, **read_csv_kwargs)

# Equal counts mean the SMILES column contains no duplicates.
print('Count of unique smiles:', len(df['SMILES'].unique()))
print('Count of all of the smiles:', len(df))



Count of unique smiles: 273237
Count of all of the smiles: 273237


In [3]:
from MLPipeline import MLmodel, BinTheTarget

# Column names handed to MLmodel in the cells below.
Target = ['mp_bin']
Features = ['SMILES']
Feature_types = ['SMILES']

# Number of leading rows of df used for the experiments in this notebook;
# raise it for a fuller (slower) run.
N_SAMPLES = 7000

# NOTE: `input` shadows Python's built-in input(); the name is kept because
# the cells below refer to it.
input = df[:N_SAMPLES]

  from .autonotebook import tqdm as notebook_tqdm


The Random forest classifier model without hyperparameter optimization

In [4]:
# Baseline model: no hyperparameter-tuning arguments are passed.
model = MLmodel(
    modelType='RandomForestClassifier',
    df=input,
    target=Target,
    features=Features,
    feature_types=Feature_types,
)

# Unpack the model's train/test inputs and labels for inspection below.
X_train, X_test, y_train, y_test = model.getValues()

[17:08:59] Explicit valence for atom # 10 S, 9, is greater than permitted
[17:09:01] Explicit valence for atom # 16 S, 9, is greater than permitted
[17:09:01] Explicit valence for atom # 10 S, 9, is greater than permitted
[32m2024-09-12 17:09:03.046[0m | [1mINFO    [0m | [36mMLPipeline[0m:[36m__post_init__[0m:[36m133[0m - [1mndim y_train: 1[0m
[32m2024-09-12 17:09:03.047[0m | [1mINFO    [0m | [36mMLPipeline[0m:[36m__post_init__[0m:[36m134[0m - [1mndim x_train: 2[0m
[32m2024-09-12 17:09:03.047[0m | [1mINFO    [0m | [36mMLPipeline[0m:[36m__post_init__[0m:[36m135[0m - [1mshape y_train: (50,)[0m
[32m2024-09-12 17:09:03.048[0m | [1mINFO    [0m | [36mMLPipeline[0m:[36m__post_init__[0m:[36m136[0m - [1mshape x_train: (50, 512)[0m


Some information about the dataset: the total number of samples, the number of invalid SMILES, and the class balance in the original, training and test sets.

In [5]:
def class_fractions(labels):
    """Return the fraction of samples per class label, ordered by label.

    Parameters
    ----------
    labels : array-like
        Class labels; flattened to 1-D before counting.

    Returns
    -------
    pandas.Series
        Index holds the distinct labels in ascending order, values hold the
        fraction of (non-missing) samples carrying each label.
    """
    # A single normalized count replaces the repeated value_counts() calls and
    # does not assume that the labels are exactly 0..k-1.
    return pd.Series(np.ravel(labels)).value_counts(normalize=True).sort_index()


def print_class_fractions(labels, set_name):
    """Print one 'bin <label> fraction in <set_name>:' line per class found in `labels`."""
    for label, fraction in class_fractions(labels).items():
        print(f'bin {label} fraction in {set_name}:', fraction)


number_of_samples, number_of_wrong_smiles, clean_df = model.getdfAnalysis(orginal_df=input)
print('Number of samples:', number_of_samples)
print('Number of wrong smiles:', number_of_wrong_smiles)

# Compare the class balance before and after the train/test split.
print_class_fractions(clean_df.mp_bin, 'original set')
print_class_fractions(y_train, 'training set')
print_class_fractions(y_test, 'test set')

Number of samples: 7000
Number of wrong smiles: 3
bin 0 fraction in original set: 0.5149349721309132
bin 1 fraction in original set: 0.48506502786908673
bin 0 fraction in training set: 0.48
bin 1 fraction in training set: 0.52
bin 0 fraction in test set: 0.42
bin 1 fraction in test set: 0.58


In [6]:
# Fit the model built above, then score it. evaluate() is the last expression
# in the cell, so its return value is displayed as the cell output.
# NOTE(review): presumably the score is computed on the held-out test split —
# confirm in MLPipeline.
model.train()
model.evaluate()

RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.58


0.58

Train the random forest classifier with hyperparameter optimization using Optuna

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.base import clone

def objective(trial, model_instance, scoring='accuracy'):
    """
    Optuna objective: mean cross-validated score of one random-forest configuration.

    The returned value is a score for which higher is better (accuracy by
    default), so the study consuming it has to maximize it, not minimize it.
    NOTE(review): the study is not created in this function — confirm that it
    is set up with direction='maximize'.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    model_instance : object
        Provides the estimator to clone (`.model`), the training data
        (`.X_train`, `.y_train`) and the cross-validation setting (`.cv`).
    scoring : str or callable, optional
        Forwarded to `cross_val_score`; defaults to 'accuracy'.

    Returns
    -------
    float
        Average of the per-fold scores.
    """
    # Search space; the suggestion order is kept stable across trials.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_categorical('max_depth', [None, 10, 20, 30, 40]),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 6),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }

    # Work on a clone so the estimator stored on model_instance is not modified by a trial.
    estimator = clone(model_instance.model)
    estimator.set_params(**params)

    fold_scores = cross_val_score(
        estimator,
        model_instance.X_train,
        model_instance.y_train,
        cv=model_instance.cv,
        scoring=scoring,
    )

    return fold_scores.mean()

In [8]:
# Same data and columns as the baseline model, now with Optuna-driven
# hyperparameter tuning. `Features` is reused instead of repeating the literal
# column list so both model cells stay in sync.
#
# The lambda looks up the global name `model` only when it is called.
# NOTE(review): this relies on MLmodel not invoking the objective during
# construction; if it did, `model` would still refer to the instance created
# in the earlier cell.
model = MLmodel(
    modelType='RandomForestClassifier',
    df=input,
    target=Target,
    features=Features,
    feature_types=Feature_types,
    hyperparameter_tuning=True,
    optimization_method='optuna',
    objective=lambda trial: objective(trial, model),
)

model.train()
predictions = model.predict()
# Last expression of the cell: its return value is displayed as the cell output.
model.evaluate()

[17:09:06] Explicit valence for atom # 10 S, 9, is greater than permitted
[17:09:08] Explicit valence for atom # 16 S, 9, is greater than permitted
[17:09:09] Explicit valence for atom # 10 S, 9, is greater than permitted
[32m2024-09-12 17:09:10.413[0m | [1mINFO    [0m | [36mMLPipeline[0m:[36m__post_init__[0m:[36m133[0m - [1mndim y_train: 1[0m
[32m2024-09-12 17:09:10.414[0m | [1mINFO    [0m | [36mMLPipeline[0m:[36m__post_init__[0m:[36m134[0m - [1mndim x_train: 2[0m
[32m2024-09-12 17:09:10.415[0m | [1mINFO    [0m | [36mMLPipeline[0m:[36m__post_init__[0m:[36m135[0m - [1mshape y_train: (50,)[0m
[32m2024-09-12 17:09:10.415[0m | [1mINFO    [0m | [36mMLPipeline[0m:[36m__post_init__[0m:[36m136[0m - [1mshape x_train: (50, 512)[0m
[I 2024-09-12 17:09:10,416] A new study created in memory with name: no-name-7d09ac84-9333-4061-9a56-a98c03c2a5c4
[I 2024-09-12 17:09:11,539] Trial 0 finished with value: 0.6599999999999999 and parameters: {'n_estimators

Best RandomForestClassifier model trained successfully with hyperparameter tuning using Optuna.
Best hyperparameters: {'n_estimators': 219, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': False}
RandomForestClassifier model trained successfully.
Accuracy for RandomForestClassifier: 0.56


0.56