In [1]:
from polyid.preprocessors import PolymerPreprocessor
from polyid import MultiModel, Parameters
from polyid.models import global100

from nfp.preprocessing.features import atom_features_v1, bond_features_v1
from model_utils import bond_featurizer

import pandas as pd
import numpy as np

# Generate Synthetic data for Example

In [2]:
# The example SMILES files carry no property columns, so fabricate two targets
# ("fake_Tg" and "fake_Tm") purely to demonstrate the training workflow.
SYNTHETIC_SEED = 42  # fixed seed: Restart & Run All now writes identical CSVs
FAKE_TG_LOW, FAKE_TG_HIGH = -100, 250  # bounds of the uniform fake_Tg draw


def add_fake_targets(df, rng):
    """Return a copy of ``df`` extended with synthetic target columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is left unmodified.
    rng : numpy.random.Generator
        Random generator used for the ``fake_Tg`` draw.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``fake_Tg`` drawn uniformly between FAKE_TG_LOW
        and FAKE_TG_HIGH (one value per row) and
        ``fake_Tm = fake_Tg*0.62 + 100``.
    """
    out = df.copy()
    out['fake_Tg'] = rng.uniform(low=FAKE_TG_LOW,
                                 high=FAKE_TG_HIGH,
                                 size=len(out))
    # fake_Tm is a deterministic linear function of fake_Tg.
    out['fake_Tm'] = out['fake_Tg']*0.62+100
    return out


# One generator shared by both splits, so train and test get different draws
# while the whole cell stays reproducible.
synthetic_rng = np.random.default_rng(SYNTHETIC_SEED)

# training data
dftrain = add_fake_targets(
    pd.read_csv('../data/example_polymer_smiles_train.csv', index_col=0),
    synthetic_rng,
)
dftrain.to_csv('../data/example_polymer_data_train.csv')

# test data
dftest = add_fake_targets(
    pd.read_csv('../data/example_polymer_smiles_test.csv', index_col=0),
    synthetic_rng,
)
dftest.to_csv('../data/example_polymer_data_test.csv')

# Generation and Training of Data
This notebook walks through an example workflow that uses polyID to train a machine learning model and then predict the glass transition temperature (Tg) and melting temperature (Tm) of polymers.

# Example Training using Only Structure

In [3]:
# Model parameters: Parameters() starts from default values, any of which can
# be overridden. Here only the columns to predict are set explicitly.
params = Parameters()
params.prediction_columns = ["fake_Tg", "fake_Tm"]
print(params.to_dict())

# MultiModel is the manager class for the multiple SingleModels.
mm = MultiModel()

# Step 1: load the training data, naming the prediction columns.
train_csv = '../data/example_polymer_data_train.csv'
mm.load_dataset(
    train_csv,
    prediction_columns=params.prediction_columns,
)

# Step 2: split the data into k folds and generate the model classes.
n_folds = params.kfolds
mm.split_data(kfolds=n_folds)

# Step 3: scale the data. The scaler is built from the entire data set and
# then used to scale each individual model.
mm.generate_data_scaler()

# Step 4: generate a preprocessor for each model. This preprocessor uses
# just the SMILES.
featurizers = dict(
    atom_features=atom_features_v1,
    bond_features=bond_features_v1,
)
mm.generate_preprocessors(preprocessor=PolymerPreprocessor, **featurizers)

# Step 5: train the models.
training_config = params.to_dict()
mm.train_models(
    modelbuilder=global100,
    model_params=training_config,
    save_folder="save_example",
    save_training=True,
)

{'kfolds': 3, 'prediction_columns': ['fake_Tg', 'fake_Tm'], 'atom_features': 32, 'mol_features': 8, 'num_messages': 2, 'batch_size': 64, 'epochs': 5, 'learning_rate': 0.0005, 'dropout': 0.05, 'decay': 1e-05, 'bond_features': 32}


2022-08-24 08:53:27.663475: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<bound method Model.summary of <keras.engine.functional.Functional object at 0x7fa718105690>>
Epoch 1/5
    300/Unknown - 13s 15ms/step - loss: 78.3854
Epoch 1: val_loss improved from inf to 89.32783, saving model to save_example/model_0/model_0.h5
Epoch 2/5
Epoch 2: val_loss improved from 89.32783 to 76.84061, saving model to save_example/model_0/model_0.h5
Epoch 3/5
Epoch 3: val_loss did not improve from 76.84061
Epoch 4/5
Epoch 4: val_loss did not improve from 76.84061
Epoch 5/5
Epoch 5: val_loss improved from 76.84061 to 74.15756, saving model to save_example/model_0/model_0.h5
<bound method Model.summary of <keras.engine.functional.Functional object at 0x7fa719a0f650>>
Epoch 1/5
    313/Unknown - 12s 13ms/step - loss: 85.1551
Epoch 1: val_loss improved from inf to 68.01556, saving model to save_example/model_1/model_1.h5
Epoch 2/5
Epoch 2: val_loss did not improve from 68.01556
Epoch 3/5
Epoch 3: val_loss improved from 68.01556 to 67.97594, saving model to save_example/model_1/mod