This notebook creates polymer structures from a database of polymer properties and splits the data into training/validation and test sets.

In [1]:
import pandas as pd
import numpy as np
from m2p import PolyMaker

from sklearn.model_selection import train_test_split
from polyid import generate_hash
from polyid_utils import Utils

# Value passed as `DP` to pm.thermoplastic below (presumably the degree of
# polymerization -- confirm against the m2p documentation).
# 18 or greater was found to be optimal; lower it (e.g. to 8) for demonstration runs.
DP = 18

# Shared PolyMaker instance used for monomer extraction and polymer generation.
pm = PolyMaker()

2023-08-12 16:01:38.835572: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-12 16:01:39.183539: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64::/home/wilsoa6/miniconda3/envs/stonks/lib/
2023-08-12 16:01:39.183627: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64::/home/wilsoa6/miniconda

In [2]:
# Read the table of polymer properties; the first CSV column becomes the index.
data = pd.read_csv('../data/SI_Table-of-polymer-properties.csv', index_col=0)

# Generate the monomers tuple for each row from its monomer SMILES
# (stereochemistry is explicitly disabled).
data['monomers'] = data.smiles_monomer.apply(
    lambda smi: pm.get_monomers(smi, stereochemistry=False)
)

# Pivot and aggregate into the train/val/test layout: one row per
# (smiles_monomer, monomers, mechanism), one column per property, with
# repeated values for the same property averaged.
# The aggregation is given as the string 'mean' rather than np.mean: passing the
# NumPy callable is deprecated in pandas (FutureWarning from 2.1 on) and the
# string form selects the same groupby mean.
data = data.pivot_table(
    index=['smiles_monomer', 'monomers', 'mechanism'],
    columns='property',
    values='value',
    aggfunc='mean',
).reset_index()

# Select every column whose name contains 'Permeability'.
is_permeability = data.columns.str.contains('Permeability')
cols_log = data.columns[is_permeability].to_list()

# Replace those columns with their base-10 logarithm and prefix the names with
# 'log10_' so the transformation is visible in the column name.
data[cols_log] = np.log10(data[cols_log])
data = data.rename(columns={col: 'log10_' + col for col in cols_log})

# Generate polymer structures using the configured DP, all mechanisms,
# and 1 replicate structure per entry.
data = pm.thermoplastic(data, DP=DP, mechanism='all', replicate_structures=1)

# Rows whose polymer SMILES contains 'ERR' are treated as failed reactions:
# keep them aside in `dferror` and drop them from `data`.
has_error = data.smiles_polymer.str.contains('ERR')
dferror = data[has_error]
data = data[~has_error]

# Hash columns for tracking predictions and structures: first over
# (monomers, replicate_structure), then over monomers alone.
# reset_index() is called without drop=True, so the previous index is kept
# as a regular column before the second hash is generated.
data = generate_hash(data, hash_cols=['monomers', 'replicate_structure'])
data = generate_hash(data.reset_index(), hash_cols=['monomers'])

# Hold out 20% of the rows as the test set, stratified on mechanism
# (polymer type); random_state is fixed so the split is repeatable.
dftrain, dftest = train_test_split(
    data,
    test_size=0.2,
    stratify=data['mechanism'],
    random_state=0,
)

# Sanity check: neither split may contain a polymer flagged with 'ERR'.
assert len(dftrain.loc[dftrain.smiles_polymer.str.contains('ERR')]) == 0
assert len(dftest.loc[dftest.smiles_polymer.str.contains('ERR')]) == 0

# Persist both splits next to the source data.
dftrain.to_csv('../data/dftrain.csv')
dftest.to_csv('../data/dftest.csv')

100%|██████████| 994/994 [08:30<00:00,  1.95it/s]
