In [1]:
import pandas as pd
from m2p import PolyMaker

from sklearn.model_selection import train_test_split
from polyid import generate_hash

# Shared PolyMaker instance; the data-preparation cell below calls its
# checksmile, get_monomers and thermoplastic methods.
pm = PolyMaker()

In [2]:
# load the monomer SMILES table (the first CSV column becomes the index)
data = pd.read_csv('../data/example_monomer_smiles.csv', index_col=0)

# canonicalize smiles with pm.checksmile (rdkit-based, per the original note)
# and keep only that single column going forward
canonical_smiles = data['smiles_monomer'].apply(pm.checksmile)
data = pd.DataFrame(canonical_smiles)

# generate monomers tuple
data['monomers'] = data['smiles_monomer'].apply(pm.get_monomers)

# generate polymer structures with DP=8 and 2 replicate structures
data = pm.thermoplastic(data, DP=8, mechanism='all', replicate_structures=2)

# set aside any polymers whose reaction resulted in an error ('ERR' marker in
# the polymer smiles) and continue with the remaining rows only
has_error = data['smiles_polymer'].str.contains('ERR')
dferror = data[has_error]
data = data[~has_error]

# hash columns for tracking predictions and structures:
# first on (monomers, replicate_structure), then on monomers alone
data = generate_hash(data, hash_cols=['monomers', 'replicate_structure'])
data = generate_hash(data.reset_index(), hash_cols=['monomers'])

# 80/20 train/test split, stratified across mechanism (polymer type);
# the split itself is drawn on the replicate_structure == 0 rows only
data_0 = data[data['replicate_structure'].eq(0)]
dftrain_0, dftest_0 = train_test_split(
    data_0,
    test_size=0.2,
    stratify=data_0['mechanism'],
    random_state=0,
)

# select from the full table every row whose index label fell on each side
dftrain = data.loc[dftrain_0.index]
dftest = data.loc[dftest_0.index]

100%|██████████| 578/578 [01:30<00:00,  6.41it/s]


In [3]:
# verify neither split contains a polymer flagged with a polymerization error
for split in (dftrain, dftest):
    assert split[split['smiles_polymer'].str.contains('ERR')].empty

# save both splits as CSV
dftrain.to_csv('../data/example_polymer_smiles_train.csv')
dftest.to_csv('../data/example_polymer_smiles_test.csv')