# SMILES Augmented Data Generation

The core dataset used for these tests was generated by [Liu et al.](https://pubs.acs.org/doi/10.1021/acscentsci.7b00303) and is available [here](https://github.com/pandegroup/reaction_prediction_seq2seq/tree/master/processed_data).

The function used for SMILES enumeration was taken from [this repo](https://github.com/EBjerrum/SMILES-enumeration) with minor adaptations for Python 3 compatibility.

The code below generates three augmented datasets with 4x, 16x and 40x augmentation over the original dataset.

The baseline dataset and the generated augmented datasets can be downloaded [here](https://www.dropbox.com/s/ze4bdif8sqjx5jx/Retrosynthesis%20Data.zip?dl=0).

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from SmilesEnumerator import SmilesEnumerator
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import pandas as pd
import random

## Small Augmentation

This class generates four versions of each input datapoint. Specifically, one augmented source sequence and one augmented target sequence are generated and combined with the un-augmented source and target sequences:

```
source + target
source_augmented + target
source + target_augmented
source_augmented + target_augmented
```

In [3]:
# Dataset root folder; it is passed to the augmentation classes below, which
# read their input files from it and are given output folders underneath it.
path = Path('./USPTO-50K')

In [4]:
class SmallAug():
    """Build a 4x SMILES-augmented copy of the training reactions.

    For every (source, target) training pair, one randomized source SMILES
    and one randomized target SMILES are generated and combined with the
    un-augmented strings into four rows (see ``augment_rxn``).
    """

    def __init__(self, path):
        """Load the training pairs.

        Args:
            path: ``pathlib.Path`` of the dataset root. ``src/src-train.txt``
                and ``tgt/tgt-train.txt`` are read from it.
        """
        train_source = self._read_lines(path/'src/src-train.txt')
        train_targs = self._read_lines(path/'tgt/tgt-train.txt')
        # Lines are paired by position; zip stops at the shorter file.
        self.data = list(zip(train_source, train_targs))

    def augment(self, save_folder_path):
        """Augment every loaded pair and write the results to disk.

        Args:
            save_folder_path: ``pathlib.Path`` of the output folder.
        """
        self.generate_augs(self.data)
        self.save_df(save_folder_path)

    def generate_augs(self, data):
        """Augment each pair in ``data`` and store the rows in ``self.df``.

        Args:
            data: iterable of ``(source_line, target_line)`` string pairs.
        """
        with ThreadPoolExecutor(8) as ex:
            # ex.map yields results in input order, so the position of each
            # result is the index of the reaction it came from.
            aug_data = list(ex.map(self.augment_rxn, data))

        self.df = self._build_frame(aug_data)

    def save_df(self, path):
        """Write ``self.df`` as a CSV plus tokenized source/target text files.

        Args:
            path: ``pathlib.Path`` of the output folder; created if missing.
        """
        # Create the folder up front so the writes below cannot fail on a
        # missing directory.
        path.mkdir(parents=True, exist_ok=True)
        self.df.to_csv(path/'augmented_df.csv', index=False)

        with open(path/'train_sources_augmented.txt', 'w') as f:
            for source in self.df.Source.values:
                # Source rows are "<reaction class> <SMILES>"; the class stays
                # one token while every SMILES character becomes its own token.
                rxn, smile = source.split(' ')
                f.write(rxn + ' ' + ' '.join(smile) + '\n')

        with open(path/'train_targets_augmented.txt', 'w') as f:
            for target in self.df.Target.values:
                f.write(' '.join(target) + '\n')

    def augment_rxn(self, data):
        """Return the four source/target combinations for one reaction.

        Args:
            data: ``(source_line, target_line)`` pair as read from the files.

        Returns:
            A list of four ``[source, target]`` rows: original/original,
            augmented/original, augmented/augmented, original/augmented.
        """
        rxn_class, source_smile, targ_smile = self._split_pair(data)

        sme = SmilesEnumerator()
        source_aug = rxn_class + ' ' + sme.randomize_smiles(source_smile)
        targ_aug = sme.randomize_smiles(targ_smile)
        source_orig = rxn_class + ' ' + source_smile

        return [[source_orig, targ_smile],
                [source_aug, targ_smile],
                [source_aug, targ_aug],
                [source_orig, targ_aug]]

    @staticmethod
    def _read_lines(file_path):
        """Return the lines of ``file_path`` (newlines kept), closing the file."""
        with open(file_path) as f:
            return list(f)

    @staticmethod
    def _split_pair(data):
        """Split a raw pair into ``(rxn_class, source_smile, targ_smile)``.

        The SMILES strings are returned with their token spaces removed.
        """
        source = data[0].strip('\n')
        targ = data[1].strip('\n')

        # The first space-separated token of the source line is the reaction
        # class; everything after the first "> " is the tokenized SMILES.
        rxn_class = source.split(' ')[0]
        source_smile = ''.join(source.split('> ')[1].split(' '))
        targ_smile = ''.join(targ.split(' '))
        return rxn_class, source_smile, targ_smile

    @staticmethod
    def _build_frame(aug_data):
        """Stack per-reaction row lists into one frame tagged with ``rxn_number``."""
        frames = []
        for i, rows in enumerate(aug_data):
            df_i = pd.DataFrame(rows, columns=['Source', 'Target'])
            df_i['rxn_number'] = i
            frames.append(df_i)

        if not frames:
            # pd.concat raises on an empty list; keep the expected columns.
            return pd.DataFrame(columns=['Source', 'Target', 'rxn_number'])

        # One concat instead of repeated DataFrame.append, which copies the
        # whole frame on every call and no longer exists in pandas 2.x.
        return pd.concat(frames, ignore_index=True)

In [5]:
# Load the training pairs (src/src-train.txt and tgt/tgt-train.txt under `path`).
sa = SmallAug(path)

In [7]:
# Generate the four rows per reaction and write the CSV plus the tokenized
# source/target text files into this folder.
sa.augment(path/'augmenteed_data_small')

## Medium Augmentation

The logic here is similar to above. This time we pass an `n_augs` parameter that controls the number of augmented datapoints generated. The class generates `n_augs` augmented datapoints plus the original datapoint for a total of `n_augs + 1` datapoints.

For medium augmentation we use `n_augs = 15`

In [8]:
class MediumAug():
    """Build an ``(n_augs + 1)``x SMILES-augmented copy of the training set.

    Each reaction yields ``n_augs`` rows pairing a randomized source with a
    randomized target, followed by one row holding the un-augmented pair.
    """

    def __init__(self, path, n_augs):
        """Load the training pairs.

        Args:
            path: ``pathlib.Path`` of the dataset root.
                ``processed_data/train_sources`` and
                ``processed_data/train_targets`` are read from it.
            n_augs: number of augmented rows to generate per reaction.
        """
        train_source = self._read_lines(path/'processed_data/train_sources')
        train_targs = self._read_lines(path/'processed_data/train_targets')
        # Lines are paired by position; zip stops at the shorter file.
        self.data = list(zip(train_source, train_targs))
        self.n_augs = n_augs

    def augment(self, save_folder_path):
        """Augment every loaded pair and write the results to disk.

        Args:
            save_folder_path: ``pathlib.Path`` of the output folder.
        """
        self.generate_augs(self.data)
        self.save_df(save_folder_path)

    def generate_augs(self, data):
        """Augment each pair in ``data`` and store the rows in ``self.df``.

        Args:
            data: iterable of ``(source_line, target_line)`` string pairs.
        """
        with ThreadPoolExecutor(8) as ex:
            # ex.map yields results in input order, so the position of each
            # result is the index of the reaction it came from.
            aug_data = list(ex.map(self.augment_rxn, data))

        self.df = self._build_frame(aug_data)

    def save_df(self, path):
        """Write ``self.df`` as a CSV plus tokenized source/target text files.

        Args:
            path: ``pathlib.Path`` of the output folder; created if missing.
        """
        # Create the folder up front so the writes below cannot fail on a
        # missing directory.
        path.mkdir(parents=True, exist_ok=True)
        self.df.to_csv(path/'augmented_df.csv', index=False)

        with open(path/'train_sources_augmented.txt', 'w') as f:
            for source in self.df.Source.values:
                # Source rows are "<reaction class> <SMILES>"; the class stays
                # one token while every SMILES character becomes its own token.
                rxn, smile = source.split(' ')
                f.write(rxn + ' ' + ' '.join(smile) + '\n')

        with open(path/'train_targets_augmented.txt', 'w') as f:
            for target in self.df.Target.values:
                f.write(' '.join(target) + '\n')

    def augment_rxn(self, data):
        """Return ``n_augs`` augmented rows plus the original row.

        Args:
            data: ``(source_line, target_line)`` pair as read from the files.

        Returns:
            A list of ``n_augs + 1`` ``[source, target]`` rows; the last row
            is the un-augmented pair.
        """
        rxn_class, source_smile, targ_smile = self._split_pair(data)
        sme = SmilesEnumerator()
        return self._pair_variants(rxn_class, source_smile, targ_smile,
                                   sme.randomize_smiles, self.n_augs)

    @staticmethod
    def _pair_variants(rxn_class, source_smile, targ_smile, randomize, n_augs):
        """Pair ``n_augs`` randomized sources/targets, then append the originals.

        ``randomize`` is a callable mapping a SMILES string to a variant.
        """
        source_aug = [rxn_class + ' ' + randomize(source_smile) for _ in range(n_augs)]
        source_aug.append(rxn_class + ' ' + source_smile)

        # Use the same count as for the sources: a fixed count here would let
        # zip drop rows (including the original pair) whenever n_augs differs.
        targ_aug = [randomize(targ_smile) for _ in range(n_augs)]
        targ_aug.append(targ_smile)

        return [[s, t] for s, t in zip(source_aug, targ_aug)]

    @staticmethod
    def _read_lines(file_path):
        """Return the lines of ``file_path`` (newlines kept), closing the file."""
        with open(file_path) as f:
            return list(f)

    @staticmethod
    def _split_pair(data):
        """Split a raw pair into ``(rxn_class, source_smile, targ_smile)``.

        The SMILES strings are returned with their token spaces removed.
        """
        source = data[0].strip('\n')
        targ = data[1].strip('\n')

        # The first space-separated token of the source line is the reaction
        # class; everything after the first "> " is the tokenized SMILES.
        rxn_class = source.split(' ')[0]
        source_smile = ''.join(source.split('> ')[1].split(' '))
        targ_smile = ''.join(targ.split(' '))
        return rxn_class, source_smile, targ_smile

    @staticmethod
    def _build_frame(aug_data):
        """Stack per-reaction row lists into one frame tagged with ``rxn_number``."""
        frames = []
        for i, rows in enumerate(aug_data):
            df_i = pd.DataFrame(rows, columns=['Source', 'Target'])
            df_i['rxn_number'] = i
            frames.append(df_i)

        if not frames:
            # pd.concat raises on an empty list; keep the expected columns.
            return pd.DataFrame(columns=['Source', 'Target', 'rxn_number'])

        # One concat instead of repeated DataFrame.append, which copies the
        # whole frame on every call and no longer exists in pandas 2.x.
        return pd.concat(frames, ignore_index=True)

In [9]:
# n_augs=15: fifteen augmented pairs plus the original pair per reaction (16x).
ma = MediumAug(path, 15)

In [10]:
# Generate the augmented rows and write the CSV plus the tokenized
# source/target text files into this folder.
ma.augment(path/'augmenteed_data_medium')

## Large Augmentation

Larger scale augmentation requires a little extra logic. When generating a large number of augmented sequences randomly, there is a good chance the same augmented variant will appear more than once. This class generates `4 * n_augs` augmented variants, reduces the set of augmented SMILES to a unique set, then pulls up to `n_augs` samples from the unique set.

For large augmentation, we use `n_augs = 40`

In [11]:
class LargeAug():
    """Build a large SMILES-augmented copy of the training set.

    For each reaction, ``4 * n_augs`` randomized variants are drawn for the
    source and for the target, de-duplicated, and at most ``n_augs`` unique
    variants of each are kept and paired up.
    """

    def __init__(self, path, n_augs):
        """Load the training pairs.

        Args:
            path: ``pathlib.Path`` of the dataset root.
                ``processed_data/train_sources`` and
                ``processed_data/train_targets`` are read from it.
            n_augs: maximum number of augmented rows to keep per reaction.
        """
        train_source = self._read_lines(path/'processed_data/train_sources')
        train_targs = self._read_lines(path/'processed_data/train_targets')
        # Lines are paired by position; zip stops at the shorter file.
        self.data = list(zip(train_source, train_targs))
        self.n_augs = n_augs

    def augment(self, save_folder_path):
        """Augment every loaded pair and write the results to disk.

        Args:
            save_folder_path: ``pathlib.Path`` of the output folder.
        """
        self.generate_augs(self.data)
        self.save_df(save_folder_path)

    def generate_augs(self, data):
        """Augment each pair in ``data`` and store the rows in ``self.df``.

        Args:
            data: iterable of ``(source_line, target_line)`` string pairs.
        """
        with ThreadPoolExecutor(8) as ex:
            # ex.map yields results in input order, so the position of each
            # result is the index of the reaction it came from.
            aug_data = list(ex.map(self.augment_rxn, data))

        self.df = self._build_frame(aug_data)

    def save_df(self, path):
        """Write ``self.df`` as a CSV plus tokenized source/target text files.

        Args:
            path: ``pathlib.Path`` of the output folder; created if missing.
        """
        # Create the folder up front so the writes below cannot fail on a
        # missing directory.
        path.mkdir(parents=True, exist_ok=True)
        self.df.to_csv(path/'augmented_df.csv', index=False)

        with open(path/'train_sources_augmented.txt', 'w') as f:
            for source in self.df.Source.values:
                # Source rows are "<reaction class> <SMILES>"; the class stays
                # one token while every SMILES character becomes its own token.
                rxn, smile = source.split(' ')
                f.write(rxn + ' ' + ' '.join(smile) + '\n')

        with open(path/'train_targets_augmented.txt', 'w') as f:
            for target in self.df.Target.values:
                f.write(' '.join(target) + '\n')

    def augment_rxn(self, data):
        """Return up to ``n_augs`` rows of unique augmented source/target pairs.

        Args:
            data: ``(source_line, target_line)`` pair as read from the files.

        Returns:
            A list of ``[source, target]`` rows. Its length is the smaller of
            the two unique-variant counts (each capped at ``n_augs``), because
            the variants are paired with ``zip``.
        """
        rxn_class, source_smile, targ_smile = self._split_pair(data)

        sme = SmilesEnumerator()
        source_aug = self._unique_variants(sme.randomize_smiles, source_smile, self.n_augs)
        targ_aug = self._unique_variants(sme.randomize_smiles, targ_smile, self.n_augs)

        return [[rxn_class + ' ' + s, t] for s, t in zip(source_aug, targ_aug)]

    @staticmethod
    def _unique_variants(randomize, smile, n_augs):
        """Draw ``4 * n_augs`` variants of ``smile`` and keep at most ``n_augs`` unique ones.

        ``randomize`` is a callable mapping a SMILES string to a variant.
        """
        # Oversample so that enough distinct strings remain after duplicates
        # are removed.
        unique = list({randomize(smile) for _ in range(n_augs * 4)})
        if len(unique) > n_augs:
            unique = random.sample(unique, n_augs)
        return unique

    @staticmethod
    def _read_lines(file_path):
        """Return the lines of ``file_path`` (newlines kept), closing the file."""
        with open(file_path) as f:
            return list(f)

    @staticmethod
    def _split_pair(data):
        """Split a raw pair into ``(rxn_class, source_smile, targ_smile)``.

        The SMILES strings are returned with their token spaces removed.
        """
        source = data[0].strip('\n')
        targ = data[1].strip('\n')

        # The first space-separated token of the source line is the reaction
        # class; everything after the first "> " is the tokenized SMILES.
        rxn_class = source.split(' ')[0]
        source_smile = ''.join(source.split('> ')[1].split(' '))
        targ_smile = ''.join(targ.split(' '))
        return rxn_class, source_smile, targ_smile

    @staticmethod
    def _build_frame(aug_data):
        """Stack per-reaction row lists into one frame tagged with ``rxn_number``."""
        frames = []
        for i, rows in enumerate(aug_data):
            df_i = pd.DataFrame(rows, columns=['Source', 'Target'])
            df_i['rxn_number'] = i
            frames.append(df_i)

        if not frames:
            # pd.concat raises on an empty list; keep the expected columns.
            return pd.DataFrame(columns=['Source', 'Target', 'rxn_number'])

        # One concat instead of repeated DataFrame.append, which copies the
        # whole frame on every call and no longer exists in pandas 2.x.
        return pd.concat(frames, ignore_index=True)

In [12]:
# n_augs=40: keep up to 40 unique augmented pairs per reaction.
la = LargeAug(path, 40)

In [13]:
# Generate the augmented rows and write the CSV plus the tokenized
# source/target text files into this folder.
la.augment(path/'augmenteed_data_large')