In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.Draw import rdDepictor, rdMolDraw2D
import mols2grid
import random

opts = Draw.DrawingOptions()

In [2]:
##### Function Definitions #####

### Determine Active Tags In Group Coulumn (WuXi Specific)
### Input: Data frame
### Output: List of "reactive groups"
def col_active(dfa):
    col = []
    for i in range(0, len(dfa)):
        a = dfa.iloc[i]['group']
        b = a.replace(" ", "")
        c = b.split(',')
        #print(a, b, c)
        for j in c:
            if j not in col:
                col.append(j)
        #print(i,col)
    return col

### Select Random Subset of Molecules
### Input: Data Frame with at least one identifier and SMILES, number of molecules to be picked
### Output: Data frame with Identifier, SMILES, and selected molecules
def mol_random(df, mid, smiles, n):
    m = pd.DataFrame(df, columns =[mid, smiles])
    if n > len(m):
        n = len(m)
    a = random.sample(range(0, len(m)), n)
    moldf = m.iloc[a]
    #print(moldf)
    return(moldf)

### Remove a functional group in SMILES
### Input: SMILES of molecule and SMILES of group to be removed
### Output: SMILES of molecule with group removed
def remove_groups_in_smiles(smiles, fragment):
    core_mol = Chem.MolFromSmiles(smiles)
    pattern_mol = Chem.MolFromSmiles(fragment)  
    products = Chem.DeleteSubstructs(core_mol, pattern_mol)
    replaced_smiles = Chem.CanonSmiles(Chem.MolToSmiles(products))

    return replaced_smiles



### Replacing an atom with a different functional group in SMILES
def replace_groups_in_smiles_HP(smiles, fragment, replace_with):
    core_mol = Chem.MolFromSmiles(smiles)
    pattern_mol = Chem.MolFromSmiles(fragment)
    replacement_mol = Chem.MolFromSmiles(replace_with)
    
    products = AllChem.ReplaceSubstructs(core_mol, pattern_mol, replacement_mol)
    replaced_smiles = Chem.MolToSmiles(products[0])

    return replaced_smiles

### Replacing an atom with a different functional group in SMILES
def replace_groups_in_smiles(smiles, group_fragment, fgroup_smiles, replace_with):
    core_mol = Chem.MolFromSmiles(smiles)
    pattern_mol = Chem.MolFromSmiles(group_fragment)
    replacement_mol = Chem.MolFromSmiles(replace_with)
    
    products = AllChem.ReplaceSubstructs(core_mol, pattern_mol, replacement_mol)
    replaced_smiles = Chem.MolToSmiles(products[0])

    return replaced_smiles



In [3]:
### Read Wuxi Buildimg Block EXCEL (all sheets)
### Drop Duplicate Rows in Data Frame
### Removal List

rem_list = ['MFCD00009801']

df_wuxi_raw = pd.concat(pd.read_excel('../Available building blocks.xlsx', sheet_name = None), ignore_index = True)

#print(df_wuxi_raw)

a = df_wuxi_raw[df_wuxi_raw['solid_stock(mg)'] >= 100]
#drop rows that contain any removal items
b = a[a['MFCD'].isin(rem_list) == False]

df_wuxi = b.drop_duplicates()

print(df_wuxi)

             MW            group          MFCD                    CAS  \
0      200.0753        ArBr, NH2  MFCD00008189             73918-56-6   
1      113.1209              NH2  MFCD28118594            107269-68-1   
2      144.2141              NH2  MFCD00006184               123-00-2   
3      125.1712              NH2  MFCD00009819              5036-48-6   
4      113.2001              NH2  MFCD00001495  33483-65-7, 6321-23-9   
...         ...              ...           ...                    ...   
21636  321.3706  COOH, BocN, C=C    YYL-33-013                    NaN   
21638  293.3176  COOH, BocN, C=C    YYL-33-008                    NaN   
21639  265.3075        CHO, BocN  MFCD18792286           1251000-44-8   
21640  285.3352       COOH, BocN  MFCD28396327                    NaN   
21641  281.3069       COOH, BocN  MFCD28992185                    NaN   

                                                  SMILES  solid_stock(mg)  
0                                     NCCC1=CC=

In [4]:
### Expand Reagent List to Columns

col_wuxi = col_active(df_wuxi)
print(col_wuxi)

for i in col_wuxi:
    df_wuxi[i] = 0

print(df_wuxi)

['ArBr', 'NH2', 'BocN', 'COOMe', 'OH', 'COOH', 'FmocN', 'C≡C', 'ArCl', 'C=C', 'X', 'CbzN', 'COOEt', 'COOtBu', 'B(OH)2', 'ArNO2', 'NH', 'RCOR', 'CHO', 'ArI', 'ArF', 'NO2', 'ArNH', 'ArOH', 'CN', 'Bpin', 'COOiPr', 'B(OCR)2', 'NH2COOH', 'BF3K', 'ArNH2', 'SO2Cl', 'N3', 'SO2F', 'CONH', 'SO2Na', 'CF3CH2O', 'NCO', 'BrCH2CO', 'AllocN']
             MW            group          MFCD                    CAS  \
0      200.0753        ArBr, NH2  MFCD00008189             73918-56-6   
1      113.1209              NH2  MFCD28118594            107269-68-1   
2      144.2141              NH2  MFCD00006184               123-00-2   
3      125.1712              NH2  MFCD00009819              5036-48-6   
4      113.2001              NH2  MFCD00001495  33483-65-7, 6321-23-9   
...         ...              ...           ...                    ...   
21636  321.3706  COOH, BocN, C=C    YYL-33-013                    NaN   
21638  293.3176  COOH, BocN, C=C    YYL-33-008                    NaN   
21639  265.307

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [5]:
### Enter Reactive Groups into Column matrix

for i in range(0, len(df_wuxi)):
    a = df_wuxi.iloc[i]['group']
    b = a.replace(" ", "")
    c = b.split(',')
    d = df_wuxi.index[i]
    #print(d, c)
    for j in c:
        #print(d, j)
        df_wuxi.loc[d,j] = 1
    #print(df_wuxi.iloc[i])

print(df_wuxi)

             MW            group          MFCD                    CAS  \
0      200.0753        ArBr, NH2  MFCD00008189             73918-56-6   
1      113.1209              NH2  MFCD28118594            107269-68-1   
2      144.2141              NH2  MFCD00006184               123-00-2   
3      125.1712              NH2  MFCD00009819              5036-48-6   
4      113.2001              NH2  MFCD00001495  33483-65-7, 6321-23-9   
...         ...              ...           ...                    ...   
21636  321.3706  COOH, BocN, C=C    YYL-33-013                    NaN   
21638  293.3176  COOH, BocN, C=C    YYL-33-008                    NaN   
21639  265.3075        CHO, BocN  MFCD18792286           1251000-44-8   
21640  285.3352       COOH, BocN  MFCD28396327                    NaN   
21641  281.3069       COOH, BocN  MFCD28992185                    NaN   

                                                  SMILES  solid_stock(mg)  \
0                                     NCCC1=CC

In [6]:
### Reductive Alkylation on Resin

df_red_amin = df_wuxi[(df_wuxi['NH2'] == 1) 
                     & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0) & (df_wuxi['NH'] == 0)  
                     & (df_wuxi['N3'] == 0) & (df_wuxi['CHO'] == 0) & (df_wuxi['COOH'] == 0) & (df_wuxi['FmocN'] == 0)
                     & (df_wuxi['ArNH'] == 0) & (df_wuxi['COOtBu'] == 0)
                     & (df_wuxi['OH'] == 0)] 
#& (df_wuxi['BocN'] == 1) & (df_wuxi['NH'] == 0) 
                       #  & (df_wuxi['NH2'] == 0) & (df_wuxi['ArNH'] == 0) 
                       #  ]


print(col_active(df_red_amin))
print()
print(df_red_amin)

['ArBr', 'NH2', 'BocN', 'COOMe', 'C≡C', 'ArCl', 'C=C', 'X', 'CbzN', 'COOEt', 'ArNO2', 'RCOR', 'ArI', 'ArF', 'NO2', 'ArOH', 'CN', 'COOiPr']

            MW      group          MFCD                    CAS  \
0     200.0753  ArBr, NH2  MFCD00008189             73918-56-6   
1     113.1209        NH2  MFCD28118594            107269-68-1   
2     144.2141        NH2  MFCD00006184               123-00-2   
3     125.1712        NH2  MFCD00009819              5036-48-6   
4     113.2001        NH2  MFCD00001495  33483-65-7, 6321-23-9   
...        ...        ...           ...                    ...   
1950  179.2152        NH2  MFCD26097023           1439900-28-3   
1951  167.2227        NH2  MFCD12757052            321840-52-2   
1952  153.2242        NH2  MFCD20231586           1334492-60-2   
1953  266.1333  ArBr, NH2  MFCD12824528                      0   
1954  122.1673        NH2  MFCD00006367              2706-56-1   

                        SMILES  solid_stock(mg)  ArBr  NH2  BocN  C

In [7]:
### Suzuki Attached to Resin

df_suzuki_in = df_wuxi[(df_wuxi['CHO'] == 1) & (df_wuxi['ArBr'] == 1) 
                       & (df_wuxi['COOH'] == 0)]

print(col_active(df_suzuki_in))
print()
print(df_suzuki_in)

['CHO', 'ArBr', 'ArNO2', 'OH', 'ArNH2', 'ArCl', 'ArF', 'COOMe', 'COOEt', 'NH2', 'ArNH', 'BocN']

             MW                  group          MFCD           CAS  \
11272  220.9988              CHO, ArBr  MFCD08282773    57329-38-1   
11273  203.0083              CHO, ArBr  MFCD00070755  1306606-90-5   
11277  235.0764              CHO, ArBr  MFCD11101026   138505-25-6   
11296  263.9139        CHO, ArBr, ArBr  MFCD00156887  1018828-16-4   
11308  230.0154       CHO, ArBr, ArNO2  MFCD00462865    59142-68-6   
...         ...                    ...           ...           ...   
15650  219.4629        CHO, ArBr, ArCl  MFCD12024585  1197050-28-4   
17318  248.0059  CHO, ArBr, ArNO2, ArF  MFCD10758067   679839-39-5   
17555  248.0059  CHO, ArBr, ArNO2, ArF  MFCD28739866   213382-40-2   
21353  381.2238        CHO, BocN, ArBr  WXCD00132230           NaN   
21363  381.2634        CHO, BocN, ArBr  WXCD00131861           NaN   

                                                  SMILES  soli

In [24]:
### Suzuki into Resin

df_suzuki_out = df_wuxi[((df_wuxi['B(OH)2'] == 1) | (df_wuxi['Bpin'] == 1)) 
                        & ((df_wuxi['ArNH2'] == 1) |  (df_wuxi['NH2'] == 1) | (df_wuxi['BocN'] == 1) | (df_wuxi['NH'] == 1))
                        & (df_wuxi['COOH'] == 0) & (df_wuxi['ArBr'] == 0) 
                        & (df_wuxi['C=C'] == 0)]

print(col_active(df_suzuki_out))
print()
print(df_suzuki_out)

['B(OH)2', 'NH2', 'Bpin', 'NH', 'ArCl', 'ArNH2', 'BocN', 'COOMe', 'ArNH']

             MW             group          MFCD           CAS  \
212    168.9609       B(OH)2, NH2  MFCD04115666   850568-03-5   
808    164.9969       NH2, B(OH)2  MFCD01075172    76410-58-7   
1007   233.1136         Bpin, NH2  MFCD02179455   850568-55-7   
1117   150.9704       B(OH)2, NH2  MFCD01632199    75705-21-4   
2505   289.2196          Bpin, NH  MFCD12546203  1628014-71-0   
...         ...               ...           ...           ...   
20667  363.2583        Bpin, BocN  MFCD18383284   877399-31-0   
20974  359.2663        Bpin, BocN  MFCD11044669   893566-72-8   
21371  439.3741        Bpin, BocN  MFCD30469041   587871-47-4   
21452  345.2398        BocN, Bpin  MFCD18383572  1235451-62-3   
21476  362.2702  ArNH, Bpin, BocN  MFCD30186059  1627580-07-7   

                                                  SMILES  solid_stock(mg)  \
212                                 NCc1c(B(O)O)cc(F)cc1          1

In [9]:
### Amino Acid Extension After Suzuki

df_amid_extend = df_wuxi[(df_wuxi['COOH'] == 1) & (df_wuxi['BocN'] == 1) 
                         & (df_wuxi['NH'] == 0) & (df_wuxi['COOtBu'] == 0) 
                         & (df_wuxi['NH2'] == 0) & (df_wuxi['ArNH'] == 0) 
                         & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0)]


print(col_active(df_amid_extend))
print()
print(df_amid_extend)

['COOH', 'CHO', 'BocN', 'ArCl', 'ArF', 'FmocN', 'ArBr', 'OH', 'C=C', 'C≡C', 'COOMe', 'COOEt', 'ArI', 'ArNO2', 'RCOR', 'AllocN']

             MW             group          MFCD           CAS  \
13768  440.4235   COOH, CHO, BocN  WXCD00135449           NaN   
13936  271.6963  COOH, BocN, ArCl  MFCD02682156   136290-47-6   
13942  271.6963  COOH, BocN, ArCl  MFCD02682158   253677-29-1   
14062  299.7493  COOH, BocN, ArCl  MFCD03427921   500770-74-1   
14064  299.7493  COOH, BocN, ArCl  MFCD02090709   284493-66-9   
...         ...               ...           ...           ...   
21635  278.3030        COOH, BocN  MFCD18792222  1250999-71-3   
21636  321.3706   COOH, BocN, C=C    YYL-33-013           NaN   
21638  293.3176   COOH, BocN, C=C    YYL-33-008           NaN   
21640  285.3352        COOH, BocN  MFCD28396327           NaN   
21641  281.3069        COOH, BocN  MFCD28992185           NaN   

                                                  SMILES  solid_stock(mg)  \
13768  CC(C)(

In [10]:
### Final Capping (Acylation, Sulfonylation, Urea)

df_urea = df_wuxi[(df_wuxi['NCO'] == 1)] 
                  #& df_wuxi['NH2'] == 0) 
                  #   & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0) & (df_wuxi['NH'] == 0)  
                  #   & (df_wuxi['N3'] == 0) & (df_wuxi['CHO'] == 0)
                  #   & (df_wuxi['ArNH'] == 0) & (df_wuxi['COOtBu'] == 0)] 
#& (df_wuxi['BocN'] == 1) & (df_wuxi['NH'] == 0) 
                       #  & (df_wuxi['NH2'] == 0) & (df_wuxi['ArNH'] == 0) 
                       #  ]


print(col_active(df_urea))
print()
print(df_urea)

df_acyl = df_wuxi[(df_wuxi['COOH'] == 1) 
                  & (df_wuxi['NH2'] == 0) & (df_wuxi['NH'] == 0) 
                  & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0) & (df_wuxi['NH'] == 0)  
                  & (df_wuxi['N3'] == 0) & (df_wuxi['CHO'] == 0)
                  & (df_wuxi['ArNH'] == 0) & (df_wuxi['BF3K'] == 0) 
                  & (df_wuxi['SO2F'] == 0) & (df_wuxi['SO2Cl'] == 0)
                  & (df_wuxi['FmocN'] == 0) & (df_wuxi['C=C'] == 0)
                    ]


print(col_active(df_acyl))
print()
print(df_acyl)

df_sulf = df_wuxi[(df_wuxi['SO2Cl'] == 1) 
                  & (df_wuxi['NH2'] == 0) & (df_wuxi['NH'] == 0) 
                  & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0) & (df_wuxi['NH'] == 0)  
                  & (df_wuxi['N3'] == 0) & (df_wuxi['CHO'] == 0)
                  & (df_wuxi['ArNH'] == 0)
                  #& (df_wuxi['BocN'] == 1) 
                  & (df_wuxi['NH2'] == 0) #& (df_wuxi['ArNH'] == 0) 
                    ]


print(col_active(df_sulf))
print()
print(df_sulf)

['NCO', 'ArCl']

             MW                  group          MFCD          CAS  \
13970  188.0107        NCO, ArCl, ArCl  MFCD00002002    5392-82-5   
13972  221.5636              ArCl, NCO  MFCD00037029   50528-86-4   
14381  183.5915              ArCl, NCO  MFCD02093651   28395-76-8   
14398  221.5636              ArCl, NCO  MFCD00013874     327-78-6   
14588  167.5921              ArCl, NCO  MFCD00037036   40397-90-8   
14590  167.5921              ArCl, NCO  MFCD00019914   37408-18-7   
14592  222.4558  NCO, ArCl, ArCl, ArCl  MFCD00013845    2505-31-9   
14594  171.5561              ArCl, NCO  MFCD00037037   50529-33-4   
14595  171.5561              ArCl, NCO  MFCD03701597   69922-25-4   
14597  213.6174              ArCl, NCO  MFCD00013857   55440-55-6   
14605  221.5636              ArCl, NCO  MFCD00013873   16588-69-5   
14612  188.0107        NCO, ArCl, ArCl  MFCD00002003   39920-37-1   
14616  167.5921              ArCl, NCO  MFCD00037030   40398-01-4   
14618  167.5921  

In [11]:
### Library Size

del_size = len(df_suzuki_out)*len(df_suzuki_in) #len(df_amid_extend)**len(df_red_amin)
print(del_size)

18810


In [12]:
mol_disp = mol_random(df_red_amin, 'MFCD', 'SMILES', 1)
red_amin = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )



In [13]:
mol_disp = mol_random(df_suzuki_in, 'MFCD', 'SMILES', 150)
suzuki_in = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )



In [26]:
### Remove Duplicate between Boric Acid and Boric Pinacolate
#borac = Chem.MolFromSmarts('B(O)O')
bpin = Chem.MolFromSmarts('B1OC(C)(C)C(C)(C)O1')
boc = 'C(=O)C(C)(C)C'
#b1 = 'B(O)O'
#b2 = 'OB(O)'
s_temp = df_suzuki_out
s_temp['bpin'] = 0
s_temp['n_smi'] = ''

#print(s_temp)

for i in range(0, len(s_temp)):
    a = s_temp.iloc[i]['SMILES']
    a = remove_groups_in_smiles(a, boc)
    m = Chem.MolFromSmiles(a)
    b = m.GetSubstructMatches(bpin)
    d = s_temp.index[i]
    #print(i, a, b, len(b))
    if len(b) != 0:
        s_temp.loc[d,'bpin'] = 1
        m_new = remove_groups_in_smiles(a, 'B1OC(C)(C)C(C)(C)O1')
        #print(a, m_new)
        s_temp.loc[d,'n_smi'] = m_new
    else:
        m_new = remove_groups_in_smiles(a, 'B(O)O')
        #print(a, m_new)
        s_temp.loc[d,'n_smi'] = m_new

a = s_temp.sort_values(by=['n_smi','bpin'])
s_temp = a.drop_duplicates(subset='n_smi', keep="first")
#print(s_temp)
#print()
#print(len(s_temp))


### Now select Suzuki set

mol_disp = mol_random(s_temp, 'MFCD', 'SMILES', 100)
suzuki_out = mol_disp


mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [15]:
mol_disp = mol_random(df_amid_extend, 'MFCD', 'SMILES', 1)
amid_extend = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )



In [16]:
mol_disp = mol_random(df_urea, 'MFCD', 'SMILES', 50)
urea = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )

In [17]:
mol_disp = mol_random(df_acyl, 'MFCD', 'SMILES', 150)
acyl = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )

In [28]:
mol_disp = mol_random(df_sulf, 'MFCD', 'SMILES', 50)
sulf = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )

In [27]:
# Create a Pandas Excel writer using XlsxWriter as the engine.
xls_out = '../Wuxi_DEL.xlsx'

# Write each dataframe to a different worksheet.
with pd.ExcelWriter(xls_out) as writer:
    #red_amin.to_excel(writer, sheet_name='1_Reductive_Amination', index = False)
    suzuki_in.to_excel(writer, sheet_name='2_Reductive_Alkylation', index = False)
    suzuki_out.to_excel(writer, sheet_name='3_Suzuki', index = False)
    #amid_extend.to_excel(writer, sheet_name='4_Amidation', index = False)
    urea.to_excel(writer, sheet_name='4_Capping_Urea', index = False)
    acyl.to_excel(writer, sheet_name='4_Capping_Acyl', index = False)
    sulf.to_excel(writer, sheet_name='4_Capping_Sulfonyl', index = False)

# Close the Pandas Excel writer and output the Excel file.
#writer.save()

In [20]:
### Replacing an atom with a different functional group in SMILES
def replace_groups_in_smiles(smiles, group_fragment, fgroup_smiles, replace_with):
    core_mol = Chem.MolFromSmiles(smiles)
    pattern_mol = Chem.MolFromSmiles(group_fragment)
    replacement_mol = Chem.MolFromSmiles(replace_with)
    
    products = AllChem.ReplaceSubstructs(core_mol, pattern_mol, replacement_mol)
    replaced_smiles = Chem.MolToSmiles(products[0])

    return replaced_smiles
