In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.Draw import rdDepictor, rdMolDraw2D
import mols2grid
import random

opts = Draw.DrawingOptions()

In [2]:
##### Function Definitions #####

### Determine Active Tags In Group Coulumn (WuXi Specific)
### Input: Data frame
### Output: List of "reactive groups"
def col_active(dfa):
    col = []
    for i in range(0, len(dfa)):
        a = dfa.iloc[i]['group']
        b = a.replace(" ", "")
        c = b.split(',')
        #print(a, b, c)
        for j in c:
            if j not in col:
                col.append(j)
        #print(i,col)
    return col

### Select Random Subset of Molecules
### Input: Data Frame with at least one identifier and SMILES, number of molecules to be picked
### Output: Data frame with Identifier, SMILES, and selected molecules
def mol_random(df, mid, smiles, n):
    m = pd.DataFrame(df, columns =[mid, smiles])
    if n > len(m):
        n = len(m)
    a = random.sample(range(0, len(m)), n)
    moldf = m.iloc[a]
    #print(moldf)
    return(moldf)

### Select Random Subset of Molecules
### Input: Data Frame with at least one identifier and SMILES, number of molecules to be picked, and a list of preselects
### Output: Data frame with Identifier, SMILES, and selected molecules
def mol_random_preset(df, mid, smiles, selnum, fixlist):
    b = pd.DataFrame(df, columns =[mid, smiles])
    b['Preselect'] = 0
    c = b[b[mid].isin(fixlist) == True]
    m_in = c.drop_duplicates(subset= mid, keep="first")
    m_in['Preselect'] =1
    m = b[b[mid].isin(fixlist) == False]
    n = selnum - len(m_in)
    print(selnum, n, len(b), len(fixlist), len(m_in), len(m))
    if n > len(m):
        n = len(m)
    a = random.sample(range(0, len(m)), n)
    m_sel = m.iloc[a]
    moldf = pd.concat([m_in, m_sel])
    #print(moldf)
    return(moldf)

### Remove a functional group in SMILES
### Input: SMILES of molecule and SMILES of group to be removed
### Output: SMILES of molecule with group removed
def remove_groups_in_smiles(smiles, fragment):
    core_mol = Chem.MolFromSmiles(smiles)
    pattern_mol = Chem.MolFromSmiles(fragment)  
    products = Chem.DeleteSubstructs(core_mol, pattern_mol)
    replaced_smiles = Chem.CanonSmiles(Chem.MolToSmiles(products))

    return replaced_smiles



### Replacing an atom with a different functional group in SMILES
def replace_groups_in_smiles_HP(smiles, fragment, replace_with):
    core_mol = Chem.MolFromSmiles(smiles)
    pattern_mol = Chem.MolFromSmiles(fragment)
    replacement_mol = Chem.MolFromSmiles(replace_with)
    
    products = AllChem.ReplaceSubstructs(core_mol, pattern_mol, replacement_mol)
    replaced_smiles = Chem.MolToSmiles(products[0])

    return replaced_smiles

### Replacing an atom with a different functional group in SMILES
def replace_groups_in_smiles(smiles, group_fragment, fgroup_smiles, replace_with):
    core_mol = Chem.MolFromSmiles(smiles)
    pattern_mol = Chem.MolFromSmiles(group_fragment)
    replacement_mol = Chem.MolFromSmiles(replace_with)
    
    products = AllChem.ReplaceSubstructs(core_mol, pattern_mol, replacement_mol)
    replaced_smiles = Chem.MolToSmiles(products[0])

    return replaced_smiles

### Establish Keep Removal List

def wuxi_validation(df, cut, sel_field, id_field):
    keep_list = []
    remo_list = []
    for i in range(0, len(df)):
        a = df.loc[i][sel_field]
        #print(i, a)
        #print('Test')
        if df.iloc[i][sel_field] == '-':
            keep_list.append(df.iloc[i][id_field])
        elif df.iloc[i][sel_field] < cut:
            remo_list.append(df.iloc[i][id_field])
        else:
            keep_list.append(df.iloc[i][id_field])
    #print(remo_list, keep_list)
    return remo_list, keep_list

In [3]:
### Read Wuxi Validation List and Establish Removal / Keep Lists
num_sheets = 5
sheets = dict()

for i in range(num_sheets):
    sheets[i] = pd.read_excel('../Wuxi_DEL_1_v2 with validation yield.xlsx', sheet_name = i)
    
val_alkyl = sheets[0]
val_alkyl['React'] = 'Alkylation'
val_alkyl = val_alkyl[0:150]
#print(val_alkyl)
alk_rem, alk_keep = wuxi_validation(val_alkyl, 80, 'Yield %','MFCD')
print(alk_rem)
print(alk_keep)
print()

val_suzuki = sheets[1]
val_suzuki['React'] = 'Suzuki'
#print(val_suzuki)
suz_rem, suz_keep = wuxi_validation(val_suzuki, 80, 'Yield %','MFCD')
print(suz_rem)
print(suz_keep)
print()

val_urea = sheets[2]
val_urea['React'] = 'Capping'
#print(val_urea)
urea_rem, urea_keep = wuxi_validation(val_urea, 80, 'Yield %','MFCD')
print(urea_rem)
print(urea_keep)
print()

val_acid = sheets[3]
val_acid['React'] = 'Capping'
#print(val_acid)
acid_rem, acid_keep = wuxi_validation(val_acid, 80, 'Yield %','MFCD')
print(acid_rem)
print(acid_keep)
print()

val_sulf = sheets[4]
val_sulf['React'] = 'Capping'
#print(val_sulf)
sulf_rem, sulf_keep = wuxi_validation(val_sulf, 80, 'Yield %','MFCD')
print(sulf_rem)
print(sulf_keep)
print()

#df_wuxi_val = pd.concat([val_alkyl, val_suzuki,val_urea, val_acid, val_sulf])
#df_wuxi_val.to_csv('../Wuxi_DEL_1_v2 with validation yield.csv', index=False)


['MFCD30183460', 'MFCD11110636', 'MFCD11520666', 'MFCD27975543', 'MFCD21642010', 'MFCD04116319', 'MFCD00204031', 'MFCD08059506', 'MFCD11226919', 'MFCD09258953', 'MFCD00266792', 'MFCD19105357', 'MFCD06739238', 'WXCD00132229', 'MFCD16659662', 'MFCD13190304', 'MFCD16606522', 'MFCD09118354', 'MFCD25342423', 'WXCD00135490', 'MFCD19441928', 'MFCD09834665', 'MFCD18410725', 'MFCD16036864', 'MFCD09040530', 'MFCD09995135', 'MFCD12828175', 'MFCD09835094', 'MFCD19441929', 'MFCD00046154', 'MFCD05979167', 'MFCD06656559', 'MFCD09056792', 'MFCD09056792', 'MFCD09261041', 'MFCD00016599', 'MFCD02261728', 'MFCD13189844', 'MFCD08689713', 'MFCD08689713', 'MFCD30297264', 'MFCD02261727', 'MFCD18254323', 'MFCD12406886', 'MFCD13181606', 'MFCD22565644', 'MFCD09027520', 'MFCD30296953', 'MFCD06824354', 'MFCD11180106', 'MFCD11520882', 'WXCD00131421', 'MFCD00010863', 'MFCD06738308', 'MFCD00005432', 'MFCD03701029', 'MFCD20923630', 'MFCD10758067', 'MFCD01859870', 'MFCD26516940', 'MFCD25459888', 'MFCD24253734', 'MFCD18

In [4]:
### Read Wuxi Buildimg Block EXCEL (all sheets)
### Drop Duplicate Rows in Data Frame
### Removal List

rem_list = ['MFCD00009801']

df_wuxi_raw = pd.concat(pd.read_excel('../Available building blocks.xlsx', sheet_name = None), ignore_index = True)

#print(df_wuxi_raw)

a = df_wuxi_raw[df_wuxi_raw['solid_stock(mg)'] >= 100]
#drop rows that contain any removal items
b = a[a['MFCD'].isin(rem_list) == False]

df_wuxi = b.drop_duplicates()

print(df_wuxi)

             MW            group          MFCD                    CAS  \
0      200.0753        ArBr, NH2  MFCD00008189             73918-56-6   
1      113.1209              NH2  MFCD28118594            107269-68-1   
2      144.2141              NH2  MFCD00006184               123-00-2   
3      125.1712              NH2  MFCD00009819              5036-48-6   
4      113.2001              NH2  MFCD00001495  33483-65-7, 6321-23-9   
...         ...              ...           ...                    ...   
21636  321.3706  COOH, BocN, C=C    YYL-33-013                    NaN   
21638  293.3176  COOH, BocN, C=C    YYL-33-008                    NaN   
21639  265.3075        CHO, BocN  MFCD18792286           1251000-44-8   
21640  285.3352       COOH, BocN  MFCD28396327                    NaN   
21641  281.3069       COOH, BocN  MFCD28992185                    NaN   

                                                  SMILES  solid_stock(mg)  
0                                     NCCC1=CC=

In [5]:
### Expand Reagent List to Columns

col_wuxi = col_active(df_wuxi)
print(col_wuxi)

for i in col_wuxi:
    df_wuxi[i] = 0

print(df_wuxi)

['ArBr', 'NH2', 'BocN', 'COOMe', 'OH', 'COOH', 'FmocN', 'C≡C', 'ArCl', 'C=C', 'X', 'CbzN', 'COOEt', 'COOtBu', 'B(OH)2', 'ArNO2', 'NH', 'RCOR', 'CHO', 'ArI', 'ArF', 'NO2', 'ArNH', 'ArOH', 'CN', 'Bpin', 'COOiPr', 'B(OCR)2', 'NH2COOH', 'BF3K', 'ArNH2', 'SO2Cl', 'N3', 'SO2F', 'CONH', 'SO2Na', 'CF3CH2O', 'NCO', 'BrCH2CO', 'AllocN']
             MW            group          MFCD                    CAS  \
0      200.0753        ArBr, NH2  MFCD00008189             73918-56-6   
1      113.1209              NH2  MFCD28118594            107269-68-1   
2      144.2141              NH2  MFCD00006184               123-00-2   
3      125.1712              NH2  MFCD00009819              5036-48-6   
4      113.2001              NH2  MFCD00001495  33483-65-7, 6321-23-9   
...         ...              ...           ...                    ...   
21636  321.3706  COOH, BocN, C=C    YYL-33-013                    NaN   
21638  293.3176  COOH, BocN, C=C    YYL-33-008                    NaN   
21639  265.307

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [6]:
### Enter Reactive Groups into Column matrix

for i in range(0, len(df_wuxi)):
    a = df_wuxi.iloc[i]['group']
    b = a.replace(" ", "")
    c = b.split(',')
    d = df_wuxi.index[i]
    #print(d, c)
    for j in c:
        #print(d, j)
        df_wuxi.loc[d,j] = 1
    #print(df_wuxi.iloc[i])

print(df_wuxi)

             MW            group          MFCD                    CAS  \
0      200.0753        ArBr, NH2  MFCD00008189             73918-56-6   
1      113.1209              NH2  MFCD28118594            107269-68-1   
2      144.2141              NH2  MFCD00006184               123-00-2   
3      125.1712              NH2  MFCD00009819              5036-48-6   
4      113.2001              NH2  MFCD00001495  33483-65-7, 6321-23-9   
...         ...              ...           ...                    ...   
21636  321.3706  COOH, BocN, C=C    YYL-33-013                    NaN   
21638  293.3176  COOH, BocN, C=C    YYL-33-008                    NaN   
21639  265.3075        CHO, BocN  MFCD18792286           1251000-44-8   
21640  285.3352       COOH, BocN  MFCD28396327                    NaN   
21641  281.3069       COOH, BocN  MFCD28992185                    NaN   

                                                  SMILES  solid_stock(mg)  \
0                                     NCCC1=CC

In [7]:
### Reductive Alkylation on Resin

a = df_wuxi[(df_wuxi['NH2'] == 1) 
                     & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0) & (df_wuxi['NH'] == 0)  
                     & (df_wuxi['N3'] == 0) & (df_wuxi['CHO'] == 0) & (df_wuxi['COOH'] == 0) & (df_wuxi['FmocN'] == 0)
                     & (df_wuxi['ArNH'] == 0) & (df_wuxi['COOtBu'] == 0)
                     & (df_wuxi['OH'] == 0)] 
#& (df_wuxi['BocN'] == 1) & (df_wuxi['NH'] == 0) 
                       #  & (df_wuxi['NH2'] == 0) & (df_wuxi['ArNH'] == 0) 
                       #  ]

#remove non-validated
df_red_amin = a[a['MFCD'].isin(alk_rem) == False]
        
print(col_active(df_red_amin))
print()
print(a)
print(df_red_amin)

['ArBr', 'NH2', 'BocN', 'COOMe', 'C≡C', 'ArCl', 'C=C', 'X', 'CbzN', 'COOEt', 'ArNO2', 'RCOR', 'ArI', 'ArF', 'NO2', 'ArOH', 'CN', 'COOiPr']

            MW      group          MFCD                    CAS  \
0     200.0753  ArBr, NH2  MFCD00008189             73918-56-6   
1     113.1209        NH2  MFCD28118594            107269-68-1   
2     144.2141        NH2  MFCD00006184               123-00-2   
3     125.1712        NH2  MFCD00009819              5036-48-6   
4     113.2001        NH2  MFCD00001495  33483-65-7, 6321-23-9   
...        ...        ...           ...                    ...   
1950  179.2152        NH2  MFCD26097023           1439900-28-3   
1951  167.2227        NH2  MFCD12757052            321840-52-2   
1952  153.2242        NH2  MFCD20231586           1334492-60-2   
1953  266.1333  ArBr, NH2  MFCD12824528                      0   
1954  122.1673        NH2  MFCD00006367              2706-56-1   

                        SMILES  solid_stock(mg)  ArBr  NH2  BocN  C

In [8]:
### Suzuki Attached to Resin

a = df_wuxi[(df_wuxi['CHO'] == 1) & (df_wuxi['ArBr'] == 1) 
                       & (df_wuxi['COOH'] == 0)]

#remove non-validated
df_suzuki_in = a[a['MFCD'].isin(alk_rem) == False]

print(col_active(df_suzuki_in))
print()
#print(a)
print(df_suzuki_in)

['CHO', 'ArBr', 'ArNO2', 'OH', 'COOMe', 'ArCl', 'ArF', 'COOEt', 'NH2', 'ArNH', 'BocN']

             MW                  group          MFCD           CAS  \
11272  220.9988              CHO, ArBr  MFCD08282773    57329-38-1   
11273  203.0083              CHO, ArBr  MFCD00070755  1306606-90-5   
11277  235.0764              CHO, ArBr  MFCD11101026   138505-25-6   
11296  263.9139        CHO, ArBr, ArBr  MFCD00156887  1018828-16-4   
11308  230.0154       CHO, ArBr, ArNO2  MFCD00462865    59142-68-6   
...         ...                    ...           ...           ...   
15190  237.4534   CHO, ArBr, ArCl, ArF  MFCD13185877  1214386-29-4   
15650  219.4629        CHO, ArBr, ArCl  MFCD12024585  1197050-28-4   
17555  248.0059  CHO, ArBr, ArNO2, ArF  MFCD28739866   213382-40-2   
21353  381.2238        CHO, BocN, ArBr  WXCD00132230           NaN   
21363  381.2634        CHO, BocN, ArBr  WXCD00131861           NaN   

                                                  SMILES  solid_stock(m

In [9]:
### Suzuki into Resin

a = df_wuxi[((df_wuxi['B(OH)2'] == 1) | (df_wuxi['Bpin'] == 1)) 
                        & ((df_wuxi['ArNH2'] == 1) |  (df_wuxi['NH2'] == 1) | (df_wuxi['BocN'] == 1) | (df_wuxi['NH'] == 1))
                        & (df_wuxi['COOH'] == 0) & (df_wuxi['ArBr'] == 0) 
                        & (df_wuxi['C=C'] == 0)]


#remove non-validated
df_suzuki_out = a[a['MFCD'].isin(suz_rem) == False]

print(col_active(df_suzuki_out))
print()
print(df_suzuki_out)

['Bpin', 'NH2', 'B(OH)2', 'BocN', 'ArNH2']

             MW          group          MFCD           CAS  \
1007   233.1136      Bpin, NH2  MFCD02179455   850568-55-7   
19108  306.1642   B(OH)2, BocN  MFCD10696664   937048-39-0   
19110  306.1642   B(OH)2, BocN  MFCD10696663   457613-78-4   
19129  334.1743   BocN, B(OH)2  MFCD12025996  1150114-76-3   
19132  306.1642   B(OH)2, BocN  MFCD09753310   915770-01-3   
19139  257.1137   BocN, B(OH)2  MFCD11504845  1072951-39-3   
19407  251.0859   B(OH)2, BocN  MFCD04115637   489446-42-6   
19457  321.1788   B(OH)2, BocN  MFCD23701644  1379476-75-1   
19468  295.1416   B(OH)2, BocN  MFCD12913942  1190875-39-8   
19470  251.0859   B(OH)2, BocN  MFCD05663974   433969-27-8   
19476  137.9320  ArNH2, B(OH)2  MFCD23703516  1204112-62-8   
19673  335.2020   B(OH)2, BocN  MFCD28400568  1224449-13-1   
19825  439.3541     Bpin, BocN  MFCD28386938  1256387-87-7   
19851  402.3339     BocN, Bpin  MFCD16294502   936694-19-8   
19858  416.3175     BocN, 

In [10]:
### Amino Acid Extension After Suzuki

df_amid_extend = df_wuxi[(df_wuxi['COOH'] == 1) & (df_wuxi['BocN'] == 1) 
                         & (df_wuxi['NH'] == 0) & (df_wuxi['COOtBu'] == 0) 
                         & (df_wuxi['NH2'] == 0) & (df_wuxi['ArNH'] == 0) 
                         & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0)]


print(col_active(df_amid_extend))
print()
print(df_amid_extend)

['COOH', 'CHO', 'BocN', 'ArCl', 'ArF', 'FmocN', 'ArBr', 'OH', 'C=C', 'C≡C', 'COOMe', 'COOEt', 'ArI', 'ArNO2', 'RCOR', 'AllocN']

             MW             group          MFCD           CAS  \
13768  440.4235   COOH, CHO, BocN  WXCD00135449           NaN   
13936  271.6963  COOH, BocN, ArCl  MFCD02682156   136290-47-6   
13942  271.6963  COOH, BocN, ArCl  MFCD02682158   253677-29-1   
14062  299.7493  COOH, BocN, ArCl  MFCD03427921   500770-74-1   
14064  299.7493  COOH, BocN, ArCl  MFCD02090709   284493-66-9   
...         ...               ...           ...           ...   
21635  278.3030        COOH, BocN  MFCD18792222  1250999-71-3   
21636  321.3706   COOH, BocN, C=C    YYL-33-013           NaN   
21638  293.3176   COOH, BocN, C=C    YYL-33-008           NaN   
21640  285.3352        COOH, BocN  MFCD28396327           NaN   
21641  281.3069        COOH, BocN  MFCD28992185           NaN   

                                                  SMILES  solid_stock(mg)  \
13768  CC(C)(

In [11]:
### Final Capping (Acylation, Sulfonylation, Urea)

a = df_wuxi[(df_wuxi['NCO'] == 1)] 
                  #& df_wuxi['NH2'] == 0) 
                  #   & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0) & (df_wuxi['NH'] == 0)  
                  #   & (df_wuxi['N3'] == 0) & (df_wuxi['CHO'] == 0)
                  #   & (df_wuxi['ArNH'] == 0) & (df_wuxi['COOtBu'] == 0)] 
#& (df_wuxi['BocN'] == 1) & (df_wuxi['NH'] == 0) 
                       #  & (df_wuxi['NH2'] == 0) & (df_wuxi['ArNH'] == 0) 
                       #  ]

#remove non-validated
df_urea = a[a['MFCD'].isin(urea_rem) == False]

print(col_active(df_urea))
print()
print(df_urea)

a = df_wuxi[(df_wuxi['COOH'] == 1) 
                  & (df_wuxi['NH2'] == 0) & (df_wuxi['NH'] == 0) 
                  & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0) & (df_wuxi['NH'] == 0)  
                  & (df_wuxi['N3'] == 0) & (df_wuxi['CHO'] == 0)
                  & (df_wuxi['ArNH'] == 0) & (df_wuxi['BF3K'] == 0) 
                  & (df_wuxi['SO2F'] == 0) & (df_wuxi['SO2Cl'] == 0)
                  & (df_wuxi['FmocN'] == 0) & (df_wuxi['C=C'] == 0)]

#remove non-validated
df_acyl = a[a['MFCD'].isin(acid_rem) == False]

print(col_active(df_acyl))
print()
print(df_acyl)

a = df_wuxi[(df_wuxi['SO2Cl'] == 1) 
                  & (df_wuxi['NH2'] == 0) & (df_wuxi['NH'] == 0) 
                  & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0) & (df_wuxi['NH'] == 0)  
                  & (df_wuxi['N3'] == 0) & (df_wuxi['CHO'] == 0)
                  & (df_wuxi['ArNH'] == 0)
                  #& (df_wuxi['BocN'] == 1) 
                  & (df_wuxi['NH2'] == 0) #& (df_wuxi['ArNH'] == 0) 
                    ]
#remove non-validated
df_sulf = a[a['MFCD'].isin(sulf_rem) == False]


print(col_active(df_sulf))
print()
print(df_sulf)

['NCO', 'ArCl']

             MW                  group          MFCD         CAS  \
13970  188.0107        NCO, ArCl, ArCl  MFCD00002002   5392-82-5   
14588  167.5921              ArCl, NCO  MFCD00037036  40397-90-8   
14590  167.5921              ArCl, NCO  MFCD00019914  37408-18-7   
14592  222.4558  NCO, ArCl, ArCl, ArCl  MFCD00013845   2505-31-9   
14595  171.5561              ArCl, NCO  MFCD03701597  69922-25-4   
14597  213.6174              ArCl, NCO  MFCD00013857  55440-55-6   
14612  188.0107        NCO, ArCl, ArCl  MFCD00002003  39920-37-1   
14616  167.5921              ArCl, NCO  MFCD00037030  40398-01-4   
14618  167.5921              ArCl, NCO  MFCD01863693  40398-03-6   

                          SMILES  solid_stock(mg)  ArBr  NH2  BocN  COOMe  \
13970      O=C=Nc1c(Cl)ccc(Cl)c1          4998.74     0    0     0      0   
14588       O=C=Nc1c(C)c(Cl)ccc1           998.68     0    0     0      0   
14590       O=C=Nc1c(C)cc(Cl)cc1           998.63     0    0     0     

In [12]:
### Library Size

del_size = len(df_suzuki_out)*len(df_suzuki_in) *(len(df_acyl)+len(df_urea)+len(df_sulf))
print(del_size)

49300650


In [13]:
mol_disp = mol_random(df_red_amin, 'MFCD', 'SMILES', 1)
red_amin = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )



In [14]:
mol_disp = mol_random_preset(df_suzuki_in, 'MFCD', 'SMILES', 150, alk_keep)
suzuki_in = mol_disp

#print(len(alk_keep), alk_keep)

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )



150 105 225 48 45 175


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [15]:
### Remove Duplicate between Boric Acid and Boric Pinacolate
#borac = Chem.MolFromSmarts('B(O)O')
bpin = Chem.MolFromSmarts('B1OC(C)(C)C(C)(C)O1')
boc = 'C(=O)C(C)(C)C'
#b1 = 'B(O)O'
#b2 = 'OB(O)'
s_temp = df_suzuki_out
s_temp['bpin'] = 0
s_temp['n_smi'] = ''

#print(s_temp)

for i in range(0, len(s_temp)):
    a = s_temp.iloc[i]['SMILES']
    a = remove_groups_in_smiles(a, boc)
    m = Chem.MolFromSmiles(a)
    b = m.GetSubstructMatches(bpin)
    d = s_temp.index[i]
    #print(i, a, b, len(b))
    if len(b) != 0:
        s_temp.loc[d,'bpin'] = 1
        m_new = remove_groups_in_smiles(a, 'B1OC(C)(C)C(C)(C)O1')
        #print(a, m_new)
        s_temp.loc[d,'n_smi'] = m_new
    else:
        m_new = remove_groups_in_smiles(a, 'B(O)O')
        #print(a, m_new)
        s_temp.loc[d,'n_smi'] = m_new

a = s_temp.sort_values(by=['n_smi','bpin'])
s_temp = a.drop_duplicates(subset='n_smi', keep="first")
#print(s_temp)
#print()
#print(len(s_temp))


### Now select Suzuki set

mol_disp = mol_random_preset(s_temp, 'MFCD', 'SMILES', 100, suz_keep)
suzuki_out = mol_disp


mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )



100 80 26 20 20 6


In [16]:
mol_disp = mol_random(df_amid_extend, 'MFCD', 'SMILES', 1)
amid_extend = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )



In [17]:
mol_disp = mol_random_preset(df_urea, 'MFCD', 'SMILES', 50, urea_keep)
urea = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )

50 41 9 9 9 0


In [18]:
mol_disp = mol_random_preset(df_acyl, 'MFCD', 'SMILES', 150, acid_keep)
acyl = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )

150 77 5583 73 73 5510


In [19]:
mol_disp = mol_random_preset(df_sulf, 'MFCD', 'SMILES', 50, sulf_keep)
sulf = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )

50 6 330 44 44 286


In [20]:
# Create a Pandas Excel writer using XlsxWriter as the engine.
xls_out = '../Wuxi_DEL.xlsx'

# Write each dataframe to a different worksheet.
with pd.ExcelWriter(xls_out) as writer:
    #red_amin.to_excel(writer, sheet_name='1_Reductive_Amination', index = False)
    suzuki_in.to_excel(writer, sheet_name='2_Reductive_Alkylation', index = False)
    suzuki_out.to_excel(writer, sheet_name='3_Suzuki', index = False)
    #amid_extend.to_excel(writer, sheet_name='4_Amidation', index = False)
    urea.to_excel(writer, sheet_name='4_Capping_Urea', index = False)
    acyl.to_excel(writer, sheet_name='4_Capping_Acyl', index = False)
    sulf.to_excel(writer, sheet_name='4_Capping_Sulfonyl', index = False)

# Close the Pandas Excel writer and output the Excel file.
#writer.save()

In [21]:
#df_wuxi_val = pd.concat(pd.read_excel('../Wuxi_DEL_1_v2 with validation yield.xlsx', sheet_name = None), ignore_index = True)
#print(df_wuxi_val)
#df_wuxi_val.to_csv('../Wuxi_DEL_1_v2 with validation yield.csv', index=False)
#a = df_wuxi_val[df_wuxi_val['Yield %'] < 80]
#print(a)

num_sheets = 5
sheets = dict()

for i in range(num_sheets):
    sheets[i] = pd.read_excel('../Wuxi_DEL_1_v2 with validation yield.xlsx', sheet_name = i)
    
val_alkyl = sheets[0]
val_alkyl['React'] = 'Alkylation'
val_alkyl = df_alkyl[0:150]
#print(df_alkyl)
val_suzuki = sheets[1]
val_suzuki['React'] = 'Suzuki'
#print(val_suzuki)
val_urea = sheets[2]
val_urea['React'] = 'Capping'
#print(val_urea)
val_acid = sheets[3]
val_acid['React'] = 'Capping'
val_sulf = sheets[4]
val_sulf['React'] = 'Capping'
#val_suzuki = df_alkyl[0:150]
#print(val_sulf)

#df_wuxi_val = pd.concat([val_alkyl, val_suzuki,val_urea, val_acid, val_sulf])
#df_wuxi_val.to_csv('../Wuxi_DEL_1_v2 with validation yield.csv', index=False)


NameError: name 'df_alkyl' is not defined

In [None]:
### Replacing an atom with a different functional group in SMILES
def replace_groups_in_smiles(smiles, group_fragment, fgroup_smiles, replace_with):
    core_mol = Chem.MolFromSmiles(smiles)
    pattern_mol = Chem.MolFromSmiles(group_fragment)
    replacement_mol = Chem.MolFromSmiles(replace_with)
    
    products = AllChem.ReplaceSubstructs(core_mol, pattern_mol, replacement_mol)
    replaced_smiles = Chem.MolToSmiles(products[0])

    return replaced_smiles
