In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.Draw import rdDepictor, rdMolDraw2D
import mols2grid
import random

opts = Draw.DrawingOptions()

In [2]:
##### Function Definitions #####

### Determine Active Tags In Group Coulumn (WuXi Specific)
### Input: Data frame
### Output: List of "reactive groups"
def col_active(dfa):
    col = []
    for i in range(0, len(dfa)):
        a = dfa.iloc[i]['group']
        b = a.replace(" ", "")
        c = b.split(',')
        #print(a, b, c)
        for j in c:
            if j not in col:
                col.append(j)
        #print(i,col)
    return col

### Select Random Subset of Molecules
### Input: Data Frame with at least one identifier and SMILES, number of molecules to be picked
### Output: Data frame with Identifier, SMILES, and selected molecules
def mol_random(df, mid, smiles, n):
    m = pd.DataFrame(df, columns =[mid, smiles])
    if n > len(m):
        n = len(m)
    a = random.sample(range(0, len(m)), n)
    moldf = m.iloc[a]
    #print(moldf)
    return(moldf)


In [3]:
### Read Wuxi Buildimg Block EXCEL (all sheets)
### Drop Duplicate Rows in Data Frame
### Removal List

rem_list = ['MFCD00009801']

df_wuxi_raw = pd.concat(pd.read_excel('../Available building blocks.xlsx', sheet_name = None), ignore_index = True)

#print(df_wuxi_raw)

a = df_wuxi_raw[df_wuxi_raw['solid_stock(mg)'] >= 50]
#drop rows that contain any removal items
b = a[a['MFCD'].isin(rem_list) == False]

df_wuxi = b.drop_duplicates()

print(df_wuxi)

             MW            group          MFCD                    CAS  \
0      200.0753        ArBr, NH2  MFCD00008189             73918-56-6   
1      113.1209              NH2  MFCD28118594            107269-68-1   
2      144.2141              NH2  MFCD00006184               123-00-2   
3      125.1712              NH2  MFCD00009819              5036-48-6   
4      113.2001              NH2  MFCD00001495  33483-65-7, 6321-23-9   
...         ...              ...           ...                    ...   
21754  246.2596       COOH, BocN  MFCD00065571             13726-85-7   
21755  346.4197       COOH, BocN  MFCD24369442           1823807-74-4   
21756  265.3042       COOH, BocN  MFCD02090695             14676-01-8   
21757  270.3239  COOH, BocN, C=C  MFCD07369763            870703-78-9   
21759  348.4784      ArNH2, BocN  MFCD26407295           1032903-63-1   

                                                  SMILES  solid_stock(mg)  
0                                     NCCC1=CC=

In [4]:
### Expand Reagent List to Columns

col_wuxi = col_active(df_wuxi)
print(col_wuxi)

for i in col_wuxi:
    df_wuxi[i] = 0

print(df_wuxi)

['ArBr', 'NH2', 'BocN', 'COOMe', 'OH', 'COOH', 'FmocN', 'C≡C', 'ArCl', 'C=C', 'X', 'CbzN', 'COOEt', 'COOtBu', 'B(OH)2', 'ArNO2', 'NH', 'RCOR', 'CHO', 'ArI', 'ArF', 'NO2', 'ArNH', 'ArOH', 'CN', 'Bpin', 'COOiPr', 'N3', 'B(OCR)2', 'NH2COOH', 'BF3K', 'ArNH2', 'AllocN', 'SO2Cl', 'SO2F', 'CONH', 'SO2Na', 'CF3CH2O', 'NCO', 'BrCH2CO']
             MW            group          MFCD                    CAS  \
0      200.0753        ArBr, NH2  MFCD00008189             73918-56-6   
1      113.1209              NH2  MFCD28118594            107269-68-1   
2      144.2141              NH2  MFCD00006184               123-00-2   
3      125.1712              NH2  MFCD00009819              5036-48-6   
4      113.2001              NH2  MFCD00001495  33483-65-7, 6321-23-9   
...         ...              ...           ...                    ...   
21754  246.2596       COOH, BocN  MFCD00065571             13726-85-7   
21755  346.4197       COOH, BocN  MFCD24369442           1823807-74-4   
21756  265.304

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [5]:
### Enter Reactive Groups into Column matrix

for i in range(0, len(df_wuxi)):
    a = df_wuxi.iloc[i]['group']
    b = a.replace(" ", "")
    c = b.split(',')
    d = df_wuxi.index[i]
    #print(d, c)
    for j in c:
        #print(d, j)
        df_wuxi.loc[d,j] = 1
    #print(df_wuxi.iloc[i])

print(df_wuxi)

             MW            group          MFCD                    CAS  \
0      200.0753        ArBr, NH2  MFCD00008189             73918-56-6   
1      113.1209              NH2  MFCD28118594            107269-68-1   
2      144.2141              NH2  MFCD00006184               123-00-2   
3      125.1712              NH2  MFCD00009819              5036-48-6   
4      113.2001              NH2  MFCD00001495  33483-65-7, 6321-23-9   
...         ...              ...           ...                    ...   
21754  246.2596       COOH, BocN  MFCD00065571             13726-85-7   
21755  346.4197       COOH, BocN  MFCD24369442           1823807-74-4   
21756  265.3042       COOH, BocN  MFCD02090695             14676-01-8   
21757  270.3239  COOH, BocN, C=C  MFCD07369763            870703-78-9   
21759  348.4784      ArNH2, BocN  MFCD26407295           1032903-63-1   

                                                  SMILES  solid_stock(mg)  \
0                                     NCCC1=CC

In [6]:
### Reductive Alkylation on Resin

df_red_amin = df_wuxi[(df_wuxi['NH2'] == 1) 
                     & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0) & (df_wuxi['NH'] == 0)  
                     & (df_wuxi['N3'] == 0) & (df_wuxi['CHO'] == 0) & (df_wuxi['COOH'] == 0) & (df_wuxi['FmocN'] == 0)
                     & (df_wuxi['ArNH'] == 0) & (df_wuxi['COOtBu'] == 0)
                     & (df_wuxi['OH'] == 0)] 
#& (df_wuxi['BocN'] == 1) & (df_wuxi['NH'] == 0) 
                       #  & (df_wuxi['NH2'] == 0) & (df_wuxi['ArNH'] == 0) 
                       #  ]


print(col_active(df_red_amin))
print()
print(df_red_amin)

['ArBr', 'NH2', 'BocN', 'COOMe', 'C≡C', 'ArCl', 'C=C', 'X', 'CbzN', 'COOEt', 'ArNO2', 'RCOR', 'ArI', 'ArF', 'NO2', 'ArOH', 'CN', 'COOiPr']

            MW      group          MFCD                    CAS  \
0     200.0753  ArBr, NH2  MFCD00008189             73918-56-6   
1     113.1209        NH2  MFCD28118594            107269-68-1   
2     144.2141        NH2  MFCD00006184               123-00-2   
3     125.1712        NH2  MFCD00009819              5036-48-6   
4     113.2001        NH2  MFCD00001495  33483-65-7, 6321-23-9   
...        ...        ...           ...                    ...   
2200  236.3093        NH2  MFCD17927944           1282809-97-5   
2201  268.2752  NH2, BocN  MFCD11848049           1260901-24-3   
2202  114.1453        NH2  MFCD06658308            120436-02-4   
2204  183.3326        NH2  MFCD00003718              1502-03-0   
2205  242.3138  NH2, BocN  MFCD18793344           1273565-93-7   

                                             SMILES  solid_stock(mg

In [7]:
### Suzuki Attached to Resin

df_suzuki_in = df_wuxi[(df_wuxi['CHO'] == 1) & (df_wuxi['ArBr'] == 1) 
                       & (df_wuxi['COOH'] == 0)]

print(col_active(df_suzuki_in))
print()
print(df_suzuki_in)

['CHO', 'ArBr', 'ArNO2', 'OH', 'ArNH2', 'ArCl', 'ArF', 'COOMe', 'COOEt', 'BocN']

             MW                  group          MFCD           CAS  \
11267  203.0083              CHO, ArBr  MFCD00143261      100-52-7   
11272  220.9988              CHO, ArBr  MFCD08282773    57329-38-1   
11273  203.0083              CHO, ArBr  MFCD00070755  1306606-90-5   
11277  235.0764              CHO, ArBr  MFCD11101026   138505-25-6   
11296  263.9139        CHO, ArBr, ArBr  MFCD00156887  1018828-16-4   
...         ...                    ...           ...           ...   
15650  219.4629        CHO, ArBr, ArCl  MFCD12024585  1197050-28-4   
17318  248.0059  CHO, ArBr, ArNO2, ArF  MFCD10758067   679839-39-5   
17555  248.0059  CHO, ArBr, ArNO2, ArF  MFCD28739866   213382-40-2   
21353  381.2238        CHO, BocN, ArBr  WXCD00132230           NaN   
21363  381.2634        CHO, BocN, ArBr  WXCD00131861           NaN   

                                                  SMILES  solid_stock(mg)  \


In [8]:
### Suzuki into Resin

df_suzuki_out = df_wuxi[(df_wuxi['B(OH)2'] == 1) & (df_wuxi['BocN'] == 1) 
                        & (df_wuxi['COOH'] == 0) & (df_wuxi['ArBr'] == 0) 
                        & (df_wuxi['C=C'] == 0)]

print(col_active(df_suzuki_out))
print()
print(df_suzuki_out)

['B(OH)2', 'BocN']

             MW         group          MFCD           CAS  \
19108  306.1642  B(OH)2, BocN  MFCD10696664   937048-39-0   
19109  320.1907  B(OH)2, BocN  MFCD18384135  1190095-10-3   
19110  306.1642  B(OH)2, BocN  MFCD10696663   457613-78-4   
19113  334.1743  B(OH)2, BocN  MFCD12546575  1218790-82-9   
19129  334.1743  BocN, B(OH)2  MFCD12025996  1150114-76-3   
19132  306.1642  B(OH)2, BocN  MFCD09753310   915770-01-3   
19139  257.1137  BocN, B(OH)2  MFCD11504845  1072951-39-3   
19161  226.0368  B(OH)2, BocN  MFCD15143457  1217501-27-3   
19402  251.0859  B(OH)2, BocN  MFCD06246052   199609-62-6   
19407  251.0859  B(OH)2, BocN  MFCD04115637   489446-42-6   
19432  238.0475  B(OH)2, BocN  MFCD09907984   863752-59-4   
19437  241.0481  B(OH)2, BocN  MFCD11053783  1072946-49-6   
19441  279.0713  B(OH)2, BocN  MFCD11109422  1000068-65-4   
19445  261.0808  B(OH)2, BocN  MFCD02093045   213318-44-6   
19446  252.0740  BocN, B(OH)2  MFCD05664023   433969-29-0   
1945

In [9]:
### Amino Acid Extension After Suzuki

df_amid_extend = df_wuxi[(df_wuxi['COOH'] == 1) & (df_wuxi['BocN'] == 1) 
                         & (df_wuxi['NH'] == 0) & (df_wuxi['COOtBu'] == 0) 
                         & (df_wuxi['NH2'] == 0) & (df_wuxi['ArNH'] == 0) 
                         & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0)]


print(col_active(df_amid_extend))
print()
print(df_amid_extend)

['COOH', 'CHO', 'BocN', 'ArCl', 'ArF', 'FmocN', 'ArBr', 'OH', 'C=C', 'C≡C', 'COOMe', 'COOEt', 'ArI', 'ArNO2', 'RCOR', 'AllocN', 'CbzN']

             MW             group          MFCD           CAS  \
13768  440.4235   COOH, CHO, BocN  WXCD00135449           NaN   
13936  271.6963  COOH, BocN, ArCl  MFCD02682156   136290-47-6   
13942  271.6963  COOH, BocN, ArCl  MFCD02682158   253677-29-1   
14062  299.7493  COOH, BocN, ArCl  MFCD03427921   500770-74-1   
14064  299.7493  COOH, BocN, ArCl  MFCD02090709   284493-66-9   
...         ...               ...           ...           ...   
21750  281.3069        BocN, COOH  MFCD28657244           NaN   
21754  246.2596        COOH, BocN  MFCD00065571    13726-85-7   
21755  346.4197        COOH, BocN  MFCD24369442  1823807-74-4   
21756  265.3042        COOH, BocN  MFCD02090695    14676-01-8   
21757  270.3239   COOH, BocN, C=C  MFCD07369763   870703-78-9   

                                                  SMILES  solid_stock(mg)  \
13768

In [10]:
### Final Capping (Acylation, Sulfonylation, Urea)

df_urea = df_wuxi[(df_wuxi['NCO'] == 1)] 
                  #& df_wuxi['NH2'] == 0) 
                  #   & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0) & (df_wuxi['NH'] == 0)  
                  #   & (df_wuxi['N3'] == 0) & (df_wuxi['CHO'] == 0)
                  #   & (df_wuxi['ArNH'] == 0) & (df_wuxi['COOtBu'] == 0)] 
#& (df_wuxi['BocN'] == 1) & (df_wuxi['NH'] == 0) 
                       #  & (df_wuxi['NH2'] == 0) & (df_wuxi['ArNH'] == 0) 
                       #  ]


print(col_active(df_urea))
print()
print(df_urea)

df_acyl = df_wuxi[(df_wuxi['COOH'] == 1) 
                  & (df_wuxi['NH2'] == 0) & (df_wuxi['NH'] == 0) 
                  & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0) & (df_wuxi['NH'] == 0)  
                  & (df_wuxi['N3'] == 0) & (df_wuxi['CHO'] == 0)
                  & (df_wuxi['ArNH'] == 0) & (df_wuxi['BF3K'] == 0) 
                  & (df_wuxi['SO2F'] == 0) & (df_wuxi['SO2Cl'] == 0)
                  & (df_wuxi['FmocN'] == 0) & (df_wuxi['C=C'] == 0)
                    ]


print(col_active(df_acyl))
print()
print(df_acyl)

df_sulf = df_wuxi[(df_wuxi['SO2Cl'] == 1) 
                  & (df_wuxi['NH2'] == 0) & (df_wuxi['NH'] == 0) 
                  & (df_wuxi['B(OH)2'] == 0) & (df_wuxi['Bpin'] == 0) & (df_wuxi['NH'] == 0)  
                  & (df_wuxi['N3'] == 0) & (df_wuxi['CHO'] == 0)
                  & (df_wuxi['ArNH'] == 0)
                  #& (df_wuxi['BocN'] == 1) 
                  & (df_wuxi['NH2'] == 0) #& (df_wuxi['ArNH'] == 0) 
                    ]


print(col_active(df_sulf))
print()
print(df_sulf)

['NCO', 'ArCl', 'ArBr', 'ArF']

             MW                  group          MFCD          CAS  \
13970  188.0107        NCO, ArCl, ArCl  MFCD00002002    5392-82-5   
13972  221.5636              ArCl, NCO  MFCD00037029   50528-86-4   
14381  183.5915              ArCl, NCO  MFCD02093651   28395-76-8   
14398  221.5636              ArCl, NCO  MFCD00013874     327-78-6   
14588  167.5921              ArCl, NCO  MFCD00037036   40397-90-8   
14590  167.5921              ArCl, NCO  MFCD00019914   37408-18-7   
14592  222.4558  NCO, ArCl, ArCl, ArCl  MFCD00013845    2505-31-9   
14594  171.5561              ArCl, NCO  MFCD00037037   50529-33-4   
14595  171.5561              ArCl, NCO  MFCD03701597   69922-25-4   
14597  213.6174              ArCl, NCO  MFCD00013857   55440-55-6   
14605  221.5636              ArCl, NCO  MFCD00013873   16588-69-5   
14612  188.0107        NCO, ArCl, ArCl  MFCD00002003   39920-37-1   
14616  167.5921              ArCl, NCO  MFCD00037030   40398-01-4   
14

In [11]:
### Library Size

del_size = len(df_amid_extend)*len(df_suzuki_out)*len(df_suzuki_in)*len(df_red_amin)
print(del_size)

15503227740


In [12]:
mol_disp = mol_random(df_red_amin, 'MFCD', 'SMILES', 20)
red_amin = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )



In [13]:
mol_disp = mol_random(df_suzuki_in, 'MFCD', 'SMILES', 10)
suzuki_in = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )



In [14]:
mol_disp = mol_random(df_suzuki_out, 'MFCD', 'SMILES', 20)
suzuki_out = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )



In [15]:
mol_disp = mol_random(df_amid_extend, 'MFCD', 'SMILES', 10)
amid_extend = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )



In [16]:
mol_disp = mol_random(df_urea, 'MFCD', 'SMILES', 5)
urea = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )

In [17]:
mol_disp = mol_random(df_acyl, 'MFCD', 'SMILES', 10)
acyl = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )

In [18]:
mol_disp = mol_random(df_sulf, 'MFCD', 'SMILES', 10)
sulf = mol_disp

mols2grid.display(mol_disp,
                 # RDKit's MolDrawOptions parameters
                  fixedBondLength=30,
                  # rename fields for the output document
                  subset=['MFCD', 'img', 'SMILES'],
                  tooltip=['SMILES'],
                  MolDrawingOptions=opts,
                  hover_color = 'blue',
                  n_cols = 6
                )

In [22]:
# Create a Pandas Excel writer using XlsxWriter as the engine.
xls_out = '../Wuxi_DEL.xlsx'

# Write each dataframe to a different worksheet.
with pd.ExcelWriter(xls_out) as writer:
    red_amin.to_excel(writer, sheet_name='1_Reductive_Amination', index = False)
    suzuki_in.to_excel(writer, sheet_name='2_Reductive_Alkylation', index = False)
    suzuki_out.to_excel(writer, sheet_name='3_Suzuki', index = False)
    amid_extend.to_excel(writer, sheet_name='4_Amidation', index = False)
    urea.to_excel(writer, sheet_name='5_Capping_Urea', index = False)
    acyl.to_excel(writer, sheet_name='5_Capping_Acyl', index = False)
    sulf.to_excel(writer, sheet_name='5_Capping_Sulfonyl', index = False)

# Close the Pandas Excel writer and output the Excel file.
#writer.save()