In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit import DataStructs

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose values are all 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding, in their original order, only the columns that
        contain at least one value different from 0.
    """
    # One vectorized pass instead of dropping columns one by one in place.
    # The mask is applied positionally (plain boolean array), so frames with
    # duplicated column labels are handled instead of raising on the
    # ambiguous truth value of ``(data[col] == 0).all()``.
    keep = (data != 0).any(axis=0).to_numpy(dtype=bool)
    return data.loc[:, keep].copy()

In [3]:
# Load the ligand/substrate table used by every following cell.
data = pd.read_csv("source/Ligand_candidate.csv")

# Fail early, with the same exception type the previous drop()-based code
# raised, if any column this notebook relies on is absent.
required_columns = ['Ligand_name', 'Ligand_smiles', 'Substrate_name', 'Substrate_smiles', 'Yield']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"{missing_columns} not found in source/Ligand_candidate.csv")

# Select the wanted columns explicitly rather than dropping every other known
# column: an extra column in the CSV can then no longer leak into these
# one-column frames (and from there into the exported table).
Substrate_name = data[['Substrate_name']]   # one-column DataFrame
Ligand_name = data[['Ligand_name']]         # one-column DataFrame
Ligand_smiles = data['Ligand_smiles']       # Series
Yields = data['Yield']                      # Series

print(Substrate_name)
print(Ligand_name)
print(Ligand_smiles)
print(Yields)

                 Substrate_name
0                  benzophenone
1     4,4'-dimethylbenzophenone
2    4,4'-dimethoxybenzophenone
3     4,4'-difluorobenzophenone
4                  acetophenone
..                          ...
367             alpha-tetralone
368             alpha-tetralone
369             alpha-tetralone
370             alpha-tetralone
371             alpha-tetralone

[372 rows x 1 columns]
    Ligand_name
0            L1
1            L1
2            L1
3            L1
4            L1
..          ...
367         L98
368         L99
369        L100
370        L101
371        L102

[372 rows x 1 columns]
0      P(C1=CC=CC=C1)(C2=CC=CC=C2)CP(C3=CC=CC=C3)C4=C...
1      P(C1=CC=CC=C1)(C2=CC=CC=C2)CP(C3=CC=CC=C3)C4=C...
2      P(C1=CC=CC=C1)(C2=CC=CC=C2)CP(C3=CC=CC=C3)C4=C...
3      P(C1=CC=CC=C1)(C2=CC=CC=C2)CP(C3=CC=CC=C3)C4=C...
4      P(C1=CC=CC=C1)(C2=CC=CC=C2)CP(C3=CC=CC=C3)C4=C...
                             ...                        
367    O=S(C1=CC=CC(P(C2=CC=CC=C2)

In [4]:
# MK, Ligand: MACCS-keys fingerprint of every ligand SMILES, one row per row of `data`.
mk_l = []
# Iterate over the column itself instead of a hard-coded range(0, 372) with
# label lookups, so the cell follows the actual number of rows in the CSV.
for row_label, smiles_l in data["Ligand_smiles"].items():
    mol_MK_l = Chem.MolFromSmiles(smiles_l)
    if mol_MK_l is None:
        # MolFromSmiles signals an unparsable SMILES by returning None;
        # report which row is at fault instead of failing inside RDKit.
        raise ValueError(f"Could not parse Ligand_smiles in row {row_label!r}: {smiles_l!r}")
    fp_MK_l = MACCSkeys.GenMACCSKeys(mol_MK_l)
    # Fresh destination array per molecule; ConvertToNumpyArray resizes it to
    # the fingerprint length. The default float dtype is kept on purpose so
    # the resulting columns stay 0.0 / 1.0 as before.
    fp_arr_l = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_MK_l, fp_arr_l)
    mk_l.append(fp_arr_l)

# Reuse the index of `data` so a later column-wise concat aligns row by row.
df_l = pd.DataFrame(mk_l, index=data.index)
df_l = df_l.add_prefix('L_MK_')
print(df_l.shape)
# Keys that are 0 for every ligand carry no information; drop them.
df_2_l = remove_all_zero_col(df_l)
print(df_2_l.shape)
print(df_2_l)

(372, 167)
(372, 116)
     L_MK_9  L_MK_28  L_MK_29  L_MK_30  L_MK_31  L_MK_35  L_MK_36  L_MK_39  \
0       0.0      1.0      1.0      0.0      0.0      0.0      0.0      0.0   
1       0.0      1.0      1.0      0.0      0.0      0.0      0.0      0.0   
2       0.0      1.0      1.0      0.0      0.0      0.0      0.0      0.0   
3       0.0      1.0      1.0      0.0      0.0      0.0      0.0      0.0   
4       0.0      1.0      1.0      0.0      0.0      0.0      0.0      0.0   
..      ...      ...      ...      ...      ...      ...      ...      ...   
367     0.0      0.0      1.0      0.0      0.0      1.0      0.0      1.0   
368     0.0      0.0      1.0      0.0      0.0      0.0      0.0      0.0   
369     0.0      0.0      1.0      0.0      0.0      0.0      0.0      0.0   
370     0.0      0.0      1.0      0.0      0.0      0.0      0.0      0.0   
371     0.0      0.0      1.0      0.0      0.0      0.0      0.0      0.0   

     L_MK_40  L_MK_41  ...  L_MK_157  L_M

In [5]:
# MK, Substrate: MACCS-keys fingerprint of every substrate SMILES, one row per row of `data`.
mk_s = []
# Iterate over the column itself instead of a hard-coded range(0, 372) with
# label lookups, so the cell follows the actual number of rows in the CSV.
for row_label, smiles_s in data["Substrate_smiles"].items():
    mol_MK_s = Chem.MolFromSmiles(smiles_s)
    if mol_MK_s is None:
        # MolFromSmiles signals an unparsable SMILES by returning None;
        # report which row is at fault instead of failing inside RDKit.
        raise ValueError(f"Could not parse Substrate_smiles in row {row_label!r}: {smiles_s!r}")
    fp_MK_s = MACCSkeys.GenMACCSKeys(mol_MK_s)
    # Fresh destination array per molecule; ConvertToNumpyArray resizes it to
    # the fingerprint length. The default float dtype is kept on purpose so
    # the resulting columns stay 0.0 / 1.0 as before.
    fp_arr_s = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_MK_s, fp_arr_s)
    mk_s.append(fp_arr_s)

# Reuse the index of `data` so a later column-wise concat aligns row by row.
df_s = pd.DataFrame(mk_s, index=data.index)
df_s = df_s.add_prefix('S_MK_')
print(df_s.shape)
# Keys that are 0 for every substrate carry no information; drop them.
df_2_s = remove_all_zero_col(df_s)
print(df_2_s.shape)
print(df_2_s)

(372, 167)
(372, 32)
     S_MK_42  S_MK_87  S_MK_93  S_MK_101  S_MK_105  S_MK_106  S_MK_107  \
0        0.0      0.0      0.0       0.0       0.0       0.0       0.0   
1        0.0      0.0      0.0       0.0       0.0       0.0       0.0   
2        0.0      0.0      1.0       0.0       0.0       0.0       0.0   
3        1.0      1.0      0.0       0.0       0.0       0.0       1.0   
4        0.0      0.0      0.0       0.0       0.0       0.0       0.0   
..       ...      ...      ...       ...       ...       ...       ...   
367      0.0      0.0      0.0       1.0       1.0       0.0       0.0   
368      0.0      0.0      0.0       1.0       1.0       0.0       0.0   
369      0.0      0.0      0.0       1.0       1.0       0.0       0.0   
370      0.0      0.0      0.0       1.0       1.0       0.0       0.0   
371      0.0      0.0      0.0       1.0       1.0       0.0       0.0   

     S_MK_112  S_MK_113  S_MK_118  ...  S_MK_149  S_MK_152  S_MK_154  \
0         0.0     

In [6]:
# Assemble the final table column-wise: ligand name, ligand SMILES, ligand
# MACCS columns, substrate name, substrate MACCS columns, and the yield last.
MK = pd.concat([Ligand_name, Ligand_smiles, df_2_l, Substrate_name, df_2_s, Yields], axis=1, join='inner')
# join='inner' keeps only the index labels shared by every piece, so a
# misaligned index would silently drop rows. Refuse to write a short file.
assert len(MK) == len(data), f"concat kept {len(MK)} of {len(data)} rows; the pieces have misaligned indices"
print(MK)
MK.to_csv('../data/MK2.csv', index = False)

    Ligand_name                                      Ligand_smiles  L_MK_9  \
0            L1  P(C1=CC=CC=C1)(C2=CC=CC=C2)CP(C3=CC=CC=C3)C4=C...     0.0   
1            L1  P(C1=CC=CC=C1)(C2=CC=CC=C2)CP(C3=CC=CC=C3)C4=C...     0.0   
2            L1  P(C1=CC=CC=C1)(C2=CC=CC=C2)CP(C3=CC=CC=C3)C4=C...     0.0   
3            L1  P(C1=CC=CC=C1)(C2=CC=CC=C2)CP(C3=CC=CC=C3)C4=C...     0.0   
4            L1  P(C1=CC=CC=C1)(C2=CC=CC=C2)CP(C3=CC=CC=C3)C4=C...     0.0   
..          ...                                                ...     ...   
367         L98  O=S(C1=CC=CC(P(C2=CC=CC=C2)C3=CC=CC=C3)=C1)([O...     0.0   
368         L99  CC(P(C1=CC=C(N(C)C)C=C1)C2=CC=C(N(C)C)C=C2)CC(...     0.0   
369        L100  CC(P(C1=CC(C)=CC(C)=C1)C2=CC(C)=CC(C)=C2)CC(C)...     0.0   
370        L101                              CCCCCCP(CCCCCC)CCCCCC     0.0   
371        L102                        CCCCCCCCP(CCCCCCCC)CCCCCCCC     0.0   

     L_MK_28  L_MK_29  L_MK_30  L_MK_31  L_MK_35  L_MK_36  L_MK