In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from rdkit import DataStructs

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns that contain only zeros.

    A column is removed only when every one of its entries compares equal
    to 0; a column with at least one entry that does not equal 0 is kept.
    The input frame is never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving columns in their original order.
    """
    # One vectorised pass instead of dropping column by column with
    # inplace=True.  Selecting by a positional boolean mask also works when
    # column labels are duplicated; a per-label lookup (``data[col]``) would
    # return a frame in that case and make an ``if`` test on it raise.
    keep = (data != 0).any(axis=0).to_numpy(dtype=bool)
    return data.loc[:, keep].copy()

In [3]:
# Read the reaction table; the first CSV column is used as the row index.
data = pd.read_csv("source/Yields.csv", index_col=0)

# Target column, kept as a one-column frame.
Yield = data['Yield'].to_frame(name='Yield')

# Identifier frames: every column except the listed ones is retained.
Ligand_name_No = data.drop(
    labels=['Ligand_smiles', 'Substrate_name', 'Substrate_No', 'Substrate_smiles', 'Yield'],
    axis=1,
)
Substrate_name_No = data.drop(
    labels=['Ligand_name', 'Ligand_No', 'Ligand_smiles', 'Substrate_smiles', 'Yield'],
    axis=1,
)

# Text dump of the three frames, one after the other.
print(Yield, Ligand_name_No, Substrate_name_No, sep="\n")

       Yield
Entry       
0         20
1          9
2          7
3         18
4         34
...      ...
295       86
296        1
297        2
298        9
299        0

[300 rows x 1 columns]
      Ligand_name Ligand_No
Entry                      
0            dppm        L1
1            dppm        L1
2            dppm        L1
3            dppm        L1
4            dppm        L1
...           ...       ...
295         sphos       L30
296         sphos       L30
297         sphos       L30
298         sphos       L30
299         sphos       L30

[300 rows x 2 columns]
                    Substrate_name Substrate_No
Entry                                          
0                     benzophenone           1a
1        4,4'-dimethylbenzophenone           1b
2       4,4'-dimethoxybenzophenone           1c
3        4,4'-difluorobenzophenone           1d
4                     acetophenone           1e
...                            ...          ...
295    2,2,2-trifluoroacetophenone 

In [4]:
# MK, Ligand
# Build one MACCS-key fingerprint per row of `data` from the ligand SMILES.
mk_l = []
# Iterate over the column itself rather than a hard-coded range(0, 300) with
# integer-label lookups, so the cell follows whatever rows the CSV contains.
for entry_l, smiles_l in data["Ligand_smiles"].items():
    mol_MK_l = Chem.MolFromSmiles(smiles_l)
    if mol_MK_l is None:
        # MolFromSmiles signals a parse failure by returning None; stop here
        # with the offending row instead of failing inside GenMACCSKeys.
        raise ValueError(f"Entry {entry_l!r}: cannot parse ligand SMILES {smiles_l!r}")
    fp_MK_l = MACCSkeys.GenMACCSKeys(mol_MK_l)
    # ConvertToNumpyArray fills (and resizes) the array passed to it, so a
    # fresh placeholder is needed for every molecule.
    fp_arr_l = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_MK_l, fp_arr_l)
    mk_l.append(fp_arr_l)

# Reuse the index of `data` so the features stay aligned with the other
# frames when they are concatenated on the index later on.
df_l = pd.DataFrame(mk_l, index=data.index)
df_l = df_l.add_prefix('L_MK_')
print(df_l.shape)
# Keys that are zero for every row are dropped.
df_2_l = remove_all_zero_col(df_l)
print(df_2_l.shape)
print(df_2_l)

(300, 167)
(300, 60)
     L_MK_9  L_MK_28  L_MK_29  L_MK_42  L_MK_44  L_MK_49  L_MK_57  L_MK_62  \
0       0.0      1.0      1.0      0.0      0.0      0.0      0.0      0.0   
1       0.0      1.0      1.0      0.0      0.0      0.0      0.0      0.0   
2       0.0      1.0      1.0      0.0      0.0      0.0      0.0      0.0   
3       0.0      1.0      1.0      0.0      0.0      0.0      0.0      0.0   
4       0.0      1.0      1.0      0.0      0.0      0.0      0.0      0.0   
..      ...      ...      ...      ...      ...      ...      ...      ...   
295     0.0      0.0      1.0      0.0      0.0      0.0      0.0      1.0   
296     0.0      0.0      1.0      0.0      0.0      0.0      0.0      1.0   
297     0.0      0.0      1.0      0.0      0.0      0.0      0.0      1.0   
298     0.0      0.0      1.0      0.0      0.0      0.0      0.0      1.0   
299     0.0      0.0      1.0      0.0      0.0      0.0      0.0      1.0   

     L_MK_66  L_MK_74  ...  L_MK_153  L_MK

In [5]:
# MK, Substrate
# Build one MACCS-key fingerprint per row of `data` from the substrate SMILES.
mk_s = []
# Iterate over the column itself rather than a hard-coded range(0, 300) with
# integer-label lookups, so the cell follows whatever rows the CSV contains.
for entry_s, smiles_s in data["Substrate_smiles"].items():
    mol_MK_s = Chem.MolFromSmiles(smiles_s)
    if mol_MK_s is None:
        # MolFromSmiles signals a parse failure by returning None; stop here
        # with the offending row instead of failing inside GenMACCSKeys.
        raise ValueError(f"Entry {entry_s!r}: cannot parse substrate SMILES {smiles_s!r}")
    fp_MK_s = MACCSkeys.GenMACCSKeys(mol_MK_s)
    # ConvertToNumpyArray fills (and resizes) the array passed to it, so a
    # fresh placeholder is needed for every molecule.
    fp_arr_s = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_MK_s, fp_arr_s)
    mk_s.append(fp_arr_s)

# Reuse the index of `data` so the features stay aligned with the other
# frames when they are concatenated on the index later on.
df_s = pd.DataFrame(mk_s, index=data.index)
df_s = df_s.add_prefix('S_MK_')
print(df_s.shape)
# Keys that are zero for every row are dropped.
df_2_s = remove_all_zero_col(df_s)
print(df_2_s.shape)
print(df_2_s)

(300, 167)
(300, 32)
     S_MK_42  S_MK_87  S_MK_93  S_MK_101  S_MK_105  S_MK_106  S_MK_107  \
0        0.0      0.0      0.0       0.0       0.0       0.0       0.0   
1        0.0      0.0      0.0       0.0       0.0       0.0       0.0   
2        0.0      0.0      1.0       0.0       0.0       0.0       0.0   
3        1.0      1.0      0.0       0.0       0.0       0.0       1.0   
4        0.0      0.0      0.0       0.0       0.0       0.0       0.0   
..       ...      ...      ...       ...       ...       ...       ...   
295      1.0      0.0      0.0       0.0       0.0       1.0       1.0   
296      0.0      0.0      0.0       0.0       0.0       0.0       0.0   
297      0.0      0.0      0.0       1.0       1.0       0.0       0.0   
298      0.0      0.0      0.0       1.0       1.0       0.0       0.0   
299      0.0      0.0      0.0       0.0       0.0       0.0       0.0   

     S_MK_112  S_MK_113  S_MK_118  ...  S_MK_149  S_MK_152  S_MK_154  \
0         0.0     

In [6]:
# Put identifiers, ligand keys, substrate keys and the yield side by side.
# join='inner' keeps only the index labels present in every frame, so any
# index mismatch between the frames would silently shrink the table.
MK = pd.concat([Ligand_name_No, df_2_l, Substrate_name_No, df_2_s, Yield], axis=1, join='inner')
if len(MK) != len(Yield):
    # Fail before anything is written instead of exporting a truncated table.
    raise ValueError(
        f"Index mismatch while assembling MK: {len(MK)} rows after the inner join, "
        f"expected {len(Yield)}"
    )
print(MK)
# The index is not written to the CSV.
MK.to_csv('../data/MK.csv', index = False)

    Ligand_name Ligand_No  L_MK_9  L_MK_28  L_MK_29  L_MK_42  L_MK_44  \
0          dppm        L1     0.0      1.0      1.0      0.0      0.0   
1          dppm        L1     0.0      1.0      1.0      0.0      0.0   
2          dppm        L1     0.0      1.0      1.0      0.0      0.0   
3          dppm        L1     0.0      1.0      1.0      0.0      0.0   
4          dppm        L1     0.0      1.0      1.0      0.0      0.0   
..          ...       ...     ...      ...      ...      ...      ...   
295       sphos       L30     0.0      0.0      1.0      0.0      0.0   
296       sphos       L30     0.0      0.0      1.0      0.0      0.0   
297       sphos       L30     0.0      0.0      1.0      0.0      0.0   
298       sphos       L30     0.0      0.0      1.0      0.0      0.0   
299       sphos       L30     0.0      0.0      1.0      0.0      0.0   

     L_MK_49  L_MK_57  L_MK_62  ...  S_MK_152  S_MK_154  S_MK_157  S_MK_159  \
0        0.0      0.0      0.0  ...       1.