In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose values all equal 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is left untouched; the result is a new object.

    Returns
    -------
    pandas.DataFrame
        The columns of ``data`` that hold at least one value different from 0,
        in their original order and with the original row index.
    """
    # One vectorized pass: True for each column with at least one non-zero entry.
    # The mask is converted to a plain boolean array so that the selection below
    # is positional and therefore also works when column labels are duplicated.
    keep = (data != 0).any(axis=0).to_numpy(dtype=bool)
    # Single selection instead of one in-place drop per all-zero column.
    return data.loc[:, keep].copy()

In [3]:
# Morgan Fingerprint (radius = 2)
# Load the reaction table; the first CSV column is used as the row index.
data = pd.read_csv("source/Yields.csv", index_col=0)

# Target values, kept as a one-column frame.
Yield = data['Yield'].to_frame()

# Identifier frames: everything except the listed columns is kept.
Ligand_name_No = data.drop(
    columns=['Ligand_smiles', 'Substrate_name', 'Substrate_No', 'Substrate_smiles', 'Yield']
)
Substrate_name_No = data.drop(
    columns=['Ligand_name', 'Ligand_No', 'Ligand_smiles', 'Substrate_smiles', 'Yield']
)

# Same text as three consecutive print() calls.
print(Yield, Ligand_name_No, Substrate_name_No, sep="\n")

       Yield
Entry       
0         20
1          9
2          7
3         18
4         34
...      ...
295       86
296        1
297        2
298        9
299        0

[300 rows x 1 columns]
      Ligand_name Ligand_No
Entry                      
0            dppm        L1
1            dppm        L1
2            dppm        L1
3            dppm        L1
4            dppm        L1
...           ...       ...
295         sphos       L30
296         sphos       L30
297         sphos       L30
298         sphos       L30
299         sphos       L30

[300 rows x 2 columns]
                    Substrate_name Substrate_No
Entry                                          
0                     benzophenone           1a
1        4,4'-dimethylbenzophenone           1b
2       4,4'-dimethoxybenzophenone           1c
3        4,4'-difluorobenzophenone           1d
4                     acetophenone           1e
...                            ...          ...
295    2,2,2-trifluoroacetophenone 

In [4]:
# MF, Ligand
# Morgan fingerprint (radius 2, 2048 bits) of the ligand SMILES of every row in `data`.
mf_2_l = []
# Iterate over the column itself rather than a hard-coded range(0, 300) with
# label lookups, so the cell works for any number of rows and any index values.
for entry, smiles in data["Ligand_smiles"].items():
    mol_2_l = Chem.MolFromSmiles(smiles)
    if mol_2_l is None:
        # MolFromSmiles signals an unparsable SMILES by returning None.
        raise ValueError(f"Could not parse ligand SMILES at entry {entry!r}: {smiles!r}")
    fp_2_l = AllChem.GetMorganFingerprintAsBitVect(mol_2_l, 2, 2048)
    mf_2_l.append(np.array(fp_2_l, int))

# Carry over data's index so a later column-wise concat aligns rows by entry label.
df_2_l = pd.DataFrame(mf_2_l, index=data.index)
df_2_l = df_2_l.add_prefix('L_MF2_')
print(df_2_l.shape)
# Drop the bit columns that are 0 for every row.
df_2_2_l = remove_all_zero_col(df_2_l)
print(df_2_2_l.shape)
print(df_2_2_l)

(300, 2048)
(300, 182)
     L_MF2_1  L_MF2_2  L_MF2_5  L_MF2_30  L_MF2_45  L_MF2_48  L_MF2_56  \
0          0        0        0         0         1         0         0   
1          0        0        0         0         1         0         0   
2          0        0        0         0         1         0         0   
3          0        0        0         0         1         0         0   
4          0        0        0         0         1         0         0   
..       ...      ...      ...       ...       ...       ...       ...   
295        0        1        0         0         1         1         0   
296        0        1        0         0         1         1         0   
297        0        1        0         0         1         1         0   
298        0        1        0         0         1         1         0   
299        0        1        0         0         1         1         0   

     L_MF2_75  L_MF2_80  L_MF2_90  ...  L_MF2_1926  L_MF2_1928  L_MF2_1953  \
0         

In [5]:
# MF, Substrate
# Morgan fingerprint (radius 2, 2048 bits) of the substrate SMILES of every row in `data`.
mf_2_s = []
# Iterate over the column itself rather than a hard-coded range(0, 300) with
# label lookups, so the cell works for any number of rows and any index values.
for entry, smiles in data["Substrate_smiles"].items():
    mol_2_s = Chem.MolFromSmiles(smiles)
    if mol_2_s is None:
        # MolFromSmiles signals an unparsable SMILES by returning None.
        raise ValueError(f"Could not parse substrate SMILES at entry {entry!r}: {smiles!r}")
    fp_2_s = AllChem.GetMorganFingerprintAsBitVect(mol_2_s, 2, 2048)
    mf_2_s.append(np.array(fp_2_s, int))

# Carry over data's index so a later column-wise concat aligns rows by entry label.
df_2_s = pd.DataFrame(mf_2_s, index=data.index)
df_2_s = df_2_s.add_prefix('S_MF2_')
print(df_2_s.shape)
# Drop the bit columns that are 0 for every row.
df_2_2_s = remove_all_zero_col(df_2_s)
print(df_2_2_s.shape)
print(df_2_2_s)

(300, 2048)
(300, 77)
     S_MF2_2  S_MF2_9  S_MF2_90  S_MF2_105  S_MF2_114  S_MF2_225  S_MF2_235  \
0          0        1         0          1          0          0          0   
1          0        1         0          1          0          0          1   
2          0        1         0          1          0          0          0   
3          0        1         1          1          0          0          0   
4          0        0         0          0          0          0          0   
..       ...      ...       ...        ...        ...        ...        ...   
295        0        0         0          0          1          0          0   
296        1        0         0          0          0          0          0   
297        1        0         0          0          0          0          0   
298        0        0         0          0          0          1          0   
299        1        0         0          0          0          0          0   

     S_MF2_242  S_MF2_293  S_

In [6]:
# Assemble the output table column-wise, in this order:
# ligand ids, ligand fingerprint bits, substrate ids, substrate fingerprint bits, yield.
mf_parts = [Ligand_name_No, df_2_2_l, Substrate_name_No, df_2_2_s, Yield]
MF = pd.concat(mf_parts, axis=1, join='inner')

# join='inner' keeps only the index labels present in every part, so a result
# shorter than any input means rows were silently dropped by misaligned indexes.
if any(len(part) != len(MF) for part in mf_parts):
    raise ValueError(
        "Row indexes of the concatenated frames do not match: "
        f"input lengths {[len(part) for part in mf_parts]}, result length {len(MF)}"
    )

print(MF)
# The row index is not written to the file.
MF.to_csv('../data/MF2.csv', index = False)

    Ligand_name Ligand_No  L_MF2_1  L_MF2_2  L_MF2_5  L_MF2_30  L_MF2_45  \
0          dppm        L1        0        0        0         0         1   
1          dppm        L1        0        0        0         0         1   
2          dppm        L1        0        0        0         0         1   
3          dppm        L1        0        0        0         0         1   
4          dppm        L1        0        0        0         0         1   
..          ...       ...      ...      ...      ...       ...       ...   
295       sphos       L30        0        1        0         0         1   
296       sphos       L30        0        1        0         0         1   
297       sphos       L30        0        1        0         0         1   
298       sphos       L30        0        1        0         0         1   
299       sphos       L30        0        1        0         0         1   

     L_MF2_48  L_MF2_56  L_MF2_75  ...  S_MF2_1820  S_MF2_1823  S_MF2_1865  \
0        