In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns that contain only zeros.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is never modified.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame holding only the columns with at least one
        value that is not equal to zero, in their original order.
    """
    # A column is kept when any entry differs from 0. This is the exact
    # negation of the "every entry == 0" test, including for NaN
    # (NaN == 0 is False and NaN != 0 is True, so NaN columns are kept).
    has_nonzero = (data != 0).any(axis=0)
    # Select by position (plain boolean array) rather than by label so that
    # frames with duplicate column names are handled column-by-column.
    return data.loc[:, has_nonzero.to_numpy()].copy()

In [3]:
# Morgan Fingerprint (radius = 2)
# Load the reaction table; the first CSV column becomes the row index.
data = pd.read_csv("source/Yields.csv", index_col=0)

# Target values as a single-column frame.
Yield = data['Yield'].to_frame(name='Yield')

# Label frames: whatever remains after dropping the listed columns.
Ligand_name_No = data.drop(
    ['Ligand_smiles', 'Substrate_name', 'Substrate_No', 'Substrate_smiles', 'Yield'],
    axis=1,
)
Substrate_name_No = data.drop(
    ['Ligand_name', 'Ligand_No', 'Ligand_smiles', 'Substrate_smiles', 'Yield'],
    axis=1,
)

for frame in (Yield, Ligand_name_No, Substrate_name_No):
    print(frame)

       Yield
Entry       
0         20
1          9
2          7
3         18
4         34
...      ...
295       86
296        1
297        2
298        9
299        0

[300 rows x 1 columns]
      Ligand_name Ligand_No
Entry                      
0            dppm        L1
1            dppm        L1
2            dppm        L1
3            dppm        L1
4            dppm        L1
...           ...       ...
295         sphos       L30
296         sphos       L30
297         sphos       L30
298         sphos       L30
299         sphos       L30

[300 rows x 2 columns]
                    Substrate_name Substrate_No
Entry                                          
0                     benzophenone           1a
1        4,4'-dimethylbenzophenone           1b
2       4,4'-dimethoxybenzophenone           1c
3        4,4'-difluorobenzophenone           1d
4                     acetophenone           1e
...                            ...          ...
295    2,2,2-trifluoroacetophenone 

In [4]:
# MF, Ligand
# Build one Morgan fingerprint bit vector (radius 2, 2048 bits) per row of
# `data` from its ligand SMILES, then drop the bit columns that are never set.
ligand_smiles = data["Ligand_smiles"]
mf_2_l = []
# Iterate over however many rows were loaded instead of a hard-coded 300, so
# the cell keeps working when the CSV grows or shrinks. Entries are looked up
# by their index value i, exactly as before.
for i in range(len(data)):
    mol_2_l = Chem.MolFromSmiles(ligand_smiles[i])
    if mol_2_l is None:
        # MolFromSmiles signals an unparsable SMILES by returning None; fail
        # here with the offending entry rather than inside the fingerprint call.
        raise ValueError(
            "Could not parse ligand SMILES at entry {}: {!r}".format(i, ligand_smiles[i])
        )
    fp_2_l = AllChem.GetMorganFingerprintAsBitVect(mol_2_l, 2, 2048)
    mf_2_l.append(np.array(fp_2_l, int))

# One row per entry, one column per fingerprint bit, prefixed for the ligand.
df_2_l = pd.DataFrame(mf_2_l)
df_2_l = df_2_l.add_prefix('L_MF2_')
print(df_2_l.shape)
# Bits that are 0 for every row carry no information and are removed.
df_2_2_l = remove_all_zero_col(df_2_l)
print(df_2_2_l.shape)
print(df_2_2_l)

(300, 2048)
(300, 182)
     L_MF2_1  L_MF2_2  L_MF2_5  L_MF2_30  L_MF2_45  L_MF2_48  L_MF2_56  \
0          0        0        0         0         1         0         0   
1          0        0        0         0         1         0         0   
2          0        0        0         0         1         0         0   
3          0        0        0         0         1         0         0   
4          0        0        0         0         1         0         0   
..       ...      ...      ...       ...       ...       ...       ...   
295        0        1        0         0         1         1         0   
296        0        1        0         0         1         1         0   
297        0        1        0         0         1         1         0   
298        0        1        0         0         1         1         0   
299        0        1        0         0         1         1         0   

     L_MF2_75  L_MF2_80  L_MF2_90  ...  L_MF2_1926  L_MF2_1928  L_MF2_1953  \
0         

In [5]:
# MF, Substrate
# Build one Morgan fingerprint bit vector (radius 2, 2048 bits) per row of
# `data` from its substrate SMILES, then drop the bit columns that are never set.
substrate_smiles = data["Substrate_smiles"]
mf_2_s = []
# Iterate over however many rows were loaded instead of a hard-coded 300, so
# the cell keeps working when the CSV grows or shrinks. Entries are looked up
# by their index value i, exactly as before.
for i in range(len(data)):
    mol_2_s = Chem.MolFromSmiles(substrate_smiles[i])
    if mol_2_s is None:
        # MolFromSmiles signals an unparsable SMILES by returning None; fail
        # here with the offending entry rather than inside the fingerprint call.
        raise ValueError(
            "Could not parse substrate SMILES at entry {}: {!r}".format(i, substrate_smiles[i])
        )
    fp_2_s = AllChem.GetMorganFingerprintAsBitVect(mol_2_s, 2, 2048)
    mf_2_s.append(np.array(fp_2_s, int))

# One row per entry, one column per fingerprint bit, prefixed for the substrate.
df_2_s = pd.DataFrame(mf_2_s)
df_2_s = df_2_s.add_prefix('S_MF2_')
print(df_2_s.shape)
# Bits that are 0 for every row carry no information and are removed.
df_2_2_s = remove_all_zero_col(df_2_s)
print(df_2_2_s.shape)
print(df_2_2_s)

(300, 2048)
(300, 77)
     S_MF2_2  S_MF2_9  S_MF2_90  S_MF2_105  S_MF2_114  S_MF2_225  S_MF2_235  \
0          0        1         0          1          0          0          0   
1          0        1         0          1          0          0          1   
2          0        1         0          1          0          0          0   
3          0        1         1          1          0          0          0   
4          0        0         0          0          0          0          0   
..       ...      ...       ...        ...        ...        ...        ...   
295        0        0         0          0          1          0          0   
296        1        0         0          0          0          0          0   
297        1        0         0          0          0          0          0   
298        0        0         0          0          0          1          0   
299        1        0         0          0          0          0          0   

     S_MF2_242  S_MF2_293  S_

In [6]:
# PCA, Ligand
# Standardise the remaining ligand fingerprint columns, then project them
# onto 16 principal components.
X_l = df_2_2_l.to_numpy(copy=True)
sc = StandardScaler()
X_sc_l = sc.fit_transform(X_l)

pca_l = PCA(n_components=16)
pca_l.fit(X_sc_l)
X_pca_l = pca_l.transform(X_sc_l)

# Report the shapes before/after projection and the total variance retained.
print("Before: {}".format(X_sc_l.shape))
print("After: {}".format(X_pca_l.shape))
explained_l = sum(pca_l.explained_variance_ratio_)
print('sum of explained variance ratio: {0}'.format(explained_l))

# Principal-component scores as a frame with ligand-specific column names.
df_l = pd.DataFrame(X_pca_l).add_prefix('L_MF_PC_')
print(df_l)

Before: (300, 182)
After: (300, 16)
sum of explained variance ratio: 0.8198004664192646
     L_MF_PC_0  L_MF_PC_1  L_MF_PC_2  L_MF_PC_3  L_MF_PC_4  L_MF_PC_5  \
0    -2.410078  -0.648546  -2.684240   0.843261   1.364202  -0.526848   
1    -2.410078  -0.648546  -2.684240   0.843261   1.364202  -0.526848   
2    -2.410078  -0.648546  -2.684240   0.843261   1.364202  -0.526848   
3    -2.410078  -0.648546  -2.684240   0.843261   1.364202  -0.526848   
4    -2.410078  -0.648546  -2.684240   0.843261   1.364202  -0.526848   
..         ...        ...        ...        ...        ...        ...   
295   9.656214   0.454562   0.842499  -0.617566  -2.402030  -2.177909   
296   9.656214   0.454562   0.842499  -0.617566  -2.402030  -2.177909   
297   9.656214   0.454562   0.842499  -0.617566  -2.402030  -2.177909   
298   9.656214   0.454562   0.842499  -0.617566  -2.402030  -2.177909   
299   9.656214   0.454562   0.842499  -0.617566  -2.402030  -2.177909   

     L_MF_PC_6  L_MF_PC_7  L_MF_PC_

In [7]:
# PCA, Substrate
# Standardise the remaining substrate fingerprint columns, then project them
# onto 6 principal components.
X_s = np.array(df_2_2_s)
# Use a dedicated scaler for the substrate features. Re-fitting the scaler
# object created for the ligand features would overwrite its fitted
# statistics and make this cell depend on the ligand PCA cell having run.
sc_s = StandardScaler()
X_sc_s = sc_s.fit(X_s).transform(X_s)
pca_s = PCA(n_components=6)
X_pca_s = pca_s.fit(X_sc_s).transform(X_sc_s)
# Report the shapes before/after projection and the total variance retained.
print("Before: {}".format(str(X_sc_s.shape)))
print("After: {}".format(str(X_pca_s.shape)))
print('sum of explained variance ratio: {0}'.format(sum(pca_s.explained_variance_ratio_)))
# Principal-component scores as a frame with substrate-specific column names.
df_s = pd.DataFrame(X_pca_s)
df_s = df_s.add_prefix('S_MF_PC_')
print(df_s)

Before: (300, 77)
After: (300, 6)
sum of explained variance ratio: 0.8463134460878263
     S_MF_PC_0  S_MF_PC_1  S_MF_PC_2  S_MF_PC_3  S_MF_PC_4  S_MF_PC_5
0    -1.738976  -0.736113   1.180710  -0.241643  -0.069934   1.124281
1    -3.517142  -2.351154  -1.965914  -0.067334  -2.383908   2.406819
2    -4.718714  -3.647118  -4.149634  -0.223167   4.993940  -2.887405
3    -3.407302  -2.140233  -1.135399  -0.163764  -4.949665  -0.392103
4    -1.402136  -0.217859   2.970650  -0.348099   2.126987   4.842387
..         ...        ...        ...        ...        ...        ...
295  -1.635377   0.569874   7.156309  -1.059303  -0.703671  -4.425943
296   0.264344   6.061350   1.402472   0.130194   2.249308   1.344839
297   8.265388  -1.458391  -1.746587  -6.546150  -0.142835  -0.231464
298   7.255977  -3.844433   0.600541   6.672089   0.199564  -0.347641
299   0.633938   7.764077  -4.313148   1.847179  -1.319786  -1.433770

[300 rows x 6 columns]


In [8]:
# Assemble the final table column-wise: ligand labels, ligand PCs, substrate
# labels, substrate PCs, yield. Frames are matched on their row index and
# join='inner' keeps only index values present in every frame.
MF_pca = pd.concat(
    [
        Ligand_name_No,
        df_l,
        Substrate_name_No,
        df_s,
        Yield,
    ],
    axis='columns',
    join='inner',
)
print(MF_pca)
# The row index is not written to the CSV.
MF_pca.to_csv('../data/MF2_pca.csv', index=False)

    Ligand_name Ligand_No  L_MF_PC_0  L_MF_PC_1  L_MF_PC_2  L_MF_PC_3  \
0          dppm        L1  -2.410078  -0.648546  -2.684240   0.843261   
1          dppm        L1  -2.410078  -0.648546  -2.684240   0.843261   
2          dppm        L1  -2.410078  -0.648546  -2.684240   0.843261   
3          dppm        L1  -2.410078  -0.648546  -2.684240   0.843261   
4          dppm        L1  -2.410078  -0.648546  -2.684240   0.843261   
..          ...       ...        ...        ...        ...        ...   
295       sphos       L30   9.656214   0.454562   0.842499  -0.617566   
296       sphos       L30   9.656214   0.454562   0.842499  -0.617566   
297       sphos       L30   9.656214   0.454562   0.842499  -0.617566   
298       sphos       L30   9.656214   0.454562   0.842499  -0.617566   
299       sphos       L30   9.656214   0.454562   0.842499  -0.617566   

     L_MF_PC_4  L_MF_PC_5  L_MF_PC_6  L_MF_PC_7  ...  L_MF_PC_15  \
0     1.364202  -0.526848  -0.952250  -2.580176  ...   