In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from mordred import Calculator, descriptors
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
def remove_all_zero_col(data):
    data = data.copy()
    for col in data.columns:
        if (data[col] == 0).all():
            data.drop(col, axis=1, inplace=True)
    return data

In [3]:
data = pd.read_csv("source/Yields.csv", index_col=0)
Yield = pd.DataFrame(data['Yield'], columns=['Yield'])
Ligand_name_No = data.drop(columns=['Ligand_smiles', 'Substrate_name', 'Substrate_No', 'Substrate_smiles', 'Yield'])
Substrate_name_No = data.drop(columns=['Ligand_name', 'Ligand_No', 'Ligand_smiles', 'Substrate_smiles', 'Yield'])

print(Yield)
print(Ligand_name_No)
print(Substrate_name_No)

       Yield
Entry       
0         20
1          9
2          7
3         18
4         34
...      ...
295       86
296        1
297        2
298        9
299        0

[300 rows x 1 columns]
      Ligand_name Ligand_No
Entry                      
0            dppm        L1
1            dppm        L1
2            dppm        L1
3            dppm        L1
4            dppm        L1
...           ...       ...
295         sphos       L30
296         sphos       L30
297         sphos       L30
298         sphos       L30
299         sphos       L30

[300 rows x 2 columns]
                    Substrate_name Substrate_No
Entry                                          
0                     benzophenone           1a
1        4,4'-dimethylbenzophenone           1b
2       4,4'-dimethoxybenzophenone           1c
3        4,4'-difluorobenzophenone           1d
4                     acetophenone           1e
...                            ...          ...
295    2,2,2-trifluoroacetophenone 

In [4]:
# mordred, Ligand
data['Mol'] = data["Ligand_smiles"].apply(Chem.MolFromSmiles)

calc_mordred_l = Calculator(descriptors, ignore_3D=True)
desc_mordred_l = calc_mordred_l.pandas(data['Mol'])

mordred_l = desc_mordred_l.astype(str)
masks_l = mordred_l.apply(lambda d: d.str.contains('[a-zA-Z]' ,na=False))
mordred_l = mordred_l[~masks_l]
mordred_l = mordred_l.astype(float)
df_mordred_l = mordred_l.dropna(how="any", axis="columns")

print(df_mordred_l.shape)
df_mordred_l_2 = remove_all_zero_col(df_mordred_l)
df_mordred_l_2 = df_mordred_l_2.add_prefix('L_')
print(df_mordred_l_2.shape)
print(df_mordred_l_2)

 36%|███▋      | 109/300 [00:03<00:04, 46.48it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 300/300 [00:06<00:00, 48.99it/s]


(300, 1126)
(300, 848)
       L_nAcid  L_nAromAtom  L_nAromBond  L_nAtom  L_nHeavyAtom  L_nHetero  \
Entry                                                                        
0          0.0         24.0         24.0     49.0          27.0        2.0   
1          0.0         24.0         24.0     49.0          27.0        2.0   
2          0.0         24.0         24.0     49.0          27.0        2.0   
3          0.0         24.0         24.0     49.0          27.0        2.0   
4          0.0         24.0         24.0     49.0          27.0        2.0   
...        ...          ...          ...      ...           ...        ...   
295        0.0         12.0         12.0     64.0          29.0        3.0   
296        0.0         12.0         12.0     64.0          29.0        3.0   
297        0.0         12.0         12.0     64.0          29.0        3.0   
298        0.0         12.0         12.0     64.0          29.0        3.0   
299        0.0         12.0         12.0 

In [5]:
# mordred, Substrate
data['Mol'] = data["Substrate_smiles"].apply(Chem.MolFromSmiles)

calc_mordred_s = Calculator(descriptors, ignore_3D=True)
desc_mordred_s = calc_mordred_s.pandas(data['Mol'])

mordred_s = desc_mordred_s.astype(str)
masks_s = mordred_s.apply(lambda d: d.str.contains('[a-zA-Z]' ,na=False))
mordred_s = mordred_s[~masks_s]
mordred_s = mordred_s.astype(float)
df_mordred_s = mordred_s.dropna(how="any", axis="columns")

print(df_mordred_s.shape)
df_mordred_s_2 = remove_all_zero_col(df_mordred_s)
df_mordred_s_2 = df_mordred_s_2.add_prefix('S_')
print(df_mordred_s_2.shape)
print(df_mordred_s_2)

100%|██████████| 300/300 [00:03<00:00, 78.77it/s] 


(300, 1347)
(300, 1040)
       S_SpAbs_A  S_SpMax_A  S_SpDiam_A   S_SpAD_A  S_SpMAD_A  S_LogEE_A  \
Entry                                                                      
0      18.814625   2.307250    4.614501  18.814625   1.343902   3.558646   
1      20.337424   2.334414    4.668828  20.337424   1.271089   3.691412   
2      23.678718   2.342923    4.685846  23.678718   1.315484   3.799532   
3      20.337424   2.334414    4.668828  20.337424   1.271089   3.691412   
4      11.189957   2.193993    4.387987  11.189957   1.243329   3.089765   
...          ...        ...         ...        ...        ...        ...   
295    14.216958   2.319378    4.638755  14.216958   1.184746   3.383391   
296    18.814625   2.307250    4.614501  18.814625   1.343902   3.558646   
297    14.495357   2.351365    4.702730  14.495357   1.317760   3.342101   
298    14.426921   2.333244    4.666488  14.426921   1.311538   3.341737   
299    18.814625   2.307250    4.614501  18.814625   1.343902   

In [6]:
# PCA, Ligand
X_l = np.array(df_mordred_l_2)
sc = StandardScaler()
X_sc_l = sc.fit(X_l).transform(X_l)
pca_l = PCA(n_components=7)
X_pca_l = pca_l.fit(X_sc_l).transform(X_sc_l)
print("Before: {}".format(str(X_sc_l.shape)))
print("After: {}".format(str(X_pca_l.shape)))
print('sum of explained variance ratio: {0}'.format(sum(pca_l.explained_variance_ratio_)))
df_l = pd.DataFrame(X_pca_l)
df_l = df_l.add_prefix('L_mordred_PC_')
print(df_l)

Before: (300, 848)
After: (300, 7)
sum of explained variance ratio: 0.8244507910135536
     L_mordred_PC_0  L_mordred_PC_1  L_mordred_PC_2  L_mordred_PC_3  \
0         -0.033146        0.403049       -5.521747       -2.631317   
1         -0.033146        0.403049       -5.521747       -2.631317   
2         -0.033146        0.403049       -5.521747       -2.631317   
3         -0.033146        0.403049       -5.521747       -2.631317   
4         -0.033146        0.403049       -5.521747       -2.631317   
..              ...             ...             ...             ...   
295       -7.118382       -7.079659        0.265425        3.641463   
296       -7.118382       -7.079659        0.265425        3.641463   
297       -7.118382       -7.079659        0.265425        3.641463   
298       -7.118382       -7.079659        0.265425        3.641463   
299       -7.118382       -7.079659        0.265425        3.641463   

     L_mordred_PC_4  L_mordred_PC_5  L_mordred_PC_6  
0     

In [7]:
# PCA, Substrate
X_s = np.array(df_mordred_s_2)
X_sc_s = sc.fit(X_s).transform(X_s)
pca_s = PCA(n_components=4)
X_pca_s = pca_s.fit(X_sc_s).transform(X_sc_s)
print("Before: {}".format(str(X_sc_s.shape)))
print("After: {}".format(str(X_pca_s.shape)))
print('sum of explained variance ratio: {0}'.format(sum(pca_s.explained_variance_ratio_)))
df_s = pd.DataFrame(X_pca_s)
df_s = df_s.add_prefix('S_mordred_PC_')
print(df_s)

Before: (300, 1040)
After: (300, 4)
sum of explained variance ratio: 0.8529308007528521
     S_mordred_PC_0  S_mordred_PC_1  S_mordred_PC_2  S_mordred_PC_3
0         -2.952700       -1.696122        0.522472      -12.491654
1        -17.955510        3.384030       -2.353062      -10.414886
2        -30.739914       12.165785      -13.099883       15.490445
3        -13.293242       16.738358      -10.608399       -6.540067
4         27.902825      -15.637521      -11.516570        2.542203
..              ...             ...             ...             ...
295       30.803235       37.104850       14.031181        3.215413
296       -7.659586       -8.849565       11.774728       -2.885880
297       14.860635      -12.871062       -6.218081        3.085787
298       13.963437      -13.284601       -8.718720        1.620467
299      -14.929180      -17.054152       26.186334        6.378173

[300 rows x 4 columns]


In [8]:
mordred_pca = pd.concat([Ligand_name_No, df_l, Substrate_name_No, df_s, Yield], axis=1, join='inner')
print(mordred_pca)
mordred_pca.to_csv('../data/mordred_pca.csv', index = False)

    Ligand_name Ligand_No  L_mordred_PC_0  L_mordred_PC_1  L_mordred_PC_2  \
0          dppm        L1       -0.033146        0.403049       -5.521747   
1          dppm        L1       -0.033146        0.403049       -5.521747   
2          dppm        L1       -0.033146        0.403049       -5.521747   
3          dppm        L1       -0.033146        0.403049       -5.521747   
4          dppm        L1       -0.033146        0.403049       -5.521747   
..          ...       ...             ...             ...             ...   
295       sphos       L30       -7.118382       -7.079659        0.265425   
296       sphos       L30       -7.118382       -7.079659        0.265425   
297       sphos       L30       -7.118382       -7.079659        0.265425   
298       sphos       L30       -7.118382       -7.079659        0.265425   
299       sphos       L30       -7.118382       -7.079659        0.265425   

     L_mordred_PC_3  L_mordred_PC_4  L_mordred_PC_5  L_mordred_PC_6  \
0   