In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
def remove_all_zero_col(data):
    data = data.copy()
    for col in data.columns:
        if (data[col] == 0).all():
            data.drop(col, axis=1, inplace=True)
    return data

In [3]:
data = pd.read_csv("source/SMILES_yield.csv")
Yield = pd.DataFrame(data['Yield'], columns=['Yield'])
Cate_name = pd.DataFrame(data['Cate_name'], columns=['Cate_name'])
P_name = pd.DataFrame(data['P_name'], columns=['P_name'])
df_Cate = pd.DataFrame(data['Cate_SMILES'], columns=['Cate_SMILES'])
df_P = pd.DataFrame(data['P_SMILES'], columns=['P_SMILES'])

print(Yield.head())
print(Cate_name.head())
print(P_name.head())
print(df_Cate.head())
print(df_P.head())

   Yield
0     80
1      7
2      3
3     48
4     99
      Cate_name
0      Catechol
1      Catechol
2      Catechol
3      Catechol
4  4Me-Catechol
                  P_name
0     Dimethyl Phosphite
1        Phosphonic acid
2  Phenylphosphinic Acid
3        Phosphinic acid
4     Dimethyl Phosphite
        Cate_SMILES
0     OC1=CC=CC=C1O
1     OC1=CC=CC=C1O
2     OC1=CC=CC=C1O
3     OC1=CC=CC=C1O
4  OC1=CC=C(C)C=C1O
                 P_SMILES
0          O=P([H])(OC)OC
1            [H]P(O)(O)=O
2  [H]P(O)(C1=CC=CC=C1)=O
3          [H]P([H])(O)=O
4          O=P([H])(OC)OC


In [4]:
# RDKit, Cate
PandasTools.AddMoleculeColumnToFrame(df_Cate,'Cate_SMILES')
for i,j in Descriptors.descList:
    df_Cate[i] = df_Cate.ROMol.map(j)
df_Cate = df_Cate.add_prefix('Cate_')

df_c = df_Cate.drop(columns=['Cate_Cate_SMILES', 'Cate_ROMol'])
print(df_c.shape)
df_2_c = remove_all_zero_col(df_c)
df_2_c = df_2_c.dropna(how='any', axis=1)
print(df_2_c.shape)
print(df_2_c)

(68, 210)
(68, 130)
    Cate_MaxAbsEStateIndex  Cate_MaxEStateIndex  Cate_MinAbsEStateIndex  \
0                 8.669259             8.669259                0.076389   
1                 8.669259             8.669259                0.076389   
2                 8.669259             8.669259                0.076389   
3                 8.669259             8.669259                0.076389   
4                 8.850093             8.850093                0.060185   
..                     ...                  ...                     ...   
63               12.169537            12.169537                0.435185   
64                9.022546             9.022546                0.167269   
65                9.022546             9.022546                0.167269   
66                9.022546             9.022546                0.167269   
67                9.022546             9.022546                0.167269   

    Cate_MinEStateIndex  Cate_qed  Cate_SPS  Cate_MolWt  Cate_HeavyAtomMolWt  \

  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol.map(j)
  df_Cate[i] = df_Cate.ROMol

In [5]:
# RDKit, P
PandasTools.AddMoleculeColumnToFrame(df_P,'P_SMILES')
for i,j in Descriptors.descList:
    df_P[i] = df_P.ROMol.map(j)
df_P = df_P.add_prefix('P_')

df_p = df_P.drop(columns=['P_P_SMILES', 'P_ROMol'])
print(df_p.shape)
df_2_p = remove_all_zero_col(df_p)
print(df_2_p.shape)
print(df_2_p)

(68, 210)
(68, 91)
    P_MaxAbsEStateIndex  P_MaxEStateIndex  P_MinAbsEStateIndex  \
0              9.921296          9.921296             1.338092   
1              8.740741          8.740741             3.129630   
2             10.386111         10.386111             0.516204   
3              8.569444          8.569444             1.500000   
4              9.921296          9.921296             1.338092   
..                  ...               ...                  ...   
63             8.569444          8.569444             1.500000   
64             9.921296          9.921296             1.338092   
65             8.740741          8.740741             3.129630   
66            10.386111         10.386111             0.516204   
67             8.569444          8.569444             1.500000   

    P_MinEStateIndex     P_qed      P_SPS  P_MolWt  P_HeavyAtomMolWt  \
0          -2.118313  0.489807   9.833333  110.049           102.993   
1          -3.129630  0.381969   8.750000   

  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] = df_P.ROMol.map(j)
  df_P[i] 

In [6]:
# PCA, Cate
X_c = np.array(df_2_c)
sc = StandardScaler()
X_sc_c = sc.fit(X_c).transform(X_c)
pca_c = PCA(n_components=7)
X_pca_c = pca_c.fit(X_sc_c).transform(X_sc_c)
print("Before: {}".format(str(X_sc_c.shape)))
print("After: {}".format(str(X_pca_c.shape)))
print('sum of explained variance ratio: {0}'.format(sum(pca_c.explained_variance_ratio_)))
df_c = pd.DataFrame(X_pca_c)
df_c = df_c.add_prefix('Cate_RDKit_PC_')
print(df_c)

Before: (68, 130)
After: (68, 7)
sum of explained variance ratio: 0.8118397913669639
    Cate_RDKit_PC_0  Cate_RDKit_PC_1  Cate_RDKit_PC_2  Cate_RDKit_PC_3  \
0         -5.830571        -4.385613         0.824930         0.507248   
1         -5.830571        -4.385613         0.824930         0.507248   
2         -5.830571        -4.385613         0.824930         0.507248   
3         -5.830571        -4.385613         0.824930         0.507248   
4         -3.100537        -3.269828        -0.528850        -1.363916   
..              ...              ...              ...              ...   
63        -5.355253         1.564946         4.710130        -3.768274   
64        -2.242624         1.843666        -4.840806         0.110269   
65        -2.242624         1.843666        -4.840806         0.110269   
66        -2.242624         1.843666        -4.840806         0.110269   
67        -2.242624         1.843666        -4.840806         0.110269   

    Cate_RDKit_PC_4  Cate_

In [7]:
# PCA, P
X_p = np.array(df_2_p)
X_sc_p = sc.fit(X_p).transform(X_p)
pca_p = PCA(n_components=2)
X_pca_p = pca_p.fit(X_sc_p).transform(X_sc_p)
print("Before: {}".format(str(X_sc_p.shape)))
print("After: {}".format(str(X_pca_p.shape)))
print('sum of explained variance ratio: {0}'.format(sum(pca_p.explained_variance_ratio_)))
df_p = pd.DataFrame(X_pca_p)
df_p = df_p.add_prefix('P_RDKit_PC_')
print(df_p)

Before: (68, 91)
After: (68, 2)
sum of explained variance ratio: 0.8552137535818984
    P_RDKit_PC_0  P_RDKit_PC_1
0       1.406982      7.906757
1      -5.494109      0.842109
2      11.128658     -3.748087
3      -7.041532     -5.000780
4       1.406982      7.906757
..           ...           ...
63     -7.041532     -5.000780
64      1.406982      7.906757
65     -5.494109      0.842109
66     11.128658     -3.748087
67     -7.041532     -5.000780

[68 rows x 2 columns]


In [8]:
RDKit = pd.concat([Cate_name, df_2_c, P_name, df_2_p, Yield], axis=1, join='inner')
print(RDKit)
RDKit.to_csv('../Regression/RDKit/RDKit.csv', index = False)

        Cate_name  Cate_MaxAbsEStateIndex  Cate_MaxEStateIndex  \
0        Catechol                8.669259             8.669259   
1        Catechol                8.669259             8.669259   
2        Catechol                8.669259             8.669259   
3        Catechol                8.669259             8.669259   
4    4Me-Catechol                8.850093             8.850093   
..            ...                     ...                  ...   
63    3F-Catechol               12.169537            12.169537   
64  3OMe-Catechol                9.022546             9.022546   
65  3OMe-Catechol                9.022546             9.022546   
66  3OMe-Catechol                9.022546             9.022546   
67  3OMe-Catechol                9.022546             9.022546   

    Cate_MinAbsEStateIndex  Cate_MinEStateIndex  Cate_qed  Cate_SPS  \
0                 0.076389            -0.076389  0.490728  9.000000   
1                 0.076389            -0.076389  0.490728  9.0000

In [9]:
RDKit_pca = pd.concat([Cate_name, df_c, P_name, df_p, Yield], axis=1, join='inner')
print(RDKit_pca)
RDKit_pca.to_csv('../Regression/RDKit/RDKit_pca.csv', index = False)

        Cate_name  Cate_RDKit_PC_0  Cate_RDKit_PC_1  Cate_RDKit_PC_2  \
0        Catechol        -5.830571        -4.385613         0.824930   
1        Catechol        -5.830571        -4.385613         0.824930   
2        Catechol        -5.830571        -4.385613         0.824930   
3        Catechol        -5.830571        -4.385613         0.824930   
4    4Me-Catechol        -3.100537        -3.269828        -0.528850   
..            ...              ...              ...              ...   
63    3F-Catechol        -5.355253         1.564946         4.710130   
64  3OMe-Catechol        -2.242624         1.843666        -4.840806   
65  3OMe-Catechol        -2.242624         1.843666        -4.840806   
66  3OMe-Catechol        -2.242624         1.843666        -4.840806   
67  3OMe-Catechol        -2.242624         1.843666        -4.840806   

    Cate_RDKit_PC_3  Cate_RDKit_PC_4  Cate_RDKit_PC_5  Cate_RDKit_PC_6  \
0          0.507248        -0.510793        -1.092656        