In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose values are all zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding, in their original order, every column that has
        at least one entry not equal to 0.
    """
    # One vectorised pass: True for each column made up solely of zeros.
    zero_only = (data == 0).all(axis=0).to_numpy(dtype=bool)
    # Select by position (plain boolean array) rather than by label so frames
    # with duplicate column names are handled instead of raising.
    return data.loc[:, ~zero_only].copy()

In [3]:
# Load the reaction table, then keep the target and the two name columns as
# single-column frames so they can be concatenated with the fingerprints later.
data = pd.read_csv("source/SMILES_yield.csv")
Yield = data[['Yield']]
Cate_name = data[['Cate_name']]
P_name = data[['P_name']]

# Quick visual check of what was loaded.
print(Yield)
print(Cate_name.head())
print(P_name.head())

    Yield
0      80
1       7
2       3
3      48
4      99
..    ...
63     51
64     44
65      5
66      3
67     57

[68 rows x 1 columns]
      Cate_name
0      Catechol
1      Catechol
2      Catechol
3      Catechol
4  4Me-Catechol
                  P_name
0     Dimethyl Phosphite
1        Phosphonic acid
2  Phenylphosphinic Acid
3        Phosphinic acid
4     Dimethyl Phosphite


In [4]:
# MF, Cate
# Morgan fingerprint (radius 2, 2048 bits) for every entry of data["Cate_SMILES"].
# Iterating over the column itself (instead of a hard-coded row count) keeps the
# cell valid when the CSV gains or loses rows.
# NOTE(review): recent RDKit releases reportedly deprecate
# GetMorganFingerprintAsBitVect in favour of rdFingerprintGenerator - confirm
# before upgrading; the call is kept here so the generated bits do not change.
mf_c = []
for row_label, smiles_c in data["Cate_SMILES"].items():
    mol_c = Chem.MolFromSmiles(smiles_c)
    if mol_c is None:
        # MolFromSmiles signals a parse failure by returning None; fail with the
        # offending row instead of an opaque error from the fingerprint call.
        raise ValueError(f"Row {row_label}: cannot parse Cate_SMILES {smiles_c!r}")
    fp_c = AllChem.GetMorganFingerprintAsBitVect(mol_c, 2, nBits=2048)
    mf_c.append(np.array(fp_c, int))

# Reuse the source index so the later column-wise concat aligns row by row.
df_c = pd.DataFrame(mf_c, index=data.index)
df_c = df_c.add_prefix('Cate_MF_')
print(df_c.shape)
# Bits that are never set carry no information for this data set.
df_2_c = remove_all_zero_col(df_c)
print(df_2_c.shape)
print(df_2_c)

(68, 2048)
(68, 119)
    Cate_MF_9  Cate_MF_33  Cate_MF_58  Cate_MF_68  Cate_MF_80  Cate_MF_102  \
0           0           0           0           0           0            0   
1           0           0           0           0           0            0   
2           0           0           0           0           0            0   
3           0           0           0           0           0            0   
4           0           0           0           0           0            0   
..        ...         ...         ...         ...         ...          ...   
63          0           0           0           0           0            0   
64          0           0           0           0           0            0   
65          0           0           0           0           0            0   
66          0           0           0           0           0            0   
67          0           0           0           0           0            0   

    Cate_MF_105  Cate_MF_114  Cate_MF_125 

In [5]:
# MF, P
# Morgan fingerprint (radius 2, 2048 bits) for every entry of data["P_SMILES"].
# Iterating over the column itself (instead of a hard-coded row count) keeps the
# cell valid when the CSV gains or loses rows.
# NOTE(review): recent RDKit releases reportedly deprecate
# GetMorganFingerprintAsBitVect in favour of rdFingerprintGenerator - confirm
# before upgrading; the call is kept here so the generated bits do not change.
mf_p = []
for row_label, smiles_p in data["P_SMILES"].items():
    mol_p = Chem.MolFromSmiles(smiles_p)
    if mol_p is None:
        # MolFromSmiles signals a parse failure by returning None; fail with the
        # offending row instead of an opaque error from the fingerprint call.
        raise ValueError(f"Row {row_label}: cannot parse P_SMILES {smiles_p!r}")
    fp_p = AllChem.GetMorganFingerprintAsBitVect(mol_p, 2, nBits=2048)
    mf_p.append(np.array(fp_p, int))

# Reuse the source index so the later column-wise concat aligns row by row.
df_p = pd.DataFrame(mf_p, index=data.index)
df_p = df_p.add_prefix('P_MF_')
print(df_p.shape)
# Bits that are never set carry no information for this data set.
df_2_p = remove_all_zero_col(df_p)
print(df_2_p.shape)
print(df_2_p)

(68, 2048)
(68, 28)
    P_MF_49  P_MF_179  P_MF_187  P_MF_292  P_MF_389  P_MF_418  P_MF_433  \
0         1         0         1         0         0         0         0   
1         1         1         0         0         0         0         0   
2         1         1         0         1         1         0         0   
3         0         0         0         0         0         1         1   
4         1         0         1         0         0         0         0   
..      ...       ...       ...       ...       ...       ...       ...   
63        0         0         0         0         0         1         1   
64        1         0         1         0         0         0         0   
65        1         1         0         0         0         0         0   
66        1         1         0         1         1         0         0   
67        0         0         0         0         0         1         1   

    P_MF_516  P_MF_555  P_MF_623  ...  P_MF_1154  P_MF_1199  P_MF_1272  \
0    

In [6]:
# Assemble the final table column-wise: catechol name and fingerprint bits,
# phosphorus-reagent name and fingerprint bits, then the yield as last column.
# join='inner' keeps only the row labels present in every piece.
MF = pd.concat(
    [
        Cate_name,
        df_2_c,
        P_name,
        df_2_p,
        Yield,
    ],
    axis=1,
    join='inner',
)
print(MF)
# Written without the index column.
MF.to_csv('../Regression/MF/MF.csv', index=False)

        Cate_name  Cate_MF_9  Cate_MF_33  Cate_MF_58  Cate_MF_68  Cate_MF_80  \
0        Catechol          0           0           0           0           0   
1        Catechol          0           0           0           0           0   
2        Catechol          0           0           0           0           0   
3        Catechol          0           0           0           0           0   
4    4Me-Catechol          0           0           0           0           0   
..            ...        ...         ...         ...         ...         ...   
63    3F-Catechol          0           0           0           0           0   
64  3OMe-Catechol          0           0           0           0           0   
65  3OMe-Catechol          0           0           0           0           0   
66  3OMe-Catechol          0           0           0           0           0   
67  3OMe-Catechol          0           0           0           0           0   

    Cate_MF_102  Cate_MF_105  Cate_MF_1