In [1]:
#import
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn import metrics

In [2]:
#Reading dataset
df=pd.read_csv('RDKit.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Columns: 224 entries, Cate_name to Yield
dtypes: float64(177), int64(45), object(2)
memory usage: 119.1+ KB


In [3]:
#Building descriptors
X = df.drop(columns=['Yield', 'Cate_name', 'P_name' ])
print('---Descriptors---')
print(X.head())

y = pd.DataFrame(df['Yield'],columns=['Yield'])
print('---Objective---')
print(y.head())

---Descriptors---
   Cate_MaxAbsEStateIndex  Cate_MaxEStateIndex  Cate_MinAbsEStateIndex  \
0                8.669259             8.669259                0.076389   
1                8.669259             8.669259                0.076389   
2                8.669259             8.669259                0.076389   
3                8.669259             8.669259                0.076389   
4                8.850093             8.850093                0.060185   

   Cate_MinEStateIndex  Cate_qed  Cate_SPS  Cate_MolWt  Cate_HeavyAtomMolWt  \
0            -0.076389  0.490728  9.000000     110.112              104.064   
1            -0.076389  0.490728  9.000000     110.112              104.064   
2            -0.076389  0.490728  9.000000     110.112              104.064   
3            -0.076389  0.490728  9.000000     110.112              104.064   
4            -0.068889  0.513122  9.444444     124.139              116.075   

   Cate_ExactMolWt  Cate_NumValenceElectrons  ...  P_NumAromat

In [4]:
#storage 
data_r2_train =[]
data_RMSE_train = []
data_MAE_train = []
data_r2_test = []
data_RMSE_test = []
data_MAE_test = []
best_model_parametors = []
data_cv_score = []

#MLP
for i in range(10):
    seed=i
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
    
    #autoscaling
    a_X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_test = (X_test - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_train = a_X_train.dropna(how='any', axis=1)
    a_X_test = a_X_test[a_X_train.columns]
    a_y_train = y_train/10
    
    param_mlp={'hidden_layer_sizes':[(128,),(256,),(512,)], 'alpha':[1,2,3,4,5]}
    reg_mlp = GridSearchCV(MLPRegressor(random_state=0, max_iter=1000, learning_rate_init=0.03),
                           param_grid=param_mlp, cv=5, n_jobs=16)
    
    reg_mlp = GridSearchCV(MLPRegressor(random_state=0, max_iter=1000), param_grid=param_mlp, cv=5, n_jobs=16)
    reg_mlp.fit(a_X_train,a_y_train['Yield'])
    reg_best = reg_mlp.best_estimator_
    y_pred1 = reg_best.predict(a_X_train)*10
    y_pred2 = reg_best.predict(a_X_test)*10
            
    #train
    r2_train = metrics.r2_score(y_train, y_pred1)
    RMSE_train = metrics.mean_squared_error(y_train, y_pred1)
    MAE_train =  metrics.mean_absolute_error(y_train, y_pred1)
    
    #test
    r2_test = metrics.r2_score(y_test, y_pred2)
    RMSE_test = metrics.mean_squared_error(y_test, y_pred2)
    MAE_test = metrics.mean_absolute_error(y_test, y_pred2)
    
    #strage data of accuracy 
    parametors = reg_mlp.best_params_
    best_model_parametors.append(parametors)

    data_r2_train.append(r2_train)
    data_RMSE_train.append(RMSE_train)
    data_MAE_train.append(MAE_train)
    data_r2_test.append(r2_test)
    data_RMSE_test.append(RMSE_test)
    data_MAE_test.append(MAE_test)
    data_cv_score.append(reg_mlp.best_score_)
    
    print('----------------------')
    print('seed:', seed)
    print("Best Model Parameter:",reg_mlp.best_params_)
    print("Best Model Score:",reg_mlp.best_score_)
    print('R2_test:', r2_test)
print('R2_train_means:', sum(data_r2_train)/10)
print('CV_score_means:', sum(data_cv_score)/10)
print('R2_test_means:', sum(data_r2_test)/10)

----------------------
seed: 0
Best Model Parameter: {'alpha': 2, 'hidden_layer_sizes': (512,)}
Best Model Score: 0.7095534647149335
R2_test: 0.803648847108649
----------------------
seed: 1
Best Model Parameter: {'alpha': 4, 'hidden_layer_sizes': (512,)}
Best Model Score: 0.7948716592360008
R2_test: 0.8555213869500844
----------------------
seed: 2
Best Model Parameter: {'alpha': 1, 'hidden_layer_sizes': (128,)}
Best Model Score: 0.7059326545556419
R2_test: 0.8800508379244681
----------------------
seed: 3
Best Model Parameter: {'alpha': 3, 'hidden_layer_sizes': (128,)}
Best Model Score: 0.6094547929579092
R2_test: 0.873114260702831
----------------------
seed: 4
Best Model Parameter: {'alpha': 5, 'hidden_layer_sizes': (512,)}
Best Model Score: 0.8255679966590833
R2_test: 0.8423728777833199
----------------------
seed: 5
Best Model Parameter: {'alpha': 5, 'hidden_layer_sizes': (512,)}
Best Model Score: 0.604140772713259
R2_test: 0.907516740265156
----------------------
seed: 6
Best Mo

In [5]:
data_r2_train_pd = pd.DataFrame(data= data_r2_train, columns=['r2_train'])
data_RMSE_train_pd = pd.DataFrame(data=data_RMSE_train, columns=['RMSE_train'])
data_MAE_train_pd = pd.DataFrame(data=data_MAE_train, columns=['MAE_train'])
data_r2_test_pd = pd.DataFrame(data=data_r2_test, columns=['r2_test'])
data_RMSE_test_pd = pd.DataFrame(data=data_RMSE_test, columns=['RMSE_test'])
data_MAE_test_pd = pd.DataFrame(data=data_MAE_test, columns=['MAE_test'])
data_cv_score_pd = pd.DataFrame(data=data_cv_score, columns=['cv_score'])
data_parametors = pd.DataFrame([best_model_parametors])

data_all = pd.concat([data_r2_train_pd, data_RMSE_train_pd, data_MAE_train_pd, data_r2_test_pd, 
                      data_RMSE_test_pd, data_MAE_test_pd, data_cv_score_pd],
                     axis=1, join='inner')

data_all.loc['mean'] = data_all.mean()
print(data_all)

data_all.to_csv('../../score/RDKit/MLP_score.csv')

      r2_train  RMSE_train  MAE_train   r2_test   RMSE_test   MAE_test  \
0     0.999628    0.274619   0.334818  0.803649  256.817513  11.016055   
1     0.998825    1.062382   0.621166  0.855521  139.977826   8.058843   
2     0.999890    0.101532   0.183557  0.880051  115.998184   7.588457   
3     0.999200    0.746476   0.583896  0.873114  121.778085   8.761262   
4     0.998042    1.688181   0.826486  0.842373  167.608744   9.685449   
5     0.997301    2.185340   0.778448  0.907517  112.478518   6.894011   
6     0.999163    0.687122   0.452384  0.808018  231.042757  10.512624   
7     0.999371    0.601264   0.452399  0.773369  205.442636   9.535373   
8     0.997668    2.052464   0.824259  0.783805  213.651268  11.060642   
9     0.998517    1.502714   0.866012  0.898578   71.467768   6.361796   
mean  0.998760    1.090209   0.592342  0.842600  163.626330   8.947451   

      cv_score  
0     0.709553  
1     0.794872  
2     0.705933  
3     0.609455  
4     0.825568  
5     0.6

In [6]:
#Reading dataset
df=pd.read_csv('RDKit_pca.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Cate_name        68 non-null     object 
 1   Cate_RDKit_PC_0  68 non-null     float64
 2   Cate_RDKit_PC_1  68 non-null     float64
 3   Cate_RDKit_PC_2  68 non-null     float64
 4   Cate_RDKit_PC_3  68 non-null     float64
 5   Cate_RDKit_PC_4  68 non-null     float64
 6   Cate_RDKit_PC_5  68 non-null     float64
 7   Cate_RDKit_PC_6  68 non-null     float64
 8   P_name           68 non-null     object 
 9   P_RDKit_PC_0     68 non-null     float64
 10  P_RDKit_PC_1     68 non-null     float64
 11  Yield            68 non-null     int64  
dtypes: float64(9), int64(1), object(2)
memory usage: 6.5+ KB


In [7]:
X = df.drop(columns=['Yield', 'Cate_name', 'P_name' ])
print('---Descriptors---')
print(X.head())

y = pd.DataFrame(df['Yield'],columns=['Yield'])
print('---Objective---')
print(y.head())

---Descriptors---
   Cate_RDKit_PC_0  Cate_RDKit_PC_1  Cate_RDKit_PC_2  Cate_RDKit_PC_3  \
0        -5.830571        -4.385613          0.82493         0.507248   
1        -5.830571        -4.385613          0.82493         0.507248   
2        -5.830571        -4.385613          0.82493         0.507248   
3        -5.830571        -4.385613          0.82493         0.507248   
4        -3.100537        -3.269828         -0.52885        -1.363916   

   Cate_RDKit_PC_4  Cate_RDKit_PC_5  Cate_RDKit_PC_6  P_RDKit_PC_0  \
0        -0.510793        -1.092656        -0.605579      1.406982   
1        -0.510793        -1.092656        -0.605579     -5.494109   
2        -0.510793        -1.092656        -0.605579     11.128658   
3        -0.510793        -1.092656        -0.605579     -7.041532   
4         1.881023        -0.952011        -1.287595      1.406982   

   P_RDKit_PC_1  
0      7.906757  
1      0.842109  
2     -3.748087  
3     -5.000780  
4      7.906757  
---Objective--

In [8]:
#storage 
data_r2_train =[]
data_RMSE_train = []
data_MAE_train = []
data_r2_test = []
data_RMSE_test = []
data_MAE_test = []
best_model_parametors = []
data_cv_score = []

#MLP
for i in range(10):
    seed=i
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
    
    #autoscaling
    a_X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_test = (X_test - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_y_train = y_train/10
    
    param_mlp={'hidden_layer_sizes':[(128,),(256,),(512,)], 'alpha':[1,2,3,4,5]}
    reg_mlp = GridSearchCV(MLPRegressor(random_state=0, max_iter=1000, learning_rate_init=0.03),
                           param_grid=param_mlp, cv=5, n_jobs=16)
    
    reg_mlp = GridSearchCV(MLPRegressor(random_state=0, max_iter=1000), param_grid=param_mlp, cv=5, n_jobs=16)
    reg_mlp.fit(a_X_train,a_y_train['Yield'])
    reg_best = reg_mlp.best_estimator_
    y_pred1 = reg_best.predict(a_X_train)*10
    y_pred2 = reg_best.predict(a_X_test)*10
            
    #train
    r2_train = metrics.r2_score(y_train, y_pred1)
    RMSE_train = metrics.mean_squared_error(y_train, y_pred1)
    MAE_train =  metrics.mean_absolute_error(y_train, y_pred1)
    
    #test
    r2_test = metrics.r2_score(y_test, y_pred2)
    RMSE_test = metrics.mean_squared_error(y_test, y_pred2)
    MAE_test = metrics.mean_absolute_error(y_test, y_pred2)
    
    #strage data of accuracy 
    parametors = reg_mlp.best_params_
    best_model_parametors.append(parametors)

    data_r2_train.append(r2_train)
    data_RMSE_train.append(RMSE_train)
    data_MAE_train.append(MAE_train)
    data_r2_test.append(r2_test)
    data_RMSE_test.append(RMSE_test)
    data_MAE_test.append(MAE_test)
    data_cv_score.append(reg_mlp.best_score_)
    
    print('----------------------')
    print('seed:', seed)
    print("Best Model Parameter:",reg_mlp.best_params_)
    print("Best Model Score:",reg_mlp.best_score_)
    print('R2_test:', r2_test)
print('R2_train_means:', sum(data_r2_train)/10)
print('CV_score_means:', sum(data_cv_score)/10)
print('R2_test_means:', sum(data_r2_test)/10)

----------------------
seed: 0
Best Model Parameter: {'alpha': 5, 'hidden_layer_sizes': (128,)}
Best Model Score: 0.684263814019137
R2_test: 0.7502273547628397
----------------------
seed: 1
Best Model Parameter: {'alpha': 3, 'hidden_layer_sizes': (512,)}
Best Model Score: 0.748524664660753
R2_test: 0.2629073635753312
----------------------
seed: 2
Best Model Parameter: {'alpha': 5, 'hidden_layer_sizes': (512,)}
Best Model Score: 0.6561074865165286
R2_test: 0.7317320300215711
----------------------
seed: 3
Best Model Parameter: {'alpha': 5, 'hidden_layer_sizes': (512,)}
Best Model Score: 0.4571093213500054
R2_test: 0.7786821778277733
----------------------
seed: 4
Best Model Parameter: {'alpha': 3, 'hidden_layer_sizes': (256,)}
Best Model Score: 0.6910260465810082
R2_test: 0.86062129492052
----------------------
seed: 5
Best Model Parameter: {'alpha': 5, 'hidden_layer_sizes': (256,)}
Best Model Score: 0.6123414724512283
R2_test: 0.8080876203452384
----------------------
seed: 6
Best Mo

In [9]:
data_r2_train_pd = pd.DataFrame(data= data_r2_train, columns=['r2_train'])
data_RMSE_train_pd = pd.DataFrame(data=data_RMSE_train, columns=['RMSE_train'])
data_MAE_train_pd = pd.DataFrame(data=data_MAE_train, columns=['MAE_train'])
data_r2_test_pd = pd.DataFrame(data=data_r2_test, columns=['r2_test'])
data_RMSE_test_pd = pd.DataFrame(data=data_RMSE_test, columns=['RMSE_test'])
data_MAE_test_pd = pd.DataFrame(data=data_MAE_test, columns=['MAE_test'])
data_cv_score_pd = pd.DataFrame(data=data_cv_score, columns=['cv_score'])
data_parametors = pd.DataFrame([best_model_parametors])

data_all = pd.concat([data_r2_train_pd, data_RMSE_train_pd, data_MAE_train_pd, data_r2_test_pd, 
                      data_RMSE_test_pd, data_MAE_test_pd, data_cv_score_pd],
                     axis=1, join='inner')

data_all.loc['mean'] = data_all.mean()
print(data_all)

data_all.to_csv('../../score/RDKit/MLP_pca_score.csv')

      r2_train  RMSE_train  MAE_train   r2_test   RMSE_test   MAE_test  \
0     0.937208   46.294050   4.736119  0.750227  326.690160  12.160181   
1     0.970641   26.549423   3.345318  0.262907  714.130780  17.998595   
2     0.920040   73.970967   5.164381  0.731732  259.431552  12.782907   
3     0.934364   61.219543   5.340950  0.778682  212.408902  10.827879   
4     0.951146   42.116738   4.014535  0.860621  148.204759   7.986528   
5     0.896636   83.697915   5.588098  0.808088  233.404619   9.844954   
6     0.928653   58.552543   4.433591  0.837441  195.633229   9.583226   
7     0.946019   51.632856   4.738134  0.735546  239.729969  11.968462   
8     0.930204   61.428673   4.905814  0.801270  196.392372   9.929878   
9     0.930344   70.592073   5.906602  0.660806  239.015846  10.700600   
mean  0.934526   57.605478   4.817354  0.722732  276.504219  11.378321   

      cv_score  
0     0.684264  
1     0.748525  
2     0.656107  
3     0.457109  
4     0.691026  
5     0.6