In [1]:
#import
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn import metrics
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Reading dataset
df=pd.read_csv('DFT.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Cate_name          68 non-null     object 
 1   Cate_HOMO          68 non-null     float64
 2   Cate_LUMO          68 non-null     float64
 3   Cate_ESP_O1        68 non-null     float64
 4   Cate_ESP_H1        68 non-null     float64
 5   Cate_ESP_O2        68 non-null     float64
 6   Cate_ESP_H2        68 non-null     float64
 7   Cate_BL_ArO1       68 non-null     float64
 8   Cate_BL_O1H1       68 non-null     float64
 9   Cate_BL_ArO2       68 non-null     float64
 10  Cate_BL_O2H2       68 non-null     float64
 11  Cate_total_dipole  68 non-null     float64
 12  P_name             68 non-null     object 
 13  P_HOMO             68 non-null     float64
 14  P_LUMO             68 non-null     float64
 15  P_ESP_P            68 non-null     float64
 16  P_ESP_=O           68 non-nu

In [3]:
#Building descriptors
X = df.drop(columns=['Yield', 'Cate_name', 'P_name', 'P_total_dipole', 'Cate_total_dipole', 'Cate_BL_O1H1', 'Cate_BL_O2H2'])
print('---Descriptors---')
print(X.head())

y = pd.DataFrame(df['Yield'],columns=['Yield'])
print('---Objective---')
print(y.head())

---Descriptors---
   Cate_HOMO  Cate_LUMO  Cate_ESP_O1  Cate_ESP_H1  Cate_ESP_O2  Cate_ESP_H2  \
0      -5.63       0.21    -0.555170     0.441278    -0.594799     0.444770   
1      -5.63       0.21    -0.555170     0.441278    -0.594799     0.444770   
2      -5.63       0.21    -0.555170     0.441278    -0.594799     0.444770   
3      -5.63       0.21    -0.555170     0.441278    -0.594799     0.444770   
4      -5.51       0.22    -0.542028     0.433177    -0.582447     0.440296   

   Cate_BL_ArO1  Cate_BL_ArO2  P_HOMO  P_LUMO   P_ESP_P  P_ESP_=O   P_ESP_H  \
0         1.363         1.378   -7.85    1.77  0.997297 -0.600515 -0.035840   
1         1.363         1.378   -8.16    1.03  0.934101 -0.617890  0.007945   
2         1.363         1.378   -7.22   -0.99  0.742856 -0.578490  0.009025   
3         1.363         1.378   -7.85    0.75  0.788747 -0.578264 -0.038475   
4         1.364         1.379   -7.85    1.77  0.997297 -0.600515 -0.035840   

   P_BL_P=O  P_BL_PH  
0     1.4

In [4]:
#storage 
data_r2_train =[]
data_RMSE_train = []
data_MAE_train = []
data_r2_test = []
data_RMSE_test = []
data_MAE_test = []
best_model_parametors = []
data_cv_score = []

#MLP
for i in range(10):
    seed=i
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
    
    #autoscaling
    a_X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_test = (X_test - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_y_train = y_train/10
    
    param_mlp={'hidden_layer_sizes':[(64,),(128,),(256,),(512,)], 'alpha':[0,2,4]}
    reg_mlp = GridSearchCV(MLPRegressor(random_state=0, max_iter=1000, learning_rate_init=0.03),
                           param_grid=param_mlp, cv=5, n_jobs=16)
    reg_mlp.fit(a_X_train,a_y_train['Yield'])
    reg_best = reg_mlp.best_estimator_
    y_pred1 = reg_best.predict(a_X_train)*10
    y_pred2 = reg_best.predict(a_X_test)*10
            
    #train
    r2_train = metrics.r2_score(y_train, y_pred1)
    RMSE_train = metrics.root_mean_squared_error(y_train, y_pred1)
    MAE_train =  metrics.mean_absolute_error(y_train, y_pred1)
    
    #test
    r2_test = metrics.r2_score(y_test, y_pred2)
    RMSE_test = metrics.root_mean_squared_error(y_test, y_pred2)
    MAE_test = metrics.mean_absolute_error(y_test, y_pred2)
    
    #strage data of accuracy 
    parametors = reg_mlp.best_params_
    best_model_parametors.append(parametors)

    data_r2_train.append(r2_train)
    data_RMSE_train.append(RMSE_train)
    data_MAE_train.append(MAE_train)
    data_r2_test.append(r2_test)
    data_RMSE_test.append(RMSE_test)
    data_MAE_test.append(MAE_test)
    data_cv_score.append(reg_mlp.best_score_)
    
    print('----------------------')
    print('seed:', seed)
    print("Best Model Parameter:",reg_mlp.best_params_)
    print("Best Model Score:",reg_mlp.best_score_)
    print('R2_test:', r2_test)
print('R2_train_means:', sum(data_r2_train)/10)
print('CV_score_means:', sum(data_cv_score)/10)
print('R2_test_means:', sum(data_r2_test)/10)

----------------------
seed: 0
Best Model Parameter: {'alpha': 4, 'hidden_layer_sizes': (256,)}
Best Model Score: 0.7493539111332128
R2_test: 0.7367031585508654
----------------------
seed: 1
Best Model Parameter: {'alpha': 2, 'hidden_layer_sizes': (256,)}
Best Model Score: 0.7046852406862512
R2_test: 0.7220499915165235
----------------------
seed: 2
Best Model Parameter: {'alpha': 4, 'hidden_layer_sizes': (512,)}
Best Model Score: 0.808271532889208
R2_test: 0.7848877685719858
----------------------
seed: 3
Best Model Parameter: {'alpha': 2, 'hidden_layer_sizes': (512,)}
Best Model Score: 0.7415936573334314
R2_test: 0.8066537135062666
----------------------
seed: 4
Best Model Parameter: {'alpha': 4, 'hidden_layer_sizes': (256,)}
Best Model Score: 0.7265253042955377
R2_test: 0.8032594622447231
----------------------
seed: 5
Best Model Parameter: {'alpha': 4, 'hidden_layer_sizes': (256,)}
Best Model Score: 0.767038024478947
R2_test: 0.8066291305708888
----------------------
seed: 6
Best 

In [5]:
data_r2_train_pd = pd.DataFrame(data= data_r2_train, columns=['r2_train'])
data_RMSE_train_pd = pd.DataFrame(data=data_RMSE_train, columns=['RMSE_train'])
data_MAE_train_pd = pd.DataFrame(data=data_MAE_train, columns=['MAE_train'])
data_r2_test_pd = pd.DataFrame(data=data_r2_test, columns=['r2_test'])
data_RMSE_test_pd = pd.DataFrame(data=data_RMSE_test, columns=['RMSE_test'])
data_MAE_test_pd = pd.DataFrame(data=data_MAE_test, columns=['MAE_test'])
data_cv_score_pd = pd.DataFrame(data=data_cv_score, columns=['cv_score'])
data_parametors = pd.DataFrame([best_model_parametors])

data_all = pd.concat([data_r2_train_pd, data_RMSE_train_pd, data_MAE_train_pd, data_r2_test_pd, 
                      data_RMSE_test_pd, data_MAE_test_pd, data_cv_score_pd],
                     axis=1, join='inner')

data_all.loc['mean'] = data_all.mean()
data_all.loc['std'] = data_all.std(ddof=0)
print(data_all)

data_all.to_csv('../../score/DFT/MLP_score.csv')

      r2_train  RMSE_train  MAE_train   r2_test  RMSE_test   MAE_test  \
0     0.947886    6.198524   3.742637  0.736703  18.557455  13.028900   
1     0.977166    4.544082   2.763515  0.722050  16.410098  12.315052   
2     0.937673    7.593356   4.498600  0.784888  14.423131   9.636349   
3     0.964539    5.751065   3.140592  0.806654  13.622163   8.461150   
4     0.920643    8.271262   4.828707  0.803259  14.463713   8.983237   
5     0.895653    9.192043   5.170664  0.806629  15.335529  10.147432   
6     0.917372    8.234705   4.615484  0.800066  15.511676  10.472665   
7     0.943329    7.362453   4.752642  0.785563  13.942337   9.041449   
8     0.933352    7.658835   4.344947  0.842123  12.490779   8.857879   
9     0.922905    8.839217   5.220326  0.924138   7.311408   6.033028   
mean  0.936052    7.364554   4.307811  0.801207  14.206829   9.697714   
std   0.021524    1.319457   0.753508  0.050261   2.659035   1.792898   

      cv_score  
0     0.749354  
1     0.704685  