In [1]:
#import
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Reading dataset
# Load the descriptor table from the notebook's working directory.
df = pd.read_csv('RDKit.csv')
# Print a structural summary (row range, column count, dtypes, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Columns: 224 entries, Cate_name to Yield
dtypes: float64(177), int64(45), object(2)
memory usage: 119.1+ KB


In [3]:
#Building descriptors
# Feature matrix: every column of df except the target ('Yield') and the two
# name columns, which are dropped before modelling.
non_feature_cols = ['Yield', 'Cate_name', 'P_name']
X = df.drop(columns=non_feature_cols)
print('---Descriptors---')
print(X.head())

# Target kept as a one-column DataFrame (column 'Yield') and cast to int.
# NOTE(review): if 'Yield' is stored as float, astype(int) truncates the
# fractional part -- confirm that this is intended.
y = df[['Yield']].astype(int)
print('---Objective---')
print(y.head())

---Descriptors---
   Cate_MaxAbsEStateIndex  Cate_MaxEStateIndex  Cate_MinAbsEStateIndex  \
0                8.669259             8.669259                0.076389   
1                8.669259             8.669259                0.076389   
2                8.669259             8.669259                0.076389   
3                8.669259             8.669259                0.076389   
4                8.850093             8.850093                0.060185   

   Cate_MinEStateIndex  Cate_qed  Cate_SPS  Cate_MolWt  Cate_HeavyAtomMolWt  \
0            -0.076389  0.490728  9.000000     110.112              104.064   
1            -0.076389  0.490728  9.000000     110.112              104.064   
2            -0.076389  0.490728  9.000000     110.112              104.064   
3            -0.076389  0.490728  9.000000     110.112              104.064   
4            -0.068889  0.513122  9.444444     124.139              116.075   

   Cate_ExactMolWt  Cate_NumValenceElectrons  ...  P_NumAromat

In [4]:
#storage
# Per-split result collectors: each list receives one entry per train/test split.
data_r2_train = []
data_RMSE_train = []
data_MAE_train = []
data_r2_test = []
data_RMSE_test = []
data_MAE_test = []
best_model_parametors = []
data_cv_score = []
# Running sum of the feature importances over all splits.
# Initialised explicitly as float zeros: pd.Series(index=...) without data
# yields an all-NaN Series whose dtype is object on recent pandas, and that
# object dtype would survive fillna(0) and the += accumulation below.
cum_imp = pd.Series(0.0, index=X.columns)

# Number of random train/test splits; split k uses random_state=k.
n_repeats = 10
# Hyper-parameter grid searched on every split (loop-invariant, so built once).
param_grid = {"max_depth": [2, 3, 4, 5],
              'n_estimators': [1000, 3000, 5000]}

#RF
for seed in range(n_repeats):
    # 70/30 split; only the split seed varies, the forest seed stays fixed at 0.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

    # 5-fold grid search on the training part only.
    reg_rf = GridSearchCV(RandomForestRegressor(random_state=0), param_grid=param_grid, cv=5, n_jobs=16)
    # Pass the target as a 1-D Series (y_train is a one-column DataFrame).
    reg_rf.fit(X_train, y_train['Yield'])
    reg_best = reg_rf.best_estimator_
    y_pred1 = reg_best.predict(X_train)
    y_pred2 = reg_best.predict(X_test)

    #train
    r2_train = metrics.r2_score(y_train, y_pred1)
    RMSE_train = metrics.root_mean_squared_error(y_train, y_pred1)
    MAE_train = metrics.mean_absolute_error(y_train, y_pred1)
    #test
    r2_test = metrics.r2_score(y_test, y_pred2)
    RMSE_test = metrics.root_mean_squared_error(y_test, y_pred2)
    MAE_test = metrics.mean_absolute_error(y_test, y_pred2)

    parametors = reg_rf.best_params_
    best_model_parametors.append(parametors)

    data_r2_train.append(r2_train)
    data_RMSE_train.append(RMSE_train)
    data_MAE_train.append(MAE_train)
    data_r2_test.append(r2_test)
    data_RMSE_test.append(RMSE_test)
    data_MAE_test.append(MAE_test)
    # best_score_ is the mean cross-validated score of the selected parameters.
    data_cv_score.append(reg_rf.best_score_)

    #cumulative importances
    importances = pd.Series(reg_best.feature_importances_, index=X.columns)
    cum_imp += importances

    print('----------------------')
    print('seed:', seed)
    print("Best Model Parameter:", reg_rf.best_params_)
    print("Best Model Score:", reg_rf.best_score_)
    print('R2_test:', r2_test)
# Averages over the splits actually run (derived from the list lengths rather
# than a hard-coded 10, so they stay correct if n_repeats is changed).
print('R2_train_means:', sum(data_r2_train) / len(data_r2_train))
print('CV_score_means:', sum(data_cv_score) / len(data_cv_score))
print('R2_test_means:', sum(data_r2_test) / len(data_r2_test))

----------------------
seed: 0
Best Model Parameter: {'max_depth': 2, 'n_estimators': 5000}
Best Model Score: 0.7810042334358767
R2_test: 0.7438246286294927
----------------------
seed: 1
Best Model Parameter: {'max_depth': 4, 'n_estimators': 5000}
Best Model Score: 0.7422407238741837
R2_test: 0.8643148970133365
----------------------
seed: 2
Best Model Parameter: {'max_depth': 2, 'n_estimators': 5000}
Best Model Score: 0.7697174074410391
R2_test: 0.7839342797819121
----------------------
seed: 3
Best Model Parameter: {'max_depth': 2, 'n_estimators': 3000}
Best Model Score: 0.742724228668218
R2_test: 0.7923155886407727
----------------------
seed: 4
Best Model Parameter: {'max_depth': 5, 'n_estimators': 3000}
Best Model Score: 0.8074480901864242
R2_test: 0.752195588378892
----------------------
seed: 5
Best Model Parameter: {'max_depth': 5, 'n_estimators': 1000}
Best Model Score: 0.8021883641680319
R2_test: 0.8394722090723175
----------------------
seed: 6
Best Model Parameter: {'max_d

In [5]:
# Wrap each per-split metric list in a one-column frame (one row per split).
data_r2_train_pd = pd.DataFrame({'r2_train': data_r2_train})
data_RMSE_train_pd = pd.DataFrame({'RMSE_train': data_RMSE_train})
data_MAE_train_pd = pd.DataFrame({'MAE_train': data_MAE_train})
data_r2_test_pd = pd.DataFrame({'r2_test': data_r2_test})
data_RMSE_test_pd = pd.DataFrame({'RMSE_test': data_RMSE_test})
data_MAE_test_pd = pd.DataFrame({'MAE_test': data_MAE_test})
data_cv_score_pd = pd.DataFrame({'cv_score': data_cv_score})
# best_model_parametors is already a list of dicts (one per split), so it is
# passed directly: one row per split, one column per tuned hyper-parameter.
# Wrapping it in another list produced a single row whose cells were dicts.
data_parametors = pd.DataFrame(best_model_parametors)

# Side-by-side score table; the hyper-parameter frame is deliberately left out
# so the exported CSV keeps its existing columns.
data_all = pd.concat([data_r2_train_pd, data_RMSE_train_pd, data_MAE_train_pd, data_r2_test_pd,
                      data_RMSE_test_pd, data_MAE_test_pd, data_cv_score_pd],
                     axis=1, join='inner')

# Append the column-wise mean over all splits as a final row labelled 'mean'.
data_all.loc['mean'] = data_all.mean()
print(data_all)

# to_csv does not create missing directories; make sure the target folder
# exists so the results of the long grid search are not lost to an OSError.
score_path = Path('../../score/RDKit/RF_score.csv')
score_path.parent.mkdir(parents=True, exist_ok=True)
data_all.to_csv(score_path)

      r2_train  RMSE_train  MAE_train   r2_test  RMSE_test   MAE_test  \
0     0.906690    8.294241   5.481987  0.743825  18.304770  11.221256   
1     0.952449    6.557473   4.082143  0.864315  11.465524   7.795438   
2     0.886733   10.236379   6.520482  0.783934  14.455061   9.320254   
3     0.884671   10.371519   6.515702  0.792316  14.118225   8.653687   
4     0.967423    5.299492   3.038033  0.752196  16.232574  11.004828   
5     0.966143    5.235979   2.841043  0.839472  13.972636   9.644930   
6     0.939543    7.043834   4.438955  0.812680  15.014389   9.791184   
7     0.901774    9.692919   6.199021  0.750354  15.043467   9.978662   
8     0.892729    9.716554   5.778109  0.766418  15.193241  11.222015   
9     0.863178   11.775474   7.877604  0.867749   9.653592   6.039210   
mean  0.916133    8.422386   5.277308  0.797326  14.345348   9.467146   

      cv_score  
0     0.781004  
1     0.742241  
2     0.769717  
3     0.742724  
4     0.807448  
5     0.802188  
6   