In [1]:
#import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn import metrics

In [2]:
#Reading dataset
df=pd.read_csv('RDKit.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Columns: 224 entries, Cate_name to Yield
dtypes: float64(177), int64(45), object(2)
memory usage: 119.1+ KB


In [3]:
#Building descriptors
X = df.drop(columns=['Yield', 'Cate_name', 'P_name' ])
print('---Descriptors---')
print(X.head())

y = pd.DataFrame(df['Yield'],columns=['Yield'])
print('---Objective---')
print(y.head())

---Descriptors---
   Cate_MaxAbsEStateIndex  Cate_MaxEStateIndex  Cate_MinAbsEStateIndex  \
0                8.669259             8.669259                0.076389   
1                8.669259             8.669259                0.076389   
2                8.669259             8.669259                0.076389   
3                8.669259             8.669259                0.076389   
4                8.850093             8.850093                0.060185   

   Cate_MinEStateIndex  Cate_qed  Cate_SPS  Cate_MolWt  Cate_HeavyAtomMolWt  \
0            -0.076389  0.490728  9.000000     110.112              104.064   
1            -0.076389  0.490728  9.000000     110.112              104.064   
2            -0.076389  0.490728  9.000000     110.112              104.064   
3            -0.076389  0.490728  9.000000     110.112              104.064   
4            -0.068889  0.513122  9.444444     124.139              116.075   

   Cate_ExactMolWt  Cate_NumValenceElectrons  ...  P_NumAromat

In [4]:
#storage 
data_r2_train =[]
data_RMSE_train = []
data_MAE_train = []
data_r2_test = []
data_RMSE_test = []
data_MAE_test = []
best_model_parametors = []
data_cv_score = []

#SVM_a_X,log(y)
for i in range(10):
    seed=i
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
    
    #autoscaling
    a_X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_test = (X_test - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_train = a_X_train.dropna(how='any', axis=1)
    a_X_test = a_X_test[a_X_train.columns]
    
    log_y_train = np.log(y_train)
    
    param_svm={'C':[0.1,1,5,10,50,100,200,300,400,500,750,1000],
               'gamma':[100,10,1,0.1,0.01,0.001,0.0001,0.00001]}
    reg_svm = GridSearchCV(SVR(kernel='rbf', epsilon=0.3), param_grid=param_svm, cv=5, n_jobs=16)
    reg_svm.fit(a_X_train,log_y_train['Yield'])
    reg_best = reg_svm.best_estimator_
    y_pred1 = np.exp(reg_best.predict(a_X_train))
    y_pred2 = np.exp(reg_best.predict(a_X_test))
    
    #train
    r2_train = metrics.r2_score(y_train, y_pred1)
    RMSE_train = metrics.mean_squared_error(y_train, y_pred1)
    MAE_train =  metrics.mean_absolute_error(y_train, y_pred1)
    #test
    r2_test = metrics.r2_score(y_test, y_pred2)
    RMSE_test = metrics.mean_squared_error(y_test, y_pred2)
    MAE_test = metrics.mean_absolute_error(y_test, y_pred2)
    
    parametors = reg_svm.best_params_
    best_model_parametors.append(parametors)
    cv_score = reg_svm.best_score_
    data_cv_score.append(cv_score)
    
    data_r2_train.append(r2_train)
    data_RMSE_train.append(RMSE_train)
    data_MAE_train.append(MAE_train)
    data_r2_test.append(r2_test)
    data_RMSE_test.append(RMSE_test)
    data_MAE_test.append(MAE_test)
    
    print('----------------------')
    print('seed:', seed)
    print("Best Model Parameter:",reg_svm.best_params_)
    print("Best Model Score:",reg_svm.best_score_)
    print('R2_test:', r2_test)
print('R2_train_means:', sum(data_r2_train)/10)
print('CV_score_means:', sum(data_cv_score)/10)
print('R2_test_means:', sum(data_r2_test)/10)

----------------------
seed: 0
Best Model Parameter: {'C': 200, 'gamma': 0.0001}
Best Model Score: 0.9275469855876644
R2_test: 0.7080376993441224
----------------------
seed: 1
Best Model Parameter: {'C': 100, 'gamma': 0.0001}
Best Model Score: 0.905909536355266
R2_test: 0.8043092061652809
----------------------
seed: 2
Best Model Parameter: {'C': 1000, 'gamma': 1e-05}
Best Model Score: 0.9296253357889354
R2_test: 0.8231440540402852
----------------------
seed: 3
Best Model Parameter: {'C': 1000, 'gamma': 1e-05}
Best Model Score: 0.9040806227813736
R2_test: 0.8344718702887974
----------------------
seed: 4
Best Model Parameter: {'C': 1000, 'gamma': 1e-05}
Best Model Score: 0.9229766321107054
R2_test: 0.8058024479829748
----------------------
seed: 5
Best Model Parameter: {'C': 750, 'gamma': 1e-05}
Best Model Score: 0.8900917903567507
R2_test: 0.664783377912576
----------------------
seed: 6
Best Model Parameter: {'C': 1000, 'gamma': 1e-05}
Best Model Score: 0.8950173758076906
R2_test: 

In [5]:
data_r2_train_pd = pd.DataFrame(data= data_r2_train, columns=['r2_train'])
data_RMSE_train_pd = pd.DataFrame(data=data_RMSE_train, columns=['RMSE_train'])
data_MAE_train_pd = pd.DataFrame(data=data_MAE_train, columns=['MAE_train'])
data_r2_test_pd = pd.DataFrame(data=data_r2_test, columns=['r2_test'])
data_RMSE_test_pd = pd.DataFrame(data=data_RMSE_test, columns=['RMSE_test'])
data_MAE_test_pd = pd.DataFrame(data=data_MAE_test, columns=['MAE_test'])
data_cv_score_pd = pd.DataFrame(data=data_cv_score, columns=['cv_score'])
data_parametors = pd.DataFrame([best_model_parametors])

data_all = pd.concat([data_r2_train_pd, data_RMSE_train_pd, data_MAE_train_pd, data_r2_test_pd, 
                      data_RMSE_test_pd, data_MAE_test_pd, data_cv_score_pd],
                     axis=1, join='inner')

data_all.loc['mean'] = data_all.mean()
print(data_all)

data_all.to_csv('../../score/RDKit//SVM_score.csv')

      r2_train  RMSE_train  MAE_train   r2_test   RMSE_test   MAE_test  \
0     0.902478   71.899679   5.286805  0.708038  381.872124  12.119496   
1     0.859991  126.611468   7.050066  0.804309  189.594648   7.990399   
2     0.860510  129.042376   7.063012  0.823144  171.030528   8.378744   
3     0.861511  129.169909   7.202028  0.834472  158.864966   7.842139   
4     0.885987   98.290247   6.302610  0.805802  206.494969   9.032207   
5     0.828584  138.802511   6.294612  0.664783  407.691824  12.627469   
6     0.871162  105.734270   5.994857  0.743104  309.164257  10.242891   
7     0.901593   94.126117   6.549893  0.776449  202.651135   9.424225   
8     0.890335   96.517960   6.155010  0.689916  306.436570  12.089595   
9     0.858879  143.017775   8.122859  0.864392   95.557004   6.057178   
mean  0.872103  113.321231   6.602175  0.771441  242.935803   9.580434   

      cv_score  
0     0.927547  
1     0.905910  
2     0.929625  
3     0.904081  
4     0.922977  
5     0.8