In [1]:
import os

import pandas as pd
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

In [2]:
# Evaluate an RBF-kernel SVR on each descriptor set: for every dataset, run
# ten 50/50 train/test splits (random_state 0..9), tune the SVR by grid search
# on the training half, and record R^2 / MAE on both halves. The per-split
# table is printed and written to result/SVM/result_<dataset>.csv.

result_dir = 'result/SVM'
# Create the output directory before any fitting starts, so a missing folder
# cannot make to_csv fail after the expensive grid searches have finished.
os.makedirs(result_dir, exist_ok=True)

# Hyper-parameter grid for the SVR; it is the same for every split, so it is
# built once instead of inside the inner loop.
param = {'C': [1e+04, 1e+05, 1e+06, 1e+07, 1e+08, 1e+09],
         'gamma': [1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 1e-04],
         'epsilon': [0.1, 1, 10, 50, 100]}

for dataset in ['RDKit_pca', 'mordred_pca', 'RDKit', 'mordred']:
    print(dataset)
    data = pd.read_csv(f'data/data_{dataset}.csv')
    # Target is kept as a one-column DataFrame; features are every column
    # except the identifiers and the target itself.
    y = data[['Yield']]
    X = data.drop(columns=['Name', 'ID', 'Yield'])

    # One dict of metrics per split; turned into a DataFrame once at the end.
    records = []
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=i)

        # Standardize both halves with statistics of the training half only.
        train_mean = X_train.mean()
        train_std = X_train.std()
        a_X_train = (X_train - train_mean) / train_std
        a_X_test = (X_test - train_mean) / train_std
        # Drop feature columns that contain NaN after scaling (e.g. columns
        # with zero variance in the training half) and align the test half.
        a_X_train = a_X_train.dropna(how='any', axis=1)
        a_X_test = a_X_test[a_X_train.columns]

        # NOTE(review): scaling statistics come from the whole training half,
        # so the 10 CV folds inside GridSearchCV are not scaled independently.
        # Consider a Pipeline if strictly leak-free CV scores are required.
        reg = GridSearchCV(SVR(kernel='rbf'), param_grid=param, cv=10, n_jobs=6)
        reg.fit(a_X_train, y_train['Yield'])
        best = reg.best_estimator_

        y_pred1 = best.predict(a_X_train)  # predictions on the training half
        y_pred2 = best.predict(a_X_test)   # predictions on the held-out half
        records.append({
            'r2_train': metrics.r2_score(y_train, y_pred1),
            'r2_test': metrics.r2_score(y_test, y_pred2),
            'mae_train': metrics.mean_absolute_error(y_train, y_pred1),
            'mae_test': metrics.mean_absolute_error(y_test, y_pred2),
        })

    # Rows are indexed 0..9, matching the random_state used for each split.
    result = pd.DataFrame(
        records, columns=['r2_train', 'r2_test', 'mae_train', 'mae_test'])
    print(result)
    result.to_csv(f'{result_dir}/result_{dataset}.csv')

RDKit_pca
   r2_train   r2_test  mae_train   mae_test
0  0.117892 -0.096050  12.094641  18.985322
1  0.785697 -1.464453  10.687954  22.151391
2  0.025484 -0.276389   9.793530  22.555143
3 -0.296692 -0.280356  16.983094  17.021059
4 -0.202723 -0.536898  10.986456  23.727288
5  0.740713  0.109239   9.832693  17.826574
6  0.016183 -0.106997  15.542452  16.040098
7 -0.248342 -0.377843  15.224069  18.984542
8 -0.395747 -0.251233  19.819539  15.099950
9  0.014506  0.034132  17.399023  13.185941
mordred_pca
   r2_train   r2_test  mae_train   mae_test
0 -0.025173 -0.136666  13.556202  19.459613
1  0.628173 -0.367139  16.174480  18.881147
2 -0.036164 -0.319352  10.047751  22.809872
3 -0.115584 -0.132559  16.228669  16.388549
4 -0.206024 -0.539850  10.993277  23.745573
5  0.676615  0.311877  10.888193  16.671522
6 -0.246778 -0.268134  17.582188  17.078984
7 -0.257474 -0.387162  15.329037  19.044774
8 -0.395804 -0.251276  19.819604  15.099858
9 -0.307877 -0.278183  20.254388  14.327741
RDKit
   r