In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [2]:
# Benchmark an RBF-kernel SVR on every descriptor set.
#
# For each descriptor set two tables are read:
#   data/data_<set>.csv      -> evaluation table, repeatedly split 50/50
#   data/data_<set>_ALL.csv  -> table the scaler and the SVR are fitted on
# One row per split seed (100 rows) is written to result/result_<set>.csv
# with the columns r2_train and r2_test.

# Hyper-parameter grid for the RBF SVR. It depends on neither the dataset nor
# the split seed, so it is built once.
param = {'C':[0.01, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000],
         'gamma':[1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 50, 100, 500, 1000],
         'epsilon':[1e-05, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10]}

for dataset in ['DFT', 'MF_pca', 'MK_pca', 'RDKit_pca', 'mordred_pca', 'MF', 'MK', 'RDKit', 'mordred']:
    # Evaluation table: 'Yield' is the target, 'Name' and 'ID' are dropped
    # from the feature matrix.
    data = pd.read_csv(f'data/data_{dataset}.csv')
    y = data[['Yield']]
    X = data.drop(columns=['Name', 'ID', 'Yield'])

    # Fitting table: same column handling; the scaler is fitted here and
    # reused to transform every test split below.
    data_s = pd.read_csv(f'data/data_{dataset}_ALL.csv')
    y_s = data_s[['Yield']]
    X_s = data_s.drop(columns=['Name', 'ID', 'Yield'])
    scaler = StandardScaler()
    a_X_s = scaler.fit_transform(X_s)

    # The grid search only ever sees (a_X_s, y_s), never the per-seed split,
    # and it is deterministic (an integer cv gives unshuffled K-fold for a
    # regressor, and SVR takes no random state). Fitting it once per dataset
    # therefore yields the same model and the same train score as refitting
    # it inside the seed loop, at 1/100 of the cost.
    reg = GridSearchCV(SVR(kernel='rbf'), param_grid=param, cv=5, n_jobs=16)
    reg.fit(a_X_s, y_s['Yield'])
    best = reg.best_estimator_
    y_pred1 = best.predict(a_X_s)
    r2_fit = metrics.r2_score(y_s, y_pred1)

    r2_train = []
    r2_test = []
    for i in range(100):
        # NOTE(review): X_train / y_train are never used -- the model is
        # fitted on the *_ALL table above, so only the test half of each
        # split affects the result. Confirm that this is intended.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=i)
        a_X_test = scaler.transform(X_test)
        y_pred2 = best.predict(a_X_test)
        # The train score is identical for every seed; it is repeated so the
        # output file keeps one row per split.
        r2_train.append(r2_fit)
        r2_test.append(metrics.r2_score(y_test, y_pred2))

    r2_train = pd.DataFrame(data=r2_train, columns=['r2_train'])
    r2_test = pd.DataFrame(data=r2_test, columns=['r2_test'])
    result = pd.concat([r2_train, r2_test], axis=1, join='inner')
    result.to_csv(f'result/result_{dataset}.csv', index = False)

In [3]:
# Print the mean train/test R^2 over all saved splits for each descriptor set.
descriptor_sets = ('DFT', 'MF_pca', 'MK_pca', 'RDKit_pca', 'mordred_pca',
                   'MF', 'MK', 'RDKit', 'mordred')
for set_name in descriptor_sets:
    scores = pd.read_csv(f'result/result_{set_name}.csv')
    print(f'Dataset: {set_name}')
    # Column-wise mean of the per-split scores stored in the CSV.
    column_means = scores.mean()
    print(column_means)
    print('=======================')

Dataset: DFT
r2_train    0.597999
r2_test    -0.191489
dtype: float64
Dataset: MF_pca
r2_train    0.858233
r2_test     0.090725
dtype: float64
Dataset: MK_pca
r2_train    0.841092
r2_test    -0.082867
dtype: float64
Dataset: RDKit_pca
r2_train    0.736656
r2_test     0.177446
dtype: float64
Dataset: mordred_pca
r2_train    0.657696
r2_test     0.255134
dtype: float64
Dataset: MF
r2_train    0.949745
r2_test     0.213470
dtype: float64
Dataset: MK
r2_train    0.886483
r2_test     0.043307
dtype: float64
Dataset: RDKit
r2_train    0.706579
r2_test     0.278078
dtype: float64
Dataset: mordred
r2_train    0.775940
r2_test     0.338605
dtype: float64
