In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn import metrics

In [2]:
# Descriptor sets to benchmark. For each name two files are read:
#   data/data_<name>.csv      -> repeatedly split; the held-out half is scored
#   data/data_<name>_ALL.csv  -> the set the model is tuned and fitted on
DATASETS = ['DFT', 'MF_pca', 'MK_pca', 'RDKit_pca', 'mordred_pca', 'MF', 'MK', 'RDKit', 'mordred']

# Number of random 50/50 splits (seeds 0..N_REPEATS-1) scored per dataset.
N_REPEATS = 100

# Hyperparameter candidates sampled by RandomizedSearchCV.
# Defined once: the grid is the same for every dataset and every split.
param = {"n_estimators": [100, 500, 1000], "gamma": [0, 1, 2], "min_child_weight": [1, 2, 5, 10],
         "subsample": [0.6, 1], "eta": [0.01, 0.1, 0.3, 1], "colsample_bytree": [0.6, 1],
         "lambda": [0.01, 0.1, 1], "max_depth": [5, 7, 9]}

for dataset in DATASETS:
    # Data that is split below; only the test half of each split is used.
    data = pd.read_csv(f'data/data_{dataset}.csv')
    y = data[['Yield']]
    X = data.drop(columns=['Name', 'ID', 'Yield'])

    # Data the model is actually fitted on.
    data_s = pd.read_csv(f'data/data_{dataset}_ALL.csv')
    y_s = data_s[['Yield']]
    X_s = data_s.drop(columns=['Name', 'ID', 'Yield'])

    # The search is fitted on X_s / y_s with fixed seeds, so it does not depend
    # on the split seed used below. It was previously re-run inside the repeat
    # loop (100 identical 600-candidate x 5-fold searches per dataset); running
    # it once per dataset does the same work a single time.
    reg = RandomizedSearchCV(XGBRegressor(random_state=0), param_distributions=param,
                             n_iter=600, cv=5, random_state=0, n_jobs=32)
    reg.fit(X_s, y_s['Yield'])
    best = reg.best_estimator_

    # R^2 on the fitting set; also independent of the split seed.
    y_pred1 = best.predict(X_s)
    r2_train_score = metrics.r2_score(y_s, y_pred1)

    r2_train = []
    r2_test = []
    for i in range(N_REPEATS):
        # Only the held-out half is needed; the training halves of this split
        # are intentionally unused because the model was fitted on X_s.
        _X_train, X_test, _y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=i)
        y_pred2 = best.predict(X_test)

        # One r2_train entry per repeat keeps the CSV layout unchanged
        # (N_REPEATS rows, columns r2_train / r2_test).
        r2_train.append(r2_train_score)
        r2_test.append(metrics.r2_score(y_test, y_pred2))

    result = pd.DataFrame({'r2_train': r2_train, 'r2_test': r2_test})
    result.to_csv(f'result/result_{dataset}.csv', index=False)



In [3]:
# Print the column means of each per-dataset result file written to result/.
descriptor_sets = ('DFT', 'MF_pca', 'MK_pca', 'RDKit_pca', 'mordred_pca', 'MF', 'MK', 'RDKit', 'mordred')
for descriptor_name in descriptor_sets:
    scores = pd.read_csv(f'result/result_{descriptor_name}.csv')
    mean_scores = scores.mean()  # mean of r2_train and r2_test over all rows
    print(f'Dataset: {descriptor_name}')
    print(mean_scores)
    print('=======================')

Dataset: DFT
r2_train    0.993839
r2_test     0.349557
dtype: float64
Dataset: MF_pca
r2_train    0.609131
r2_test     0.408054
dtype: float64
Dataset: MK_pca
r2_train    0.836177
r2_test     0.282233
dtype: float64
Dataset: RDKit_pca
r2_train    0.999001
r2_test     0.334798
dtype: float64
Dataset: mordred_pca
r2_train    0.92064
r2_test     0.40478
dtype: float64
Dataset: MF
r2_train    0.957136
r2_test     0.335390
dtype: float64
Dataset: MK
r2_train    0.792720
r2_test     0.257441
dtype: float64
Dataset: RDKit
r2_train    0.986230
r2_test     0.377292
dtype: float64
Dataset: mordred
r2_train    0.997813
r2_test     0.320117
dtype: float64
