In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [2]:
def calculate_statistics(group):
    """Summarise the test-set R^2 scores stored in one group of runs.

    Parameters
    ----------
    group : mapping-like (e.g. a pandas.DataFrame)
        Must provide an ``'r2_test'`` entry holding one score per run.

    Returns
    -------
    pandas.Series
        One ``run<k>`` entry per score, numbered in iteration order, followed by
        ``r2_test_mean``, ``r2_test_max``, ``r2_test_min`` and ``r2_test_std``
        (standard deviation computed with ``ddof=0``).
    """
    scores = group['r2_test']

    # Individual runs first so they precede the aggregates in the output.
    summary = {}
    for run_idx, score in enumerate(scores):
        summary[f'run{run_idx}'] = score

    summary['r2_test_mean'] = np.mean(scores)
    summary['r2_test_max'] = np.max(scores)
    summary['r2_test_min'] = np.min(scores)
    summary['r2_test_std'] = np.std(scores, ddof=0)
    return pd.Series(summary)

In [3]:
# Repeated hold-out evaluation: for each of 10 split seeds and each descriptor
# set, tune a random forest by grid search on the training half and record the
# R^2 obtained on the held-out half.

# Hyper-parameter grid is the same for every run, so it is built once.
param = {"n_estimators": [100, 1000, 5000], "max_depth": [3, 4, 5, 6]}

# Read each descriptor table once instead of re-reading the same CSV on every
# repetition; dict insertion order keeps the RDKit -> mordred order per run.
frames = {name: pd.read_csv(f'data/data_{name}.csv') for name in ['RDKit', 'mordred']}

results = []
for i in range(10):
    for dataset, data in frames.items():
        y = data[['Yield']]
        X = data.drop(columns=['Name', 'ID', 'Yield'])
        # The repetition index is the split seed, so run i uses the same seed
        # for both descriptor sets.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=i)
        reg = GridSearchCV(RandomForestRegressor(random_state=0), param_grid=param, cv=10, n_jobs=2)
        reg.fit(X_train, y_train['Yield'])
        best = reg.best_estimator_
        # Only the test-set prediction is scored; the unused training-set
        # prediction of the earlier version was dropped.
        y_pred2 = best.predict(X_test)
        r2_test_score = metrics.r2_score(y_test, y_pred2)
        results.append({'dataset': dataset, 'r2_test': r2_test_score})

results_df = pd.DataFrame(results)
# Select the score column explicitly so the applied function never receives the
# grouping column (passing it implicitly is deprecated in recent pandas).
gen_results = results_df.groupby(['dataset'])[['r2_test']].apply(calculate_statistics).reset_index()

In [4]:
# Transpose once (one column per dataset, one row per statistic), write it
# without the header row, then show the same table as the cell output.
summary_table = gen_results.T
summary_table.to_csv('result/result_yield.csv', header=False)
summary_table

Unnamed: 0,0,1
dataset,RDKit,mordred
run0,0.543357,0.539028
run1,0.585081,0.314297
run2,0.620555,0.446005
run3,0.642284,0.649714
run4,0.671491,0.605843
run5,0.501225,0.477759
run6,0.632581,0.62051
run7,0.571623,0.597744
run8,0.476432,0.505303
