In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [2]:
# Load the table from data/data_DFT.csv.
data = pd.read_csv('data/data_DFT.csv')

# Target: the 'Yield' column, kept as a one-column DataFrame.
y = data[['Yield']]

# Features: every remaining column except 'Name' and 'ID'.
# NOTE(review): presumably 'Name' and 'ID' are row identifiers — confirm.
X = data.drop(columns=['Name', 'ID', 'Yield'])

In [3]:
# train:test=6:4
# Repeated hold-out evaluation: 100 random 60/40 splits (seeds 0-99). On each
# split a random forest is tuned by a 10-fold grid search on the training part,
# and the selected model is scored with R^2 on the held-out part.

# Hyper-parameter grid; it is the same for every split.
param = {"n_estimators": [100, 1000, 5000], "max_depth": [3, 4, 5, 6]}

r2_test = []
for seed in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=seed
    )
    reg = GridSearchCV(
        RandomForestRegressor(random_state=0),
        param_grid=param,
        cv=10,
        n_jobs=16,  # number of parallel workers used by the search
    )
    # Fit on the 'Yield' column as a 1-D Series instead of a one-column frame.
    reg.fit(X_train, y_train['Yield'])
    # With the default refit=True, best_estimator_ is the winning parameter
    # combination refit on the whole training split.
    best = reg.best_estimator_
    y_pred2 = best.predict(X_test)
    r2_test.append(metrics.r2_score(y_test, y_pred2))

# Summary over all splits; ddof=0 gives the population standard deviation.
print('Avg_R2_test:', np.mean(r2_test))
print('Max_R2_test:', np.max(r2_test))
print('Std_R2_test:', np.std(r2_test, ddof=0))

Avg_R2_test: 0.3076259944534349
Max_R2_test: 0.5851672935781427
Std_R2_test: 0.17513041364560344


In [4]:
# train:test=7:3
# Repeated hold-out evaluation: 100 random 70/30 splits (seeds 0-99). On each
# split a random forest is tuned by a 10-fold grid search on the training part,
# and the selected model is scored with R^2 on the held-out part.

# Hyper-parameter grid; it is the same for every split.
param = {"n_estimators": [100, 1000, 5000], "max_depth": [3, 4, 5, 6]}

r2_test = []
for seed in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )
    reg = GridSearchCV(
        RandomForestRegressor(random_state=0),
        param_grid=param,
        cv=10,
        n_jobs=16,  # number of parallel workers used by the search
    )
    # Fit on the 'Yield' column as a 1-D Series instead of a one-column frame.
    reg.fit(X_train, y_train['Yield'])
    # With the default refit=True, best_estimator_ is the winning parameter
    # combination refit on the whole training split.
    best = reg.best_estimator_
    y_pred2 = best.predict(X_test)
    r2_test.append(metrics.r2_score(y_test, y_pred2))

# Summary over all splits; ddof=0 gives the population standard deviation.
print('Avg_R2_test:', np.mean(r2_test))
print('Max_R2_test:', np.max(r2_test))
print('Std_R2_test:', np.std(r2_test, ddof=0))

Avg_R2_test: 0.3717453635903884
Max_R2_test: 0.694173675798827
Std_R2_test: 0.18875779187310038


In [5]:
# train:test=8:2
# Repeated hold-out evaluation: 100 random 80/20 splits (seeds 0-99). On each
# split a random forest is tuned by a 10-fold grid search on the training part,
# and the selected model is scored with R^2 on the held-out part.
# NOTE(review): the output recorded for this cell shows a standard deviation
# above 1 while the maximum is about 0.81, so at least one split must score a
# strongly negative R^2; the reported mean is sensitive to such splits.

# Hyper-parameter grid; it is the same for every split.
param = {"n_estimators": [100, 1000, 5000], "max_depth": [3, 4, 5, 6]}

r2_test = []
for seed in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    reg = GridSearchCV(
        RandomForestRegressor(random_state=0),
        param_grid=param,
        cv=10,
        n_jobs=16,  # number of parallel workers used by the search
    )
    # Fit on the 'Yield' column as a 1-D Series instead of a one-column frame.
    reg.fit(X_train, y_train['Yield'])
    # With the default refit=True, best_estimator_ is the winning parameter
    # combination refit on the whole training split.
    best = reg.best_estimator_
    y_pred2 = best.predict(X_test)
    r2_test.append(metrics.r2_score(y_test, y_pred2))

# Summary over all splits; ddof=0 gives the population standard deviation.
print('Avg_R2_test:', np.mean(r2_test))
print('Max_R2_test:', np.max(r2_test))
print('Std_R2_test:', np.std(r2_test, ddof=0))

Avg_R2_test: 0.2797748301031768
Max_R2_test: 0.806585985714256
Std_R2_test: 1.0505789322393424
