### Model Evaluation for Individual Sets (4096 bits reduced to 256 bits by RFE), 80/20 Split

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

# Model selection
from sklearn.model_selection import train_test_split

# metrics for regression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Gradient boosting regressor
import lightgbm as lgb

In [2]:
# Read the four feature tables and order each one by ascending band gap.
start = time.time()

# The files are read in the same order as the target names on the left;
# every frame gets a fresh 0..n-1 index after sorting.
data1, data2, data3, data4 = (
    pd.read_csv(csv_name).sort_values(
        by=["bandgap_chain"], ascending=True, ignore_index=True
    )
    for csv_name in (
        'Bandgap_avalon_256rfe.csv',
        'Bandgap_avalonc_256rfe.csv',
        'Bandgap_layered_256rfe.csv',
        'Bandgap_rdkitfp_256rfe.csv',
    )
)

stop = time.time()
runtime = stop - start

print(f"Runtime: {runtime} s")
print("All dataset imported.")

Runtime: 0.3054981231689453 s
All dataset imported.


### Monte Carlo Cross-validation (MCCV) and Holdout function

In [3]:
def train_model(dataX, permutation, test_size=0.2):
    """Fit a default LightGBM regressor on one random hold-out split and score it.

    Parameters
    ----------
    dataX : pandas.DataFrame
        Dataset with a ``bandgap_chain`` target column. Every column from
        position 2 onward is used as a feature.
        NOTE(review): assumes the first two columns are non-feature columns
        (e.g. an identifier and the target) -- confirm against the CSV layout.
    permutation : int
        Passed to ``train_test_split`` as ``random_state``; it selects which
        random train/test partition is drawn.
    test_size : float, default 0.2
        Fraction of rows held out for testing (forwarded to
        ``train_test_split``).

    Returns
    -------
    tuple
        ``(RMSE_train, R2_train, RMSE_test, R2_test)`` for this split.
    """
    # Level1 Holdout validation #####################################################
    training, testing = train_test_split(dataX, test_size=test_size, random_state=permutation)

    X_train = training.iloc[:, 2:].to_numpy()
    Y_train = training['bandgap_chain'].to_numpy()

    X_test = testing.iloc[:, 2:].to_numpy()
    Y_test = testing['bandgap_chain'].to_numpy()

    reg_ = lgb.LGBMRegressor()
    reg_.fit(X_train, Y_train)

    # Predict exactly once per partition.
    Y_train_pred = reg_.predict(X_train)
    Y_test_pred = reg_.predict(X_test)

    # RMSE is computed as sqrt(MSE) rather than with `squared=False`: that
    # keyword is deprecated in scikit-learn 1.4 and removed in 1.6, while the
    # square root of the MSE works on every version.
    RMSE_train = np.sqrt(mean_squared_error(Y_train, Y_train_pred))
    R2_train = r2_score(Y_train, Y_train_pred)

    RMSE_test = np.sqrt(mean_squared_error(Y_test, Y_test_pred))
    R2_test = r2_score(Y_test, Y_test_pred)

    return RMSE_train, R2_train, RMSE_test, R2_test

##########################################################################################################################

def _print_score_summary(rmse_scores, r2_scores):
    """Print best-run index, extreme value and mean for one partition's RMSE/R2 scores (plus RMSE stdev)."""
    # argmin/argmax return the first occurrence of the extreme value, which is
    # the same index list.index(min(...)) / list.index(max(...)) would give.
    best_rs_rmse = int(np.argmin(rmse_scores))  # best random state for RMSE
    print("Best random state with lowest RMSE:", best_rs_rmse)
    print("Min RMSE:", np.min(rmse_scores))
    print("Mean RMSE:", np.mean(rmse_scores))
    print("RMSE stdev:", np.std(rmse_scores))
    print()
    best_rs_r2 = int(np.argmax(r2_scores))  # best random state for R2
    print("Best random state with highest r2:", best_rs_r2)
    print("Max R2:", np.max(r2_scores))
    print("Mean R2:", np.mean(r2_scores))


def MCCV_scores(dataX, n_runs=50):
    """Run repeated random hold-out evaluations and print a score summary.

    ``train_model(dataX, p)`` is called for ``p = 0 .. n_runs - 1``. Because
    ``p`` doubles as the split's random state, the "best random state" that is
    printed is simply the position of the best score in the run order.

    Parameters
    ----------
    dataX : pandas.DataFrame
        Dataset forwarded unchanged to ``train_model``.
    n_runs : int, default 50
        Number of random splits to evaluate.

    Returns
    -------
    None
        The summary is printed, not returned.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (there would be nothing to summarise).
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start = time.perf_counter()
    results = [train_model(dataX, p) for p in range(n_runs)]
    runtime = time.perf_counter() - start

    # Transpose the per-run 4-tuples into four per-metric sequences.
    RMSEscores_train, R2scores_train, RMSEscores_test, R2scores_test = zip(*results)

    print(f"Runtime: {runtime} s")
    print()
    print("Training set scores #########################################################################")
    _print_score_summary(RMSEscores_train, R2scores_train)

    print()
    print("Testing set scores ##########################################################################")
    _print_score_summary(RMSEscores_test, R2scores_test)

    return None

### MCCV for 50 permutations for LightGBM

In [4]:
# data1 was read from 'Bandgap_avalon_256rfe.csv' in the dataset-loading cell.
MCCV_scores(data1)

Runtime: 14.167711734771729 s

Training set scores #########################################################################
Best random state with lowest RMSE: 35
Min RMSE: 0.36225769428582516
Mean RMSE: 0.37339198506425647
RMSE stdev: 0.005306061725014264

Best random state with highest r2: 29
Max R2: 0.9371967293006017
Mean R2: 0.9331872611398516

Testing set scores ##########################################################################
Best random state with lowest RMSE: 43
Min RMSE: 0.45216003121494386
Mean RMSE: 0.5115246639230605
RMSE stdev: 0.0339780190636956

Best random state with highest r2: 4
Max R2: 0.9007878010109404
Mean R2: 0.8750144941760218


In [5]:
# data2 was read from 'Bandgap_avalonc_256rfe.csv' in the dataset-loading cell.
MCCV_scores(data2)

Runtime: 18.3161358833313 s

Training set scores #########################################################################
Best random state with lowest RMSE: 29
Min RMSE: 0.334678524996724
Mean RMSE: 0.3518737859079948
RMSE stdev: 0.005203482301147933

Best random state with highest r2: 29
Max R2: 0.947022451940111
Mean R2: 0.9406643502983353

Testing set scores ##########################################################################
Best random state with lowest RMSE: 9
Min RMSE: 0.44627097542309985
Mean RMSE: 0.5050118200168405
RMSE stdev: 0.031526995691705056

Best random state with highest r2: 9
Max R2: 0.9032302056991479
Mean R2: 0.8782148292703247


In [6]:
# data3 was read from 'Bandgap_layered_256rfe.csv' in the dataset-loading cell.
MCCV_scores(data3)

Runtime: 13.639581441879272 s

Training set scores #########################################################################
Best random state with lowest RMSE: 29
Min RMSE: 0.31096285081109787
Mean RMSE: 0.3198195483635803
RMSE stdev: 0.003643123504764943

Best random state with highest r2: 29
Max R2: 0.9542645264891039
Mean R2: 0.9509864231259225

Testing set scores ##########################################################################
Best random state with lowest RMSE: 43
Min RMSE: 0.43701691187129443
Mean RMSE: 0.4851016359672559
RMSE stdev: 0.029946418968051173

Best random state with highest r2: 43
Max R2: 0.9071403381701906
Mean R2: 0.8875862017107715


In [7]:
# data4 was read from 'Bandgap_rdkitfp_256rfe.csv' in the dataset-loading cell.
MCCV_scores(data4)

Runtime: 13.686189889907837 s

Training set scores #########################################################################
Best random state with lowest RMSE: 38
Min RMSE: 0.29579682104665556
Mean RMSE: 0.3086371834442329
RMSE stdev: 0.004056261554455113

Best random state with highest r2: 38
Max R2: 0.958029590069773
Mean R2: 0.9543516443923338

Testing set scores ##########################################################################
Best random state with lowest RMSE: 43
Min RMSE: 0.43653568939064485
Mean RMSE: 0.4863023822519655
RMSE stdev: 0.029066673659192733

Best random state with highest r2: 43
Max R2: 0.9073447309634997
Mean R2: 0.8870672244042832
