### Model Evaluation for Individual Feature Sets (4096 bits), 80/20 Train/Test Split

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

# Model selection
from sklearn.model_selection import train_test_split

# metrics for regression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Gradient boosting regressor
import lightgbm as lgb

In [2]:
# Datasets with features
start = time.time()


def _read_sorted(csv_name):
    """Read one CSV file and return its rows ordered by ascending 'bandgap_chain'.

    The index is renumbered after sorting (ignore_index=True).
    """
    frame = pd.read_csv(csv_name)
    return frame.sort_values(by=["bandgap_chain"], ascending=True, ignore_index=True)


# Same four files, same variable names as used by the evaluation cells below.
data1 = _read_sorted('Bandgap_avalon_4096.csv')
data2 = _read_sorted('Bandgap_avalonc_4096.csv')
data3 = _read_sorted('Bandgap_layered_4096.csv')
data4 = _read_sorted('Bandgap_rdkitfp_4096.csv')

stop = time.time()
runtime = stop - start

print(f"Runtime: {runtime} s")
print("All dataset imported.")

Runtime: 9.096773624420166 s
All dataset imported.


### Monte Carlo Cross-validation (MCCV) and Holdout function

Reference (holdout method): https://en.wikipedia.org/wiki/Cross-validation_(statistics)#Holdout_method

In [3]:
def train_model(dataX, permutation):
    """Fit a default LightGBM regressor on one random 80/20 split and score it.

    Parameters
    ----------
    dataX : pandas.DataFrame
        Dataset containing the target column ``'bandgap_chain'``. Every column
        from position 2 onward is used as a feature (assumes the first two
        columns are not features -- TODO confirm against the CSV layout).
    permutation : int
        Passed as ``random_state`` to ``train_test_split``; it selects which
        rows end up in the 20% test partition.

    Returns
    -------
    tuple
        ``(RMSE_train, R2_train, RMSE_test, R2_test)`` for this split.
    """
    # Level1 Holdout validation: 80% training / 20% testing
    training, testing = train_test_split(dataX, test_size=0.2, random_state=permutation)

    X_train = training.iloc[:, 2:].to_numpy()
    Y_train = training['bandgap_chain'].to_numpy()

    X_test = testing.iloc[:, 2:].to_numpy()
    Y_test = testing['bandgap_chain'].to_numpy()

    reg_ = lgb.LGBMRegressor()
    reg_.fit(X_train, Y_train)

    # Predict each partition exactly once (the earlier version predicted twice).
    Y_train_pred = reg_.predict(X_train)
    Y_test_pred = reg_.predict(X_test)

    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    RMSE_train = np.sqrt(mean_squared_error(Y_train, Y_train_pred))
    R2_train = r2_score(Y_train, Y_train_pred)

    RMSE_test = np.sqrt(mean_squared_error(Y_test, Y_test_pred))
    R2_test = r2_score(Y_test, Y_test_pred)

    return RMSE_train, R2_train, RMSE_test, R2_test

##########################################################################################################################

def _print_score_summary(header, rmse_scores, r2_scores):
    """Print best-run index, extreme value and mean for one partition's RMSE and R2 lists."""
    print(header)
    # argmin/argmax return the first best position, i.e. the random state of that run.
    best_rs_rmse = int(np.argmin(rmse_scores))  # best random state for RMSE
    print("Best random state with lowest RMSE:", best_rs_rmse)
    print("Min RMSE:", np.min(rmse_scores))
    print("Mean RMSE:", np.mean(rmse_scores))
    print()
    best_rs_r2 = int(np.argmax(r2_scores))  # best random state for R2
    print("Best random state with highest r2:", best_rs_r2)
    print("Max R2:", np.max(r2_scores))
    print("Mean R2:", np.mean(r2_scores))


def MCCV_scores(dataX, n_runs=50):
    """Run Monte Carlo cross-validation and print a summary of the scores.

    ``train_model(dataX, p)`` is called for every ``p`` in ``range(n_runs)``,
    so the run index doubles as the random state of the split. For both the
    training and the testing partition the function prints the random state
    with the lowest RMSE / highest R2, the corresponding extreme value, and
    the mean over all runs, preceded by the total wall-clock runtime.

    Parameters
    ----------
    dataX : pandas.DataFrame
        Dataset forwarded unchanged to ``train_model``.
    n_runs : int, default 50
        Number of random splits to evaluate.

    Returns
    -------
    None
        Results are only printed.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (there would be nothing to summarise).
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    start = time.time()

    RMSEscores_train = []
    R2scores_train = []
    RMSEscores_test = []
    R2scores_test = []

    for p in range(n_runs):
        RMSE_train, R2_train, RMSE_test, R2_test = train_model(dataX, p)
        RMSEscores_train.append(RMSE_train)
        R2scores_train.append(R2_train)
        RMSEscores_test.append(RMSE_test)
        R2scores_test.append(R2_test)

    stop = time.time()
    runtime = stop - start

    print(f"Runtime: {runtime} s")
    print()
    _print_score_summary(
        "Training set scores #########################################################################",
        RMSEscores_train,
        R2scores_train,
    )
    print()
    _print_score_summary(
        "Testing set scores ##########################################################################",
        RMSEscores_test,
        R2scores_test,
    )

    return None

### MCCV over 50 random 80/20 splits (random states 0–49) with LightGBM

In [4]:
# data1 was read from 'Bandgap_avalon_4096.csv'
MCCV_scores(data1)

Runtime: 75.32016015052795 s

Training set scores #########################################################################
Best random state with lowest RMSE: 41
Min RMSE: 0.3544269242692659
Mean RMSE: 0.3657090435160588

Best random state with highest r2: 29
Max R2: 0.9404425880989973
Mean R2: 0.9359067236857642

Testing set scores ##########################################################################
Best random state with lowest RMSE: 9
Min RMSE: 0.45800221085854115
Mean RMSE: 0.5232418478043187

Best random state with highest r2: 9
Max R2: 0.8980757128948389
Mean R2: 0.8692567552567947


In [5]:
# data2 was read from 'Bandgap_avalonc_4096.csv'
MCCV_scores(data2)

Runtime: 87.11384987831116 s

Training set scores #########################################################################
Best random state with lowest RMSE: 29
Min RMSE: 0.33118163127745476
Mean RMSE: 0.3477666546547582

Best random state with highest r2: 29
Max R2: 0.9481237417556079
Mean R2: 0.9420404384032589

Testing set scores ##########################################################################
Best random state with lowest RMSE: 9
Min RMSE: 0.4467044587326493
Mean RMSE: 0.5142894102825067

Best random state with highest r2: 9
Max R2: 0.9030421205847037
Mean R2: 0.873698969790941


In [6]:
# data3 was read from 'Bandgap_layered_4096.csv'
MCCV_scores(data3)

Runtime: 103.20585799217224 s

Training set scores #########################################################################
Best random state with lowest RMSE: 18
Min RMSE: 0.2918767440921636
Mean RMSE: 0.29956470695314635

Best random state with highest r2: 29
Max R2: 0.9591091412219314
Mean R2: 0.9569969479771063

Testing set scores ##########################################################################
Best random state with lowest RMSE: 43
Min RMSE: 0.4401156628362375
Mean RMSE: 0.5055746936179998

Best random state with highest r2: 43
Max R2: 0.9058187914379847
Mean R2: 0.8778840038037424


In [7]:
# data4 was read from 'Bandgap_rdkitfp_4096.csv'
MCCV_scores(data4)

Runtime: 124.42845559120178 s

Training set scores #########################################################################
Best random state with lowest RMSE: 38
Min RMSE: 0.26891265863741165
Mean RMSE: 0.279189855829641

Best random state with highest r2: 29
Max R2: 0.9656057374486416
Mean R2: 0.9626456957050066

Testing set scores ##########################################################################
Best random state with lowest RMSE: 43
Min RMSE: 0.4514898349912358
Mean RMSE: 0.5112170016247719

Best random state with highest r2: 43
Max R2: 0.9008879260652289
Mean R2: 0.8751721762321831
