### Model evaluation for individual feature sets (4096 bits reduced to 512 bits by RFE), 80/20 train/test split

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

# Model selection
from sklearn.model_selection import train_test_split

# metrics for regression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Gradient boosting regressor
import lightgbm as lgb

In [2]:
# Read the four 512-bit feature tables; each one is ordered by ascending
# 'bandgap_chain' and re-indexed from 0. The timer covers loading and sorting.
start = time.time()

data1, data2, data3, data4 = (
    pd.read_csv(csv_name).sort_values(
        by=["bandgap_chain"], ascending=True, ignore_index=True
    )
    for csv_name in (
        'Bandgap_avalon_512rfe.csv',
        'Bandgap_avalonc_512rfe.csv',
        'Bandgap_layered_512rfe.csv',
        'Bandgap_rdkitfp_512rfe.csv',
    )
)

stop = time.time()
runtime = stop - start

print(f"Runtime: {runtime} s")
print("All dataset imported.")

Runtime: 0.8257904052734375 s
All dataset imported.


### Monte Carlo Cross-validation (MCCV) and Holdout function

In [3]:
def train_model(dataX, permutation):
    """Fit a default LightGBM regressor on one random 80/20 split and score it.

    Parameters
    ----------
    dataX : pandas.DataFrame
        Dataset with a ``bandgap_chain`` target column. Every column from
        position 2 onward is used as a feature (assumes the first two columns
        are non-feature columns -- TODO confirm against the CSV layout).
    permutation : int
        Passed to ``train_test_split`` as ``random_state``; selects the split.

    Returns
    -------
    tuple
        ``(RMSE_train, R2_train, RMSE_test, R2_test)``.
    """
    # Level1 Holdout validation #####################################################
    training, testing = train_test_split(dataX, test_size=0.2, random_state=permutation)

    X_train = training.iloc[:, 2:].to_numpy()
    Y_train = training['bandgap_chain'].to_numpy()

    X_test = testing.iloc[:, 2:].to_numpy()
    Y_test = testing['bandgap_chain'].to_numpy()

    reg_ = lgb.LGBMRegressor()
    reg_.fit(X_train, Y_train)

    # Predict once per subset; the fitted model is reused for both.
    Y_train_pred = reg_.predict(X_train)
    Y_test_pred = reg_.predict(X_test)

    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas this form works on every version.
    RMSE_train = np.sqrt(mean_squared_error(Y_train, Y_train_pred))
    R2_train = r2_score(Y_train, Y_train_pred)

    RMSE_test = np.sqrt(mean_squared_error(Y_test, Y_test_pred))
    R2_test = r2_score(Y_test, Y_test_pred)

    return RMSE_train, R2_train, RMSE_test, R2_test

##########################################################################################################################

def _report_scores(header, rmse_scores, r2_scores):
    """Print min/mean/stdev of RMSE and max/mean of R2, plus the best run index."""
    print(header)
    # Runs are seeded 0..n-1 in order, so a position in the score list is
    # also the random state that produced that score.
    print("Best random state with lowest RMSE:", int(np.argmin(rmse_scores)))
    print("Min RMSE:", np.min(rmse_scores))
    print("Mean RMSE:", np.mean(rmse_scores))
    print("RMSE stdev:", np.std(rmse_scores))
    print()
    print("Best random state with highest r2:", int(np.argmax(r2_scores)))
    print("Max R2:", np.max(r2_scores))
    print("Mean R2:", np.mean(r2_scores))


def MCCV_scores(dataX, n_runs=50):
    """Run Monte Carlo cross-validation and print summary statistics.

    ``train_model`` is called once per random state in ``range(n_runs)``; the
    RMSE and R2 scores for the training and testing subsets are collected and
    summarised on stdout, together with the total wall-clock runtime.

    Parameters
    ----------
    dataX : pandas.DataFrame
        Dataset forwarded unchanged to ``train_model``.
    n_runs : int, optional
        Number of random splits to evaluate (default 50).

    Returns
    -------
    None
        Results are printed only.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1, since no statistics can be computed.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1, got {n_runs}")

    start = time.time()

    results = [train_model(dataX, seed) for seed in range(n_runs)]
    # Transpose [(rmse_tr, r2_tr, rmse_te, r2_te), ...] into four score lists.
    RMSEscores_train, R2scores_train, RMSEscores_test, R2scores_test = (
        list(column) for column in zip(*results)
    )

    stop = time.time()
    runtime = stop - start

    print(f"Runtime: {runtime} s")
    print()
    _report_scores(
        "Training set scores #########################################################################",
        RMSEscores_train,
        R2scores_train,
    )
    print()
    _report_scores(
        "Testing set scores ##########################################################################",
        RMSEscores_test,
        R2scores_test,
    )

    return None

### MCCV for 50 permutations for LightGBM

In [4]:
# data1 was read from 'Bandgap_avalon_512rfe.csv'
MCCV_scores(data1)

Runtime: 19.521605968475342 s

Training set scores #########################################################################
Best random state with lowest RMSE: 29
Min RMSE: 0.3563245430730041
Mean RMSE: 0.368017614314352
RMSE stdev: 0.0057883790647658105

Best random state with highest r2: 29
Max R2: 0.9399479779532283
Mean R2: 0.9350946391267203

Testing set scores ##########################################################################
Best random state with lowest RMSE: 43
Min RMSE: 0.4600662768026542
Mean RMSE: 0.517270175543674
RMSE stdev: 0.03321894011493544

Best random state with highest r2: 43
Max R2: 0.8970867221948375
Mean R2: 0.8722084940216689


In [5]:
# data2 was read from 'Bandgap_avalonc_512rfe.csv'
MCCV_scores(data2)

Runtime: 24.708958625793457 s

Training set scores #########################################################################
Best random state with lowest RMSE: 29
Min RMSE: 0.3316438282273176
Mean RMSE: 0.3478478320777969
RMSE stdev: 0.005466853722916689

Best random state with highest r2: 29
Max R2: 0.9479788437465139
Mean R2: 0.9420139893322984

Testing set scores ##########################################################################
Best random state with lowest RMSE: 9
Min RMSE: 0.4465447315135478
Mean RMSE: 0.5106759254935941
RMSE stdev: 0.03224274762432818

Best random state with highest r2: 9
Max R2: 0.9031114462578731
Mean R2: 0.875477738452224


In [6]:
# data3 was read from 'Bandgap_layered_512rfe.csv'
MCCV_scores(data3)

Runtime: 25.198978185653687 s

Training set scores #########################################################################
Best random state with lowest RMSE: 12
Min RMSE: 0.3012628452541206
Mean RMSE: 0.3089010976671086
RMSE stdev: 0.0036327399170108

Best random state with highest r2: 29
Max R2: 0.9567577257746667
Mean R2: 0.9542747774681211

Testing set scores ##########################################################################
Best random state with lowest RMSE: 43
Min RMSE: 0.4243436254442217
Mean RMSE: 0.49014581362680937
RMSE stdev: 0.030416793494345517

Best random state with highest r2: 43
Max R2: 0.9124480193937358
Mean R2: 0.8852122001809374


In [7]:
# data4 was read from 'Bandgap_rdkitfp_512rfe.csv'
MCCV_scores(data4)

Runtime: 20.677393436431885 s

Training set scores #########################################################################
Best random state with lowest RMSE: 38
Min RMSE: 0.2825162094090695
Mean RMSE: 0.2942806155250561
RMSE stdev: 0.0038556490063728463

Best random state with highest r2: 38
Max R2: 0.9617137397055288
Mean R2: 0.9585002624460011

Testing set scores ##########################################################################
Best random state with lowest RMSE: 43
Min RMSE: 0.43460585896808324
Mean RMSE: 0.4870133751985678
RMSE stdev: 0.029762038518670073

Best random state with highest r2: 43
Max R2: 0.9081621382149887
Mean R2: 0.8866936752683006
