### Model Evaluation for LGB-Stack

In [1]:
import numpy as np
import pandas as pd
import time

# Model selection
from sklearn.model_selection import train_test_split

# metrics for regression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Regressors
import lightgbm as lgb

In [2]:
start = time.time()

# Original data without any features, ordered by ascending band gap
data0 = pd.read_excel('Bandgap_chain4209.xlsx').sort_values(
    by=["bandgap_chain"], ascending=True, ignore_index=True
)

# Datasets with features: each file is read and ordered the same way as data0
data1, data2, data3, data4 = (
    pd.read_csv(csv_name).sort_values(
        by=["bandgap_chain"], ascending=True, ignore_index=True
    )
    for csv_name in (
        'Bandgap_avalon_256rfe.csv',
        'Bandgap_avalonc_256rfe.csv',
        'Bandgap_layered_512rfe.csv',
        'Bandgap_rdkitfp_512rfe.csv',
    )
)

stop = time.time()
runtime = stop - start

print(f"Runtime: {runtime} s")
print("All dataset imported.")

Runtime: 1.0938231945037842 s
All dataset imported.


### Functions for stacking regression

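The functions below implement the stacking regression from level 1 (one weak LightGBM model per feature dataset) to level 2 (a final LightGBM model trained on the level-1 predictions). The pseudo-random permutation used for each train/test split is set through the `permutation` argument.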

In [3]:
def train_weak_model(size, permutation, dataX, feature): ### start with training the weak models
    """Fit one level-1 (weak) LightGBM regressor and return its predictions.

    Parameters
    ----------
    size : float or int
        Passed to ``train_test_split`` as ``test_size``.
    permutation : int
        Passed to ``train_test_split`` as ``random_state``.
    dataX : pandas.DataFrame
        Dataset whose first two columns are kept as identifying columns (one
        of them must be ``'bandgap_chain'``, the regression target) and whose
        remaining columns (third onwards) are used as model inputs.
    feature : str
        Name of the column that will hold the level-1 predictions.

    Returns
    -------
    pandas.DataFrame
        The first two columns of ``dataX`` plus ``feature``, one row per input
        row, sorted by ascending ``'bandgap_chain'`` with a fresh 0..n-1 index.
        The sort is stable, so rows with equal targets keep the order they
        had in ``dataX``.
    """
    # Level1 Holdout validation #####################################################
    # Split row *positions* instead of the frame itself. For a given length,
    # test_size and random_state this selects the same rows, but it lets every
    # prediction be written back to the exact row it was computed for.
    n_rows = len(dataX)
    train_pos, test_pos = train_test_split(
        np.arange(n_rows), test_size=size, random_state=permutation
    )

    X_all = dataX.iloc[:, 2:].to_numpy()
    Y_all = dataX['bandgap_chain'].to_numpy()

    reg_ = lgb.LGBMRegressor()
    reg_.fit(X_all[train_pos], Y_all[train_pos])

    # NOTE(review): predictions for the training rows are in-sample; the test
    # rows are the only ones predicted out-of-sample.
    level1_pred = np.empty(n_rows, dtype=float)
    level1_pred[train_pos] = reg_.predict(X_all[train_pos])
    level1_pred[test_pos] = reg_.predict(X_all[test_pos])

    newdata = dataX.iloc[:, :2].reset_index(drop=True)
    newdata[feature] = level1_pred

    # A stable sort is required here: the caller joins this column to other
    # frames by row position, so rows with tied targets must not be shuffled
    # (the default quicksort gives no such guarantee).
    newdata = newdata.sort_values(
        by=["bandgap_chain"], ascending=True, kind="mergesort", ignore_index=True
    )

    return newdata

In [4]:
def train_final_model(size, permutation): ### Training the final strong model
    """Train the level-2 LightGBM model on the level-1 predictions and score it.

    Reads the module-level frames ``data0`` (base table) and ``data1``..``data4``
    (feature tables). One weak model is trained per feature table with the
    same ``size``/``permutation``; their prediction columns are appended to
    ``data0`` by row position, and a default ``LGBMRegressor`` is fitted on
    those four columns after a train/test split with the same settings.

    Parameters
    ----------
    size : float or int
        Passed to ``train_test_split`` as ``test_size``.
    permutation : int
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(RMSE_train, R2_train, RMSE_test, R2_test)`` of the final model.

    Raises
    ------
    ValueError
        If a level-1 frame does not have the same number of rows as ``data0``.
    """
    # Get the weak models ###########################################
    level1_frames = [
        train_weak_model(size, permutation, data1, 'Avalon_L'),
        train_weak_model(size, permutation, data2, 'AvalonC_L'),
        train_weak_model(size, permutation, data3, 'Layered_L'),
        train_weak_model(size, permutation, data4, 'RDK_L'),
    ]

    # The join below is positional; a row-count mismatch would silently pad
    # with NaN and shift the train/test membership, so fail loudly instead.
    for level1 in level1_frames:
        if len(level1) != len(data0):
            raise ValueError(
                f"Level-1 frame has {len(level1)} rows but data0 has {len(data0)}; "
                "the datasets must describe the same rows."
            )

    newfinal = pd.concat(
        [data0] + [level1.iloc[:, 2] for level1 in level1_frames],
        axis=1,
    ).reset_index(drop=True)

    # Train the final models
    finaltrain, finaltest = train_test_split(newfinal, test_size=size, random_state=permutation)

    final_X_train = finaltrain.iloc[:,2:].to_numpy()
    final_Y_train = finaltrain['bandgap_chain'].to_numpy()

    final_X_test = finaltest.iloc[:,2:].to_numpy()
    final_Y_test = finaltest['bandgap_chain'].to_numpy()

    reg_ = lgb.LGBMRegressor()
    reg_.fit(final_X_train, final_Y_train)

    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    final_Y_train_pred = reg_.predict(final_X_train)
    final_RMSE_train = float(np.sqrt(mean_squared_error(final_Y_train, final_Y_train_pred)))
    final_R2_train = r2_score(final_Y_train, final_Y_train_pred)

    final_Y_test_pred = reg_.predict(final_X_test)
    final_RMSE_test = float(np.sqrt(mean_squared_error(final_Y_test, final_Y_test_pred)))
    final_R2_test = r2_score(final_Y_test, final_Y_test_pred)

    return final_RMSE_train, final_R2_train, final_RMSE_test, final_R2_test

In [5]:
# Test one instance: a single stacked run with test_size=0.9 and random_state=47.
# The displayed tuple is (train RMSE, train R2, test RMSE, test R2).

train_final_model(0.9, 47)

(0.2212272563570638,
 0.9774203619931507,
 0.6361552378732681,
 0.8056143120348453)

### Monte Carlo Cross-validation

In [6]:
def MCCV_scores(size, n_permutations=50):
    """Run Monte Carlo cross-validation of the stacked model and print a summary.

    ``train_final_model(size, p)`` is called for ``p = 0 .. n_permutations - 1``
    and the train/test RMSE and R2 of every run are collected. The best value,
    the random state that produced it, and the mean over all runs are printed
    for each metric, together with the total runtime.

    Parameters
    ----------
    size : float or int
        Forwarded to ``train_final_model`` (used there as the test size).
    n_permutations : int, default 50
        Number of random states (0-based, consecutive) to evaluate.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``n_permutations`` is smaller than 1.
    """
    if n_permutations < 1:
        raise ValueError("n_permutations must be at least 1")

    start = time.time()

    RMSEscores_train = []
    R2scores_train = []
    RMSEscores_test = []
    R2scores_test = []

    for p in range(n_permutations):
        final_RMSE_train, final_R2_train, final_RMSE_test, final_R2_test = train_final_model(size, p)
        RMSEscores_train.append(final_RMSE_train)
        R2scores_train.append(final_R2_train)
        RMSEscores_test.append(final_RMSE_test)
        R2scores_test.append(final_R2_test)

    stop = time.time()
    runtime = stop - start

    # Random states start at 0, so the list position of the best score is the
    # random state itself (first occurrence wins on ties).
    best_rs_rmse_train = int(np.argmin(RMSEscores_train))
    best_rs_r2_train = int(np.argmax(R2scores_train))
    best_rs_rmse_test = int(np.argmin(RMSEscores_test))
    best_rs_r2_test = int(np.argmax(R2scores_test))

    print(f"Runtime: {runtime} s")
    print()
    print("MCCV aggregation results ###############################################################")
    print()
    print("Training set scores #########################################################################")
    print("Best random state with lowest RMSE:", best_rs_rmse_train)
    print("Min RMSE:", np.min(RMSEscores_train))
    print("Mean RMSE:", np.mean(RMSEscores_train))
    print()
    print("Best random state with highest r2:", best_rs_r2_train)
    print("Max R2:", np.max(R2scores_train))
    print("Mean R2:", np.mean(R2scores_train))

    # Show RMSE and R2 for best random states
    print()
    print("Testing set scores ##########################################################################")
    print("Best random state with lowest RMSE:", best_rs_rmse_test)
    print("Min RMSE:", np.min(RMSEscores_test))
    print("Mean RMSE:", np.mean(RMSEscores_test))
    print()
    print("Best random state with highest r2:", best_rs_r2_test)
    print("Max R2:", np.max(R2scores_test))
    print("Mean R2:", np.mean(R2scores_test))

    return None

### MCCV for the various train-test splits

In [7]:
# 10:90 train:test split (0.9 is the held-out test fraction)
MCCV_scores(0.9)

Runtime: 35.159157514572144 s

MCCV aggregation results ###############################################################

Training set scores #########################################################################
Best random state with lowest RMSE: 28
Min RMSE: 0.16265882470920795
Mean RMSE: 0.20883122948686558

Best random state with highest r2: 41
Max R2: 0.9873852586954673
Mean R2: 0.9788287482861532

Testing set scores ##########################################################################
Best random state with lowest RMSE: 32
Min RMSE: 0.6304789675127996
Mean RMSE: 0.6607105100439199

Best random state with highest r2: 32
Max R2: 0.8090624130550512
Mean R2: 0.7911082073907346


In [8]:
# 20:80 train:test split (0.8 is the held-out test fraction)
MCCV_scores(0.8)

Runtime: 57.496675968170166 s

MCCV aggregation results ###############################################################

Training set scores #########################################################################
Best random state with lowest RMSE: 24
Min RMSE: 0.13566493000013932
Mean RMSE: 0.16737690411843864

Best random state with highest r2: 24
Max R2: 0.9915242737021744
Mean R2: 0.9864696215805844

Testing set scores ##########################################################################
Best random state with lowest RMSE: 43
Min RMSE: 0.5537012875916546
Mean RMSE: 0.5878171070025864

Best random state with highest r2: 11
Max R2: 0.8496641733930128
Mean R2: 0.8346705119201421


In [9]:
# 30:70 train:test split (0.7 is the held-out test fraction)
MCCV_scores(0.7)

Runtime: 61.96960091590881 s

MCCV aggregation results ###############################################################

Training set scores #########################################################################
Best random state with lowest RMSE: 1
Min RMSE: 0.15704854116021041
Mean RMSE: 0.17623104026317202

Best random state with highest r2: 1
Max R2: 0.988491011960953
Mean R2: 0.9850431202079575

Testing set scores ##########################################################################
Best random state with lowest RMSE: 33
Min RMSE: 0.5110446397647029
Mean RMSE: 0.549465265587994

Best random state with highest r2: 33
Max R2: 0.8744140163807679
Mean R2: 0.8556829701007718


In [10]:
# 40:60 train:test split (0.6 is the held-out test fraction)
MCCV_scores(0.6)

Runtime: 64.67854356765747 s

MCCV aggregation results ###############################################################

Training set scores #########################################################################
Best random state with lowest RMSE: 22
Min RMSE: 0.1682876668472464
Mean RMSE: 0.1895469044516064

Best random state with highest r2: 1
Max R2: 0.985986310950362
Mean R2: 0.9827022505440516

Testing set scores ##########################################################################
Best random state with lowest RMSE: 43
Min RMSE: 0.494742496072873
Mean RMSE: 0.5218384083292849

Best random state with highest r2: 43
Max R2: 0.8825781017741147
Mean R2: 0.8700056997436338


In [11]:
# 50:50 train:test split (0.5 is the held-out test fraction)
MCCV_scores(0.5)

Runtime: 65.27960014343262 s

MCCV aggregation results ###############################################################

Training set scores #########################################################################
Best random state with lowest RMSE: 27
Min RMSE: 0.1911196751059381
Mean RMSE: 0.2026755998238081

Best random state with highest r2: 27
Max R2: 0.982059787021122
Mean R2: 0.9802254389938723

Testing set scores ##########################################################################
Best random state with lowest RMSE: 43
Min RMSE: 0.46651973116748663
Mean RMSE: 0.5049458351274374

Best random state with highest r2: 43
Max R2: 0.8940936478203195
Mean R2: 0.8785130102438746


In [12]:
# 60:40 train:test split (0.4 is the held-out test fraction)
MCCV_scores(0.4)

Runtime: 71.37469220161438 s

MCCV aggregation results ###############################################################

Training set scores #########################################################################
Best random state with lowest RMSE: 1
Min RMSE: 0.20006786166778312
Mean RMSE: 0.21188877276882223

Best random state with highest r2: 1
Max R2: 0.981049354711487
Mean R2: 0.9784302499759377

Testing set scores ##########################################################################
Best random state with lowest RMSE: 7
Min RMSE: 0.4354162003762759
Mean RMSE: 0.4869327077889779

Best random state with highest r2: 7
Max R2: 0.9078012029052973
Mean R2: 0.8869672091790097


In [13]:
# 70:30 train:test split (0.3 is the held-out test fraction)
MCCV_scores(0.3)

Runtime: 71.81651973724365 s

MCCV aggregation results ###############################################################

Training set scores #########################################################################
Best random state with lowest RMSE: 27
Min RMSE: 0.2115031562750035
Mean RMSE: 0.22207463762008955

Best random state with highest r2: 27
Max R2: 0.9782743322110415
Mean R2: 0.9763432495601747

Testing set scores ##########################################################################
Best random state with lowest RMSE: 7
Min RMSE: 0.41491083483385377
Mean RMSE: 0.4768835381349199

Best random state with highest r2: 7
Max R2: 0.9146644649310318
Mean R2: 0.8915112520210563


In [14]:
# 80:20 train:test split (0.2 is the held-out test fraction)
MCCV_scores(0.2)

Runtime: 72.69326329231262 s

MCCV aggregation results ###############################################################

Training set scores #########################################################################
Best random state with lowest RMSE: 27
Min RMSE: 0.22258128233212077
Mean RMSE: 0.22962904474515294

Best random state with highest r2: 27
Max R2: 0.9759681927656504
Mean R2: 0.9747328818732822

Testing set scores ##########################################################################
Best random state with lowest RMSE: 7
Min RMSE: 0.40809220969436377
Mean RMSE: 0.46690312706877746

Best random state with highest r2: 7
Max R2: 0.9196825826706232
Mean R2: 0.8958055200403375


In [15]:
# 90:10 train:test split (0.1 is the held-out test fraction)
MCCV_scores(0.1)

Runtime: 73.7156138420105 s

MCCV aggregation results ###############################################################

Training set scores #########################################################################
Best random state with lowest RMSE: 27
Min RMSE: 0.22947084952175004
Mean RMSE: 0.23674677145497738

Best random state with highest r2: 27
Max R2: 0.9745931297742445
Mean R2: 0.9731767368600528

Testing set scores ##########################################################################
Best random state with lowest RMSE: 23
Min RMSE: 0.40255659454380704
Mean RMSE: 0.4541223775250471

Best random state with highest r2: 15
Max R2: 0.921382785105989
Mean R2: 0.9008139618643811


In [16]:
# 95:5 train:test split (0.05 is the held-out test fraction)
MCCV_scores(0.05)

Runtime: 75.14671277999878 s

MCCV aggregation results ###############################################################

Training set scores #########################################################################
Best random state with lowest RMSE: 17
Min RMSE: 0.23546089981279641
Mean RMSE: 0.23908051530980512

Best random state with highest r2: 41
Max R2: 0.9735492825667182
Mean R2: 0.9726632706415779

Testing set scores ##########################################################################
Best random state with lowest RMSE: 31
Min RMSE: 0.3409982736373269
Mean RMSE: 0.44577332377256346

Best random state with highest r2: 47
Max R2: 0.9419952840085003
Mean R2: 0.9026957003421093
