In [17]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns

import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, rdmolops, AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import MACCSkeys


from scipy.stats import pearsonr


# sklearn ML models
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import  LGBMRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

from sklearn.model_selection import KFold

import joblib

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity='all'


# exp gap

In [18]:
df=pd.read_csv('../raw/conjugated-polymer-1096dp.csv')
df

Unnamed: 0,idx,Nickname,HOMO(eV),LUMO(eV),bandgap(eV),Ref.No,smiles_monomer,smiles_oligomer
0,1,P3HT,-5.20,-3.27,1.93,S10,CCCCCCc1cc(C)sc1C,Cc1cc(C)c(s1)-c1cc(C)c(s1)-c1cc(C)c(s1)-c1cc(C...
1,2,P3HST,-4.90,-3.08,1.82,S123,CCCCCCSc1cc(C)sc1C,CSc1cc(sc1C)-c1sc(cc1SC)-c1sc(cc1SC)-c1sc(cc1S...
2,3,POPT,-5.34,-3.58,1.76,S126,CCCCCCCCc1ccc(-c2cc(C)sc2C)cc1,Cc1cc(c(s1)-c1cc(c(s1)-c1cc(c(s1)-c1cc(c(s1)-c...
3,4,PT-C1,-5.15,-3.23,1.92,S122,CCCCC(CC)COC(=O)c1cc(C)sc1-c1ccc(C)s1,COC(=O)c1cc(C)sc1-c1ccc(s1)-c1cc(C(=O)OC)c(s1)...
4,5,PT-C2,-5.11,-3.22,1.89,S122,CCCCC(CC)COC(=O)c1cc(C)sc1-c1ccc(-c2ccc(C)s2)s1,COC(=O)c1cc(C)sc1-c1ccc(s1)-c1ccc(s1)-c1cc(C(=...
...,...,...,...,...,...,...,...,...
1091,1092,BTT-NTz,-5.41,-3.73,1.68,S115,CCCCCCCCCCCCc1cc(C)sc1-c1cc2c3sc(C(CCCCCCCC)CC...,Cc1cc(C)c(s1)-c1cc2c3sc(C)cc3c3cc(sc3c2s1)-c1s...
1092,1093,PIDTI-BT,-5.30,-3.65,1.65,S441,CCCCCCCCCCCCC(CCCCCCCCCC)Cn1c2cc3c(cc2c2sc(C)c...,Cc1cc2n(C)c3cc4-c5cc6c(cc5C(c4cc3c2s1)(c1ccccc...
1093,1094,PIDTI-DTBT,-5.26,-3.60,1.66,S441,CCCCCCCCCCCCC(CCCCCCCCCC)Cn1c2cc3c(cc2c2sc(C)c...,Cc1ccc(s1)-c1ccc(-c2ccc(s2)-c2cc3n(C)c4cc5-c6c...
1094,1095,poly(DPP4T-alt-TBP),-5.42,-3.90,1.52,S464,CCCCCCCCCCCCC(CCCCCCCCCC)CN1C(=O)C2=C(c3ccc(-c...,CN1C(=O)C2=C(N(C)C(=O)C2=C1c1ccc(C)s1)c1ccc(s1...


In [19]:
df_exp = df['bandgap(eV)']
df_exp

0       1.93
1       1.82
2       1.76
3       1.92
4       1.89
        ... 
1091    1.68
1092    1.65
1093    1.66
1094    1.52
1095    1.73
Name: bandgap(eV), Length: 1096, dtype: float64

# DFT gap

In [20]:
df = pd.read_csv('../raw/oligomer-DFT-TDDFT-calculations-1096dp.csv')
df


Unnamed: 0,idx,HOMO-B3LYP(eV),LUMO-B3LYP(eV),HOMO-LUMO-B3LYP(eV),HOMO-wB97XD(eV),LUMO-wB97XD(eV),HOMO-LUMO-wB97XD(eV),HOMO-camB3LYP(eV),LUMO-camB3LYP(eV),HOMO-LUMO-camB3LYP(eV),HOMO-PBE(eV),LUMO-PBE(eV),HOMO-LUMO-PBE(eV),S1-state-B3LYP(eV)
0,1,-4.58593,-1.79867,2.78726,-6.33971,-0.268032,6.07168,-5.77398,-0.790218,4.98376,-4.03082,-2.26780,1.76303,2.4954
1,2,-4.99329,-2.27297,2.72032,-6.76774,-0.755116,6.01263,-6.19766,-1.277570,4.92009,-4.42729,-2.73937,1.68792,2.3866
2,3,-4.97206,-1.77228,3.19979,-6.78652,-0.174153,6.61237,-6.20828,-0.708040,5.50024,-4.39545,-2.31188,2.08358,2.7435
3,4,-4.90784,-2.20820,2.69964,-6.65345,-0.674026,5.97943,-6.08392,-1.195940,4.88798,-4.34947,-2.66644,1.68302,2.3922
4,5,-4.83002,-2.16630,2.66372,-6.57481,-0.636202,5.93861,-6.00637,-1.157570,4.84880,-4.28117,-2.62617,1.65500,2.3712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,1092,-4.80172,-2.87134,1.93038,-6.51413,-1.471590,5.04254,-5.94487,-1.973640,3.97123,-4.28797,-3.26373,1.02424,1.6818
1092,1093,-4.52852,-2.59787,1.93065,-6.21154,-1.178520,5.03302,-5.63765,-1.684110,3.95354,-4.05967,-3.02210,1.03757,1.6425
1093,1094,-4.61206,-2.56767,2.04439,-6.31277,-1.099340,5.21343,-5.74024,-1.616900,4.12334,-4.12225,-3.00631,1.11594,1.8037
1094,1095,-4.69043,-2.74481,1.94561,-6.37046,-1.334990,5.03547,-5.79820,-1.834590,3.96361,-4.17994,-3.13611,1.04383,1.7335


In [21]:
df_dft= df[['HOMO-LUMO-B3LYP(eV)']]
df_dft = df_dft.rename(columns={'HOMO-LUMO-B3LYP(eV)':'HOMO-LUMO(eV)'})
df_dft

Unnamed: 0,HOMO-LUMO(eV)
0,2.78726
1,2.72032
2,3.19979
3,2.69964
4,2.66372
...,...
1091,1.93038
1092,1.93065
1093,2.04439
1094,1.94561


# RDKit-209

In [22]:
df_rdkit_screen = pd.read_csv('../raw/monomer-rdkit-54features-1096dp.csv')
df_rdkit_screen = df_rdkit_screen.iloc[:,1:]
df_rdkit_screen

Unnamed: 0,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,FpDensityMorgan1,BalabanJ,PEOE_VSA1,PEOE_VSA11,PEOE_VSA12,...,NumAromaticHeterocycles,NumHAcceptors,fr_C_O,fr_aniline,fr_bicyclic,fr_halogen,fr_imide,fr_ketone_Topliss,fr_oxazole,fr_pyridine
0,2.353121,1.287870,1.287870,0.608740,196.359,1.153846,2.318302,0.000000,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
1,2.323896,1.288704,1.288704,0.485488,228.426,1.214286,2.209145,0.000000,0.0,0.0,...,1,2,0,0,0,0,0,0,0,0
2,2.313310,1.230210,1.230210,0.463323,300.511,0.857143,1.833656,0.000000,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
3,12.542211,0.175600,-0.175600,0.510163,350.549,1.173913,2.074957,4.736863,0.0,0.0,...,2,4,1,0,0,0,0,0,0,0
4,12.817589,0.184059,-0.184059,0.318494,432.676,0.964286,1.756705,4.736863,0.0,0.0,...,3,5,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,5.448759,0.613257,0.613257,0.035659,2093.634,0.211268,0.950806,0.000000,0.0,0.0,...,9,13,0,0,11,0,0,0,0,0
1092,6.964899,0.506884,-0.861916,0.035381,2097.349,0.245033,0.981006,28.081651,0.0,0.0,...,5,11,0,0,13,0,0,0,0,0
1093,7.004105,0.497358,-0.874785,0.035381,2261.603,0.223602,0.904925,28.081651,0.0,0.0,...,7,13,0,0,13,0,0,0,0,0
1094,17.056452,0.015571,0.015571,0.035580,2289.808,0.162500,0.900316,19.599639,0.0,0.0,...,6,10,4,0,2,0,0,0,0,0


# MorganFP-1024

In [23]:
Morgan_fingerprints = pd.read_csv('../raw/monomer-ECFP6-1024keys-1096dp.csv')
Morgan_fingerprints = Morgan_fingerprints.iloc[:,1:]
Morgan_fingerprints

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_1014,f_1015,f_1016,f_1017,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,0,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1092,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1093,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1094,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# MACCS-167

In [24]:
MACCS_fingerprints = pd.read_csv('../raw/monomer-MACCS-167keys-1096dp.csv')
MACCS_fingerprints = MACCS_fingerprints.iloc[:,1:]
MACCS_fingerprints


Unnamed: 0,m_0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,...,m_157,m_158,m_159,m_160,m_161,m_162,m_163,m_164,m_165,m_166
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,0,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,1,0
1092,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1093,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1094,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0


# Model loading

In [25]:
def acc(y_test,y_pred):
    MSE = mean_squared_error(y_test,y_pred)
    RMSE = MSE ** 0.5
    R2 = r2_score(y_test,y_pred)
    r, p_value = pearsonr(y_test,y_pred)
    MAE = mean_absolute_error(y_test,y_pred)
    return RMSE, R2, r, MAE

In [26]:
y = df_exp
y

0       1.93
1       1.82
2       1.76
3       1.92
4       1.89
        ... 
1091    1.68
1092    1.65
1093    1.66
1094    1.52
1095    1.73
Name: bandgap(eV), Length: 1096, dtype: float64

In [27]:
# X = pd.concat([df_rdkit_screen, Morgan_fingerprints, MACCS_fingerprints, df_dft], axis = 1)
# X = pd.concat([Morgan_fingerprints, MACCS_fingerprints, df_dft], axis = 1)
# X = pd.concat([df_rdkit_screen, MACCS_fingerprints, df_dft], axis = 1)
# X = pd.concat([df_rdkit_screen, Morgan_fingerprints, df_dft], axis = 1)
# X = pd.concat([Morgan_fingerprints, df_dft], axis = 1)
# X = pd.concat([MACCS_fingerprints, df_dft], axis = 1)
X = pd.concat([df_rdkit_screen, df_dft], axis = 1)
X

Unnamed: 0,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,FpDensityMorgan1,BalabanJ,PEOE_VSA1,PEOE_VSA11,PEOE_VSA12,...,NumHAcceptors,fr_C_O,fr_aniline,fr_bicyclic,fr_halogen,fr_imide,fr_ketone_Topliss,fr_oxazole,fr_pyridine,HOMO-LUMO(eV)
0,2.353121,1.287870,1.287870,0.608740,196.359,1.153846,2.318302,0.000000,0.0,0.0,...,1,0,0,0,0,0,0,0,0,2.78726
1,2.323896,1.288704,1.288704,0.485488,228.426,1.214286,2.209145,0.000000,0.0,0.0,...,2,0,0,0,0,0,0,0,0,2.72032
2,2.313310,1.230210,1.230210,0.463323,300.511,0.857143,1.833656,0.000000,0.0,0.0,...,1,0,0,0,0,0,0,0,0,3.19979
3,12.542211,0.175600,-0.175600,0.510163,350.549,1.173913,2.074957,4.736863,0.0,0.0,...,4,1,0,0,0,0,0,0,0,2.69964
4,12.817589,0.184059,-0.184059,0.318494,432.676,0.964286,1.756705,4.736863,0.0,0.0,...,5,1,0,0,0,0,0,0,0,2.66372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,5.448759,0.613257,0.613257,0.035659,2093.634,0.211268,0.950806,0.000000,0.0,0.0,...,13,0,0,11,0,0,0,0,0,1.93038
1092,6.964899,0.506884,-0.861916,0.035381,2097.349,0.245033,0.981006,28.081651,0.0,0.0,...,11,0,0,13,0,0,0,0,0,1.93065
1093,7.004105,0.497358,-0.874785,0.035381,2261.603,0.223602,0.904925,28.081651,0.0,0.0,...,13,0,0,13,0,0,0,0,0,2.04439
1094,17.056452,0.015571,0.015571,0.035580,2289.808,0.162500,0.900316,19.599639,0.0,0.0,...,10,4,0,2,0,0,0,0,0,1.94561


In [28]:
models = {
    'hgbr': HistGradientBoostingRegressor(),
    'lgbm': LGBMRegressor(force_col_wise=True, verbose=-1),
    'gbr': GradientBoostingRegressor(),
    'xgb': XGBRegressor(),
    'adaboost': AdaBoostRegressor(),
    'rf': RandomForestRegressor()
}

In [29]:
folder = '6Model-DFT-RDKit'
model_name='xgb'
foldername = model_name+'-10fold-10subfold'

In [30]:
xfold=10

models = []

for fold_idx in range(xfold):
    for fold_idx_sub in range(xfold):
        model_filename = folder+'/'+foldername + f'/model_fold_{fold_idx + 1}_subfold_{fold_idx_sub + 1}.pkl'
        model = joblib.load(model_filename)
        models.append(model)

len(models)

100

In [31]:
xfold=10
kf = KFold(n_splits=xfold, shuffle=True, random_state=42)

# save index for train and test of each fold
train_idx_list = []
test_idx_list = []
for fold_idx, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_idx_list.append(train_index)
    test_idx_list.append(test_index)

print("Start predicting...")
scores = []

for i in range(xfold):
    train_index = train_idx_list[i]
    test_index = test_idx_list[i]

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index] 

    predictions = []

    for j in range(xfold):
        model = models[i*xfold+j]     

        y_pred = model.predict(X_test)
        predictions.append(y_pred)

    df_predictions = pd.DataFrame(predictions)
    df_predictions = df_predictions.T
    df_predictions['mean'] = df_predictions.iloc[:,:4].mean(axis = 1)

    RMSE_test, R2_test, r_test, MAE_test = acc(y_test,df_predictions['mean'])
    scores.append([RMSE_test, R2_test, r_test, MAE_test])

scores_df = pd.DataFrame(scores, columns = ['RMSE', 'R2', 'r', 'MAE'])
scores_list = []

scores_list.append(scores_df.iloc[:xfold,:].mean().values)

Start predicting...


In [32]:
df_scores = pd.DataFrame(scores_list, columns = ['RMSE', 'R2', 'r', 'MAE'])
df_scores.round(3)

Unnamed: 0,RMSE,R2,r,MAE
0,0.104,0.738,0.861,0.07
