In [5]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns

import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, rdmolops, AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import MACCSkeys


from scipy.stats import pearsonr


# sklearn ML models
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import  LGBMRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

from sklearn.model_selection import KFold

import joblib

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity='all'




# Parameter setup

In [6]:

acceptor_list={
    'BT':21,
    'BTA':20,
    'QA':23,
    'DPP':19,
    'TPD':20,
    'PDI':28,
    'NDI':26,
    'DTBT':20,
    'BBX':21,
    'Y6':29
}

In [7]:
group='1'
acceptor='QA'
dp=acceptor_list[acceptor]
dp

23

In [8]:
folder='raw/group'+group+'/'+acceptor+'/'
folder

'raw/group1/QA/'

# exp gap

In [9]:
df = pd.read_csv(folder+'new-polymer-'+acceptor+'-'+str(dp)+'dp.csv')
df

Unnamed: 0,idx,Name,bandgap(eV),Year,Reference,smiles_monomer,smiles_oligomer
0,1,P1,1.82,2022,https://doi.org/10.1016/j.eurpolymj.2022.111141,CC1=CC2=C(S1)C(OCC(CCCC)CC)=C3C(SC(C4=CC=C(C5=...,COC1=C2C=C(SC2=C(OC)C2=C1SC(C)=C2)C1=CC=C(S1)C...
1,2,P10,1.7,2022,https://doi.org/10.1016/j.eurpolymj.2022.111141,CC1=CC2=C(S1)C(OCC(CCCC)CC)=C3C(SC(C4=CC=C(C5=...,COC1=C2C=C(SC2=C(OC)C2=C1SC(C)=C2)C1=CC=C([Se]...
2,3,P57,1.68,2019,https://doi.org/10.1016/j.orgel.2018.11.022,CCCCCCCCC1=C(CCCCCCCC)C=C(S1)C1=C2C=C(SC2=C(C2...,CC1=CC=C(S1)C1=CC=C(C2=CC=C(S2)C2=CC3=C(C4=CC=...
3,4,P58,1.69,2019,https://doi.org/10.1016/j.orgel.2018.11.022,CCCCCCCCC1=C(CCCCCCCC)C=C(S1)C1=C2C=C(SC2=C(C2...,CC1=CC=C(S1)C1=CC=C(C2=CC=C(S2)C2=CC3=C(C4=CC=...
4,5,P59,1.71,2019,https://doi.org/10.1016/j.orgel.2018.11.022,CCCCCCCCC1=C(CCCCCCCC)C=C(S1)C1=C2C=C(SC2=C(C2...,CC1=CC=C(S1)C1=C(F)C(F)=C(C2=CC=C(S2)C2=CC3=C(...
5,6,P60,1.79,2022,https://doi.org/10.1016/j.eurpolymj.2022.111141,CC(S1)=CC2=C1C(C3=CC=C(CC(CCCC)CC)[Se]3)=C(C=C...,CC1=CC=C(S1)C1=CC=C(C2=CC=C(S2)C2=CC3=C(C4=CC=...
6,7,P61,1.79,2022,https://doi.org/10.1016/j.eurpolymj.2022.111141,CC(S1)=CC2=C1C(C3=CC=C(CC(CCCC)CC)[Se]3)=C(C=C...,CC1=CC2=C(S1)C(C1=CC=C[Se]1)=C1C=C(SC1=C2C1=CC...
7,8,P62,1.67,2020,https://doi.org/10.1016/j.dyepig.2020.108479,CCCCCCCCCCC(CCCCCCCC)CC1=CC=C([Se]1)C1=C2SC(=C...,CC1=CC2=C(S1)C(C1=CC=C[Se]1)=C1C=C(SC1=C2C1=CC...
8,9,P63,1.45,2020,https://doi.org/10.1016/j.dyepig.2020.108479,CC1=CC2=C(S1)C(C3=CC=C(CC(CCCCCCCCCC)CCCCCCCC)...,CC1=CC2=C(S1)C(C1=CC=C[Se]1)=C1C=C(SC1=C2C1=CC...
9,10,P64,1.64,2020,https://doi.org/10.1016/j.dyepig.2020.108479,CC1=CC2=C(S1)C(C3=CC=C(CC(CCCCCCCCCC)CCCCCCCC)...,CC1=CC2=C(S1)C(C1=CC=C[Se]1)=C1C=C(SC1=C2C1=CC...


# exp gap

In [11]:
df_exp = df['bandgap(eV)']
df_exp

0     1.82
1     1.70
2     1.68
3     1.69
4     1.71
5     1.79
6     1.79
7     1.67
8     1.45
9     1.64
10    1.88
11    1.86
12    1.94
13    1.87
14    1.84
15    1.85
16    1.76
17    1.74
18    1.64
19    1.58
20    1.74
21    1.71
22    1.72
Name: bandgap(eV), dtype: float64

# DFT gap

In [12]:
df = pd.read_csv(folder+'oligomer-DFT-'+acceptor+'-'+str(dp)+'dp.csv')
df_dft = df[['HOMO-LUMO(eV)']]
df_dft

Unnamed: 0,HOMO-LUMO(eV)
0,2.26127
1,1.97772
2,2.21011
3,2.18643
4,2.20956
5,2.23786
6,2.04194
7,2.1731
8,2.1633
9,2.23623


# RDKit 209

In [13]:
df_rdkit_screen = pd.read_csv(folder+acceptor+'-RDKit-54features.csv')
df_rdkit_screen = df_rdkit_screen.iloc[:,1:]
df_rdkit_screen

Unnamed: 0,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,FpDensityMorgan1,BalabanJ,PEOE_VSA1,PEOE_VSA11,PEOE_VSA12,...,NumAromaticHeterocycles,NumHAcceptors,fr_C_O,fr_aniline,fr_bicyclic,fr_halogen,fr_imide,fr_ketone_Topliss,fr_oxazole,fr_pyridine
0,7.180939,0.496772,0.496772,0.034891,1544.436,0.296296,1.170647,28.421178,22.998047,0,...,5,12,0,0,3,0,0,0,0,0
1,7.19582,0.071641,-0.071641,0.027551,1638.222,0.333333,1.170647,0.0,0.0,0,...,5,10,0,0,3,0,0,0,0,0
2,5.743851,0.911378,0.911378,0.03701,1276.048,0.306818,1.061164,0.0,0.0,0,...,7,8,0,0,3,0,0,0,0,0
3,14.115373,0.269418,-4.581205,0.029224,1412.042,0.322917,1.078304,0.0,0.0,0,...,7,8,0,0,3,6,0,0,0,0
4,17.886644,0.013036,-4.677777,0.029224,1448.022,0.326531,1.102079,0.0,11.634442,0,...,7,8,0,0,3,8,0,0,0,0
5,7.01346,0.265769,0.265769,0.027961,1770.478,0.310345,1.041986,18.947452,22.998047,0,...,7,10,0,0,3,0,0,0,0,0
6,7.024114,0.034682,-0.034682,0.027961,1864.264,0.318966,1.041986,0.0,0.0,0,...,7,8,0,0,3,0,0,0,0,0
7,5.9643,0.274964,0.274964,0.028047,1650.374,0.333333,1.035658,0.0,0.0,0,...,7,6,0,0,3,0,0,0,0,0
8,18.641756,0.224887,-0.224887,0.028047,1668.364,0.357798,1.047023,0.0,0.0,0,...,7,6,0,0,3,1,0,0,0,0
9,5.9643,0.274964,0.274964,0.028047,1650.374,0.333333,1.035658,0.0,0.0,0,...,7,6,0,0,3,0,0,0,0,0


# MorganFP-1024

In [14]:
Morgan_fingerprints = pd.read_csv(folder+acceptor+'-ECFP6-1024keys.csv')
Morgan_fingerprints = Morgan_fingerprints.iloc[:,1:]
Morgan_fingerprints

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_1014,f_1015,f_1016,f_1017,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
8,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
9,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1


# MACCS-167

In [15]:
MACCS_fingerprints = pd.read_csv(folder+acceptor+'-MACCS-167keys.csv')
MACCS_fingerprints=MACCS_fingerprints.iloc[:,1:]
MACCS_fingerprints

Unnamed: 0,m_0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,...,m_157,m_158,m_159,m_160,m_161,m_162,m_163,m_164,m_165,m_166
0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,1,1,1,1,0
1,0,0,0,1,0,0,0,0,0,0,...,1,0,1,1,1,1,1,1,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,1,0
5,0,0,0,1,0,0,0,0,0,0,...,1,0,1,1,1,1,1,1,1,0
6,0,0,0,1,0,0,0,0,0,0,...,1,0,1,1,1,1,1,1,1,0
7,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,1,0
8,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,1,0
9,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,1,0


# ML model

In [16]:
def acc(y_test,y_pred):
    MSE = mean_squared_error(y_test,y_pred)
    RMSE = MSE ** 0.5
    R2 = r2_score(y_test,y_pred)
    r, p_value = pearsonr(y_test,y_pred)
    MAE = mean_absolute_error(y_test,y_pred)
    return RMSE, R2, r, MAE

In [17]:
y = df_exp
y

0     1.82
1     1.70
2     1.68
3     1.69
4     1.71
5     1.79
6     1.79
7     1.67
8     1.45
9     1.64
10    1.88
11    1.86
12    1.94
13    1.87
14    1.84
15    1.85
16    1.76
17    1.74
18    1.64
19    1.58
20    1.74
21    1.71
22    1.72
Name: bandgap(eV), dtype: float64

In [21]:
# X = pd.concat([df_rdkit_screen, Morgan_fingerprints, MACCS_fingerprints, df_dft], axis = 1)
# X = pd.concat([Morgan_fingerprints, MACCS_fingerprints, df_dft], axis = 1)
# X = pd.concat([df_rdkit_screen, MACCS_fingerprints, df_dft], axis = 1)
# X = pd.concat([df_rdkit_screen, Morgan_fingerprints, df_dft], axis = 1)
X = pd.concat([Morgan_fingerprints, df_dft], axis = 1)
# X = pd.concat([MACCS_fingerprints, df_dft], axis = 1)
# X = pd.concat([df_rdkit_screen, df_dft], axis = 1)
X

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_1015,f_1016,f_1017,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,HOMO-LUMO(eV)
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2.26127
1,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1.97772
2,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2.21011
3,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2.18643
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.20956
5,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2.23786
6,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2.04194
7,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,2.1731
8,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,2.1633
9,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,2.23623


In [22]:
models = {
    'hgbr': HistGradientBoostingRegressor(),
    'lgbm': LGBMRegressor(force_col_wise=True, verbose=-1),
    'gbr': GradientBoostingRegressor(),
    'xgb': XGBRegressor(),
    'adaboost': AdaBoostRegressor(),
    'rf': RandomForestRegressor()
}

In [29]:
folder = '../database-1096dp/02-ML-oligomer-model/6Model-DFT-MorganFP'
model_name='xgb'
foldername = model_name+'-10fold-10subfold'

In [30]:
xfold=10
models = []
for fold_idx in range(xfold):
    for fold_idx_sub in range(xfold):
        model_filename = folder+'/'+foldername + f'/model_fold_{fold_idx + 1}_subfold_{fold_idx_sub + 1}.pkl'
        model = joblib.load(model_filename)
        models.append(model)

len(models)

100

In [31]:
print("Start predicting...")
scores = []

for i in range(xfold):

    predictions = []

    for j in range(xfold):
        model = models[i*xfold+j]     

        y_pred = model.predict(X)
        predictions.append(y_pred)

    df_predictions = pd.DataFrame(predictions)
    df_predictions = df_predictions.T
    df_predictions['mean'] = df_predictions.iloc[:,:4].mean(axis = 1)

    RMSE_test, R2_test, r_test, MAE_test = acc(y,df_predictions['mean'])
    scores.append([RMSE_test, R2_test, r_test, MAE_test])


scores_df = pd.DataFrame(scores, columns = ['RMSE', 'R2', 'r', 'MAE'])
scores_list = []

scores_list.append(scores_df.iloc[:xfold,:].mean().values)

Start predicting...


In [32]:
df_scores = pd.DataFrame(scores_list, columns = ['RMSE', 'R2', 'r', 'MAE'])
df_scores.round(3)

Unnamed: 0,RMSE,R2,r,MAE
0,0.119,-0.219,0.623,0.097
