In [1]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns

import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, rdmolops, AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import MACCSkeys


from scipy.stats import pearsonr


# sklearn ML models
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import  LGBMRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

from sklearn.model_selection import KFold

import joblib

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity='all'




# exp HOMO

In [2]:
df=pd.read_csv('../raw/conjugated-polymer-1096dp.csv')
df

Unnamed: 0,idx,Nickname,HOMO(eV),LUMO(eV),bandgap(eV),Ref.No,smiles_monomer,smiles_oligomer
0,1,P3HT,-5.20,-3.27,1.93,S10,CCCCCCc1cc(C)sc1C,Cc1cc(C)c(s1)-c1cc(C)c(s1)-c1cc(C)c(s1)-c1cc(C...
1,2,P3HST,-4.90,-3.08,1.82,S123,CCCCCCSc1cc(C)sc1C,CSc1cc(sc1C)-c1sc(cc1SC)-c1sc(cc1SC)-c1sc(cc1S...
2,3,POPT,-5.34,-3.58,1.76,S126,CCCCCCCCc1ccc(-c2cc(C)sc2C)cc1,Cc1cc(c(s1)-c1cc(c(s1)-c1cc(c(s1)-c1cc(c(s1)-c...
3,4,PT-C1,-5.15,-3.23,1.92,S122,CCCCC(CC)COC(=O)c1cc(C)sc1-c1ccc(C)s1,COC(=O)c1cc(C)sc1-c1ccc(s1)-c1cc(C(=O)OC)c(s1)...
4,5,PT-C2,-5.11,-3.22,1.89,S122,CCCCC(CC)COC(=O)c1cc(C)sc1-c1ccc(-c2ccc(C)s2)s1,COC(=O)c1cc(C)sc1-c1ccc(s1)-c1ccc(s1)-c1cc(C(=...
...,...,...,...,...,...,...,...,...
1091,1092,BTT-NTz,-5.41,-3.73,1.68,S115,CCCCCCCCCCCCc1cc(C)sc1-c1cc2c3sc(C(CCCCCCCC)CC...,Cc1cc(C)c(s1)-c1cc2c3sc(C)cc3c3cc(sc3c2s1)-c1s...
1092,1093,PIDTI-BT,-5.30,-3.65,1.65,S441,CCCCCCCCCCCCC(CCCCCCCCCC)Cn1c2cc3c(cc2c2sc(C)c...,Cc1cc2n(C)c3cc4-c5cc6c(cc5C(c4cc3c2s1)(c1ccccc...
1093,1094,PIDTI-DTBT,-5.26,-3.60,1.66,S441,CCCCCCCCCCCCC(CCCCCCCCCC)Cn1c2cc3c(cc2c2sc(C)c...,Cc1ccc(s1)-c1ccc(-c2ccc(s2)-c2cc3n(C)c4cc5-c6c...
1094,1095,poly(DPP4T-alt-TBP),-5.42,-3.90,1.52,S464,CCCCCCCCCCCCC(CCCCCCCCCC)CN1C(=O)C2=C(c3ccc(-c...,CN1C(=O)C2=C(N(C)C(=O)C2=C1c1ccc(C)s1)c1ccc(s1...


In [3]:
df_exp = df['HOMO(eV)']
df_exp

0      -5.20
1      -4.90
2      -5.34
3      -5.15
4      -5.11
        ... 
1091   -5.41
1092   -5.30
1093   -5.26
1094   -5.42
1095   -5.53
Name: HOMO(eV), Length: 1096, dtype: float64

# RDKit-209

In [4]:
df_rdkit_screen = pd.read_csv('../raw/monomer-rdkit-54features-1096dp.csv')
df_rdkit_screen = df_rdkit_screen.iloc[:,1:]
df_rdkit_screen

Unnamed: 0,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,FpDensityMorgan1,BalabanJ,PEOE_VSA1,PEOE_VSA11,PEOE_VSA12,...,NumAromaticHeterocycles,NumHAcceptors,fr_C_O,fr_aniline,fr_bicyclic,fr_halogen,fr_imide,fr_ketone_Topliss,fr_oxazole,fr_pyridine
0,2.353121,1.287870,1.287870,0.608740,196.359,1.153846,2.318302,0.000000,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
1,2.323896,1.288704,1.288704,0.485488,228.426,1.214286,2.209145,0.000000,0.0,0.0,...,1,2,0,0,0,0,0,0,0,0
2,2.313310,1.230210,1.230210,0.463323,300.511,0.857143,1.833656,0.000000,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
3,12.542211,0.175600,-0.175600,0.510163,350.549,1.173913,2.074957,4.736863,0.0,0.0,...,2,4,1,0,0,0,0,0,0,0
4,12.817589,0.184059,-0.184059,0.318494,432.676,0.964286,1.756705,4.736863,0.0,0.0,...,3,5,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,5.448759,0.613257,0.613257,0.035659,2093.634,0.211268,0.950806,0.000000,0.0,0.0,...,9,13,0,0,11,0,0,0,0,0
1092,6.964899,0.506884,-0.861916,0.035381,2097.349,0.245033,0.981006,28.081651,0.0,0.0,...,5,11,0,0,13,0,0,0,0,0
1093,7.004105,0.497358,-0.874785,0.035381,2261.603,0.223602,0.904925,28.081651,0.0,0.0,...,7,13,0,0,13,0,0,0,0,0
1094,17.056452,0.015571,0.015571,0.035580,2289.808,0.162500,0.900316,19.599639,0.0,0.0,...,6,10,4,0,2,0,0,0,0,0


# MorganFP-1024

In [5]:
Morgan_fingerprints = pd.read_csv('../raw/monomer-ECFP6-1024keys-1096dp.csv')
Morgan_fingerprints = Morgan_fingerprints.iloc[:,1:]
Morgan_fingerprints

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_1014,f_1015,f_1016,f_1017,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,0,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1092,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1093,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1094,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# MACCS-167

In [6]:
MACCS_fingerprints = pd.read_csv('../raw/monomer-MACCS-167keys-1096dp.csv')
MACCS_fingerprints = MACCS_fingerprints.iloc[:,1:]
MACCS_fingerprints


Unnamed: 0,m_0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,...,m_157,m_158,m_159,m_160,m_161,m_162,m_163,m_164,m_165,m_166
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,0,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,1,0
1092,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1093,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1094,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0


# DFT HOMO

In [13]:
df = pd.read_csv('../raw/oligomer-DFT-TDDFT-calculations-1096dp.csv')
df_dft = df[['HOMO-B3LYP(eV)']]
df_dft = df_dft.rename(columns={'HOMO-B3LYP(eV)':'HOMO(eV)'})
df_dft

Unnamed: 0,HOMO(eV)
0,-4.58593
1,-4.99329
2,-4.97206
3,-4.90784
4,-4.83002
...,...
1091,-4.80172
1092,-4.52852
1093,-4.61206
1094,-4.69043


# ML regression

In [11]:
def acc(y_test,y_pred):
    MSE = mean_squared_error(y_test,y_pred)
    RMSE = MSE ** 0.5
    R2 = r2_score(y_test,y_pred)
    r, p_value = pearsonr(y_test,y_pred)
    MAE = mean_absolute_error(y_test,y_pred)
    return RMSE, R2, r, MAE

In [12]:
y = df_exp
y

0      -5.20
1      -4.90
2      -5.34
3      -5.15
4      -5.11
        ... 
1091   -5.41
1092   -5.30
1093   -5.26
1094   -5.42
1095   -5.53
Name: HOMO(eV), Length: 1096, dtype: float64

In [15]:
# X = pd.concat([df_rdkit_screen, Morgan_fingerprints, MACCS_fingerprints], axis = 1)
X = pd.concat([Morgan_fingerprints, df_dft], axis = 1)
X

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_1015,f_1016,f_1017,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,HOMO(eV)
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-4.58593
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-4.99329
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-4.97206
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-4.90784
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-4.83002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,0,1,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,-4.80172
1092,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,-4.52852
1093,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,-4.61206
1094,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,-4.69043


In [16]:
model = XGBRegressor()


In [17]:
foldername = 'xgb-oligomer-model-HOMO'
os.makedirs(foldername, exist_ok=True)

In [18]:
# 10fold-CV plus 10fold-CV average
xfold=10
kf = KFold(n_splits=xfold, shuffle=True, random_state=42)

# save index for train and test of each fold
train_idx_list = []
test_idx_list = []
print('Start training...')

for fold_idx, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_idx_list.append(train_index)
    test_idx_list.append(test_index)

    kf_sub = KFold(n_splits=xfold, shuffle=True, random_state=42)
    for fold_idx_sub, (train_index_sub, test_index_sub) in enumerate(kf_sub.split(X_train)):
        X_kf_train, X_kf_test = X_train.iloc[train_index_sub], X_train.iloc[test_index_sub]
        y_kf_train, y_kf_test = y_train.iloc[train_index_sub], y_train.iloc[test_index_sub]

        # 训练模型
        model.fit(X_kf_train, y_kf_train)

        # 保存模型
        model_filename = foldername + f'/model_fold_{fold_idx + 1}_subfold_{fold_idx_sub + 1}.pkl'
        joblib.dump(model, model_filename)


Start training...


# model loading

In [19]:
foldername = 'xgb-oligomer-model-HOMO'

In [20]:
models = []
for fold_idx in range(xfold):
    for fold_idx_sub in range(xfold):
        model_filename = foldername + f'/model_fold_{fold_idx + 1}_subfold_{fold_idx_sub + 1}.pkl'
        model = joblib.load(model_filename)
        models.append(model)
        
len(models)

100

In [21]:
print("Start predicting...")
scores = []

for i in range(xfold):
    train_index = train_idx_list[i]
    test_index = test_idx_list[i]

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index] 

    predictions = []

    for j in range(xfold):
        model = models[i*xfold+j]     

        y_pred = model.predict(X_test)
        predictions.append(y_pred)

    df_predictions = pd.DataFrame(predictions)
    df_predictions = df_predictions.T
    df_predictions['mean'] = df_predictions.iloc[:,:4].mean(axis = 1)

    RMSE_test, R2_test, r_test, MAE_test = acc(y_test,df_predictions['mean'])
    scores.append([RMSE_test, R2_test, r_test, MAE_test])


Start predicting...


In [22]:
scores_df = pd.DataFrame(scores, columns = ['RMSE', 'R2', 'r', 'MAE'])
scores_df.loc['mean'] = scores_df.iloc[:xfold,:].mean().values
scores_df.round(3)


Unnamed: 0,RMSE,R2,r,MAE
0,0.163,0.478,0.691,0.104
1,0.174,0.461,0.711,0.117
2,0.167,0.454,0.683,0.115
3,0.157,0.542,0.737,0.101
4,0.155,0.497,0.711,0.106
5,0.165,0.441,0.668,0.123
6,0.158,0.504,0.725,0.106
7,0.146,0.559,0.75,0.104
8,0.149,0.541,0.736,0.112
9,0.144,0.556,0.748,0.102
