In [1]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns

import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, rdmolops, AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import MACCSkeys


from scipy.stats import pearsonr


# sklearn ML models
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import  LGBMRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import KFold

import joblib

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity='all'




# exp HOMO-LUMO gap

In [2]:
df=pd.read_csv('../01-database-preprocessing-1203dp-to-1115dp/raw/atom_number_wH_sort_1115-backbone-correction-newSMILES.csv')
df

Unnamed: 0,Nickname,bandgap(eV),c_smiles,newSMILES,Ref.No,HOMO(eV),LUMO(eV)
0,P3HT,1.93,CCCCCCc1cc(C)sc1C,Cc1cc(C)c(s1)-c1cc(C)c(s1)-c1cc(C)c(s1)-c1cc(C...,S10,-5.20,-3.27
1,P3HST,1.82,CCCCCCSc1cc(C)sc1C,CSc1cc(sc1C)-c1sc(cc1SC)-c1sc(cc1SC)-c1sc(cc1S...,S123,-4.90,-3.08
2,POPT,1.76,CCCCCCCCc1ccc(-c2cc(C)sc2C)cc1,Cc1cc(c(s1)-c1cc(c(s1)-c1cc(c(s1)-c1cc(c(s1)-c...,S126,-5.34,-3.58
3,PT-C1,1.92,CCCCC(CC)COC(=O)c1cc(C)sc1-c1ccc(C)s1,COC(=O)c1cc(C)sc1-c1ccc(s1)-c1cc(C(=O)OC)c(s1)...,S122,-5.15,-3.23
4,PT-C2,1.89,CCCCC(CC)COC(=O)c1cc(C)sc1-c1ccc(-c2ccc(C)s2)s1,COC(=O)c1cc(C)sc1-c1ccc(s1)-c1ccc(s1)-c1cc(C(=...,S122,-5.11,-3.22
...,...,...,...,...,...,...,...
1110,BTT-NTz,1.68,CCCCCCCCCCCCc1cc(C)sc1-c1cc2c3sc(C(CCCCCCCC)CC...,Cc1cc(C)c(s1)-c1cc2c3sc(C)cc3c3cc(sc3c2s1)-c1s...,S115,-5.41,-3.73
1111,PIDTI-BT,1.65,CCCCCCCCCCCCC(CCCCCCCCCC)Cn1c2cc3c(cc2c2sc(C)c...,Cc1cc2n(C)c3cc4-c5cc6c(cc5C(c4cc3c2s1)(c1ccccc...,S441,-5.30,-3.65
1112,PIDTI-DTBT,1.66,CCCCCCCCCCCCC(CCCCCCCCCC)Cn1c2cc3c(cc2c2sc(C)c...,Cc1ccc(s1)-c1ccc(-c2ccc(s2)-c2cc3n(C)c4cc5-c6c...,S441,-5.26,-3.60
1113,poly(DPP4T-alt-TBP),1.52,CCCCCCCCCCCCC(CCCCCCCCCC)CN1C(=O)C2=C(c3ccc(-c...,CN1C(=O)C2=C(N(C)C(=O)C2=C1c1ccc(C)s1)c1ccc(s1...,S464,-5.42,-3.90


In [3]:
df_exp = df['bandgap(eV)']
df_exp

0       1.93
1       1.82
2       1.76
3       1.92
4       1.89
        ... 
1110    1.68
1111    1.65
1112    1.66
1113    1.52
1114    1.73
Name: bandgap(eV), Length: 1115, dtype: float64

# Read RDKit-209 features

In [4]:
rdkit_feature = pd.read_csv('monomer-1115dp-RDKit209.csv')
rdkit_feature

features = pd.read_csv('rdkit_feature_list.csv')
rdkit_feature_list = features.columns
rdkit_feature_list

df_rdkit_screen = rdkit_feature[rdkit_feature_list]
df_rdkit_screen

Unnamed: 0,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,FpDensityMorgan1,BalabanJ,PEOE_VSA1,PEOE_VSA11,PEOE_VSA12,...,NumAromaticHeterocycles,NumHAcceptors,fr_C_O,fr_aniline,fr_bicyclic,fr_halogen,fr_imide,fr_ketone_Topliss,fr_oxazole,fr_pyridine
0,2.353121,1.287870,1.287870,0.608740,196.359,1.153846,2.318302,0.000000,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
1,2.323896,1.288704,1.288704,0.485488,228.426,1.214286,2.209145,0.000000,0.0,0.0,...,1,2,0,0,0,0,0,0,0,0
2,2.313310,1.230210,1.230210,0.463323,300.511,0.857143,1.833656,0.000000,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
3,12.542211,0.175600,-0.175600,0.510163,350.549,1.173913,2.074957,4.736863,0.0,0.0,...,2,4,1,0,0,0,0,0,0,0
4,12.817589,0.184059,-0.184059,0.318494,432.676,0.964286,1.756705,4.736863,0.0,0.0,...,3,5,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,5.448759,0.613257,0.613257,0.035659,2093.634,0.211268,0.950806,0.000000,0.0,0.0,...,9,13,0,0,11,0,0,0,0,0
1111,6.964899,0.506884,-0.861916,0.035381,2097.349,0.245033,0.981006,28.081651,0.0,0.0,...,5,11,0,0,13,0,0,0,0,0
1112,7.004105,0.497358,-0.874785,0.035381,2261.603,0.223602,0.904925,28.081651,0.0,0.0,...,7,13,0,0,13,0,0,0,0,0
1113,17.056452,0.015571,0.015571,0.035580,2289.808,0.162500,0.900316,19.599639,0.0,0.0,...,6,10,4,0,2,0,0,0,0,0


# ECFP descriptors generation

In [5]:
Morgan_fingerprints = pd.read_csv('monomer-1115dp-MorganFP-1024.csv')
Morgan_fingerprints

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_1014,f_1015,f_1016,f_1017,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,0,1,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1111,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1112,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1113,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# MACCS-167 descriptors generation

In [6]:
MACCS_fingerprints = pd.read_csv('monomer-1115dp-maccs-167.csv')
MACCS_fingerprints


Unnamed: 0,m_0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,...,m_157,m_158,m_159,m_160,m_161,m_162,m_163,m_164,m_165,m_166
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,0,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,1,0
1111,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1112,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1113,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0


# drop sp3-N polymers + donor-692

In [7]:
# non_alkyl_idx = [46,49,68,79,202,210,217,252,255,262,273,274,318,355,358,375,441,
#                 455,810,812,914,932,934,937,947,1007]

sp3_N_list = [  24,   44,  191,  201,  206,  209,  251,  317,  318,  332,  374,
             381,  388,  454,  913,  931,  936, 1006]
drop_list = sp3_N_list+[691]
print('Total data points: ', 1115-len(drop_list))

Total data points:  1096


In [8]:
df_rdkit_screen = df_rdkit_screen[~df_rdkit_screen.index.isin(drop_list)].reset_index(drop=True)

Morgan_fingerprints = Morgan_fingerprints[~Morgan_fingerprints.index.isin(drop_list)].reset_index(drop=True)

MACCS_fingerprints = MACCS_fingerprints[~MACCS_fingerprints.index.isin(drop_list)].reset_index(drop=True)

df_exp = df_exp[~df_exp.index.isin(drop_list)].reset_index(drop=True)

# ML regression

In [9]:
def acc(y_test,y_pred):
    MSE = mean_squared_error(y_test,y_pred)
    RMSE = MSE ** 0.5
    R2 = r2_score(y_test,y_pred)
#     p = pearsonr(y_test,y_pred.reshape(-1,1)) # y_pred shape = (xxx,)
    r, p_value = pearsonr(y_test,y_pred) # y_pred shape = (xxx,)
    MAE = mean_absolute_error(y_test,y_pred)
    return RMSE, R2, r, MAE

In [10]:
y = df_exp
y

0       1.93
1       1.82
2       1.76
3       1.92
4       1.89
        ... 
1091    1.68
1092    1.65
1093    1.66
1094    1.52
1095    1.73
Name: bandgap(eV), Length: 1096, dtype: float64

In [11]:
# X = pd.concat([df_rdkit_screen, Morgan_fingerprints, MACCS_fingerprints], axis = 1)
X = pd.concat([MACCS_fingerprints], axis = 1)
X

Unnamed: 0,m_0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,...,m_157,m_158,m_159,m_160,m_161,m_162,m_163,m_164,m_165,m_166
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,0,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,1,0
1092,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1093,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
1094,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0


In [12]:
models = {
    'hgbr': HistGradientBoostingRegressor(),
    'lgbm': LGBMRegressor(force_col_wise=True, verbose=-1),
    'gbr': GradientBoostingRegressor(),
    'xgb': XGBRegressor(),
    'adaboost': AdaBoostRegressor(),
    'rf': RandomForestRegressor()
}

In [14]:
folder = '6Model-MACCS'
os.makedirs(folder, exist_ok=True)

In [15]:
scores_list = []

for name, model in models.items():
    
    # 10fold-CV plus 10fold-CV average
    foldername = name+'-10fold-10subfold'
    os.makedirs(folder+'/'+foldername, exist_ok=True)
    xfold=10
    kf = KFold(n_splits=xfold, shuffle=True, random_state=42)

    # save index for train and test of each fold
    train_idx_list = []
    test_idx_list = []
    print('Start training: ',name)

    for fold_idx, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        train_idx_list.append(train_index)
        test_idx_list.append(test_index)

        kf_sub = KFold(n_splits=xfold, shuffle=True, random_state=42)
        for fold_idx_sub, (train_index_sub, test_index_sub) in enumerate(kf_sub.split(X_train)):
            X_kf_train, X_kf_test = X_train.iloc[train_index_sub], X_train.iloc[test_index_sub]
            y_kf_train, y_kf_test = y_train.iloc[train_index_sub], y_train.iloc[test_index_sub]

            # 训练模型
            model.fit(X_kf_train, y_kf_train)

            # 保存模型
            model_filename = folder+'/'+foldername + f'/model_fold_{fold_idx + 1}_subfold_{fold_idx_sub + 1}.pkl'
            joblib.dump(model, model_filename)
            
# load 10-fold cv plus 10-fold cv average

    models = []
    for fold_idx in range(xfold):
        for fold_idx_sub in range(xfold):
            model_filename = folder+'/'+foldername + f'/model_fold_{fold_idx + 1}_subfold_{fold_idx_sub + 1}.pkl'
            model = joblib.load(model_filename)
            models.append(model)

    print("Start predicting...")
    scores = []

    for i in range(xfold):
        train_index = train_idx_list[i]
        test_index = test_idx_list[i]

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index] 

        predictions = []

        for j in range(xfold):
            model = models[i*xfold+j]     

            y_pred = model.predict(X_test)
            predictions.append(y_pred)

        df_predictions = pd.DataFrame(predictions)
        df_predictions = df_predictions.T
        df_predictions['mean'] = df_predictions.iloc[:,:4].mean(axis = 1)

        RMSE_test, R2_test, r_test, MAE_test = acc(y_test,df_predictions['mean'])
        scores.append([RMSE_test, R2_test, r_test, MAE_test])

    scores_df = pd.DataFrame(scores, columns = ['RMSE', 'R2', 'r', 'MAE'])
    scores_list.append(scores_df.iloc[:xfold,:].mean().values)
                

Start training:  hgbr
Start predicting...
Start training:  lgbm
Start predicting...
Start training:  gbr
Start predicting...
Start training:  xgb
Start predicting...
Start training:  adaboost
Start predicting...
Start training:  rf
Start predicting...


In [16]:
df_scores = pd.DataFrame(scores_list, columns = ['RMSE', 'R2', 'r', 'MAE'])
model_label = ['HGBR', 'LGBM', 'GBR', 'XGB', 'Adaboost', 'RF']
df_scores.index = model_label
df_scores.round(3)

Unnamed: 0,RMSE,R2,r,MAE
HGBR,0.146,0.484,0.7,0.103
LGBM,0.146,0.484,0.7,0.103
GBR,0.145,0.489,0.705,0.105
XGB,0.143,0.5,0.716,0.095
Adaboost,0.168,0.314,0.569,0.128
RF,0.139,0.527,0.728,0.095


In [17]:
df_tmp = df_scores.round(3)
df_tmp.to_csv(folder+'.csv')

# Model load

In [18]:
xfold=10

models = []
foldername = 'xgb-10fold-10subfold'
for fold_idx in range(xfold):
    for fold_idx_sub in range(xfold):
        model_filename = folder+'/'+foldername + f'/model_fold_{fold_idx + 1}_subfold_{fold_idx_sub + 1}.pkl'
        model = joblib.load(model_filename)
        models.append(model)

len(models)

100

In [19]:
xfold=10
kf = KFold(n_splits=xfold, shuffle=True, random_state=42)

# save index for train and test of each fold
train_idx_list = []
test_idx_list = []
for fold_idx, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_idx_list.append(train_index)
    test_idx_list.append(test_index)

print("Start predicting...")
scores = []

for i in range(xfold):
    train_index = train_idx_list[i]
    test_index = test_idx_list[i]

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index] 

    predictions = []

    for j in range(xfold):
        model = models[i*xfold+j]     

        y_pred = model.predict(X_test)
        predictions.append(y_pred)

    df_predictions = pd.DataFrame(predictions)
    df_predictions = df_predictions.T
    df_predictions['mean'] = df_predictions.iloc[:,:4].mean(axis = 1)

    RMSE_test, R2_test, r_test, MAE_test = acc(y_test,df_predictions['mean'])
    scores.append([RMSE_test, R2_test, r_test, MAE_test])

scores_df = pd.DataFrame(scores, columns = ['RMSE', 'R2', 'r', 'MAE'])
scores_list = []

scores_list.append(scores_df.iloc[:xfold,:].mean().values)

Start predicting...


In [20]:
df_scores = pd.DataFrame(scores_list, columns = ['RMSE', 'R2', 'r', 'MAE'])
# df_scores.index = X_label
df_scores.round(3)

Unnamed: 0,RMSE,R2,r,MAE
0,0.143,0.5,0.716,0.095
