In [1]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns

import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, rdmolops, AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import MACCSkeys


from scipy.stats import pearsonr


# sklearn ML models
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import  LGBMRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

from sklearn.model_selection import KFold

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity='all'




# experimental values

In [2]:
# Experimental dataset: the preview shows one row per polymer with its
# nickname, measured band gap (eV), two SMILES columns and a reference id.
df = pd.read_csv(
    filepath_or_buffer='../01-database-preprocessing-1203dp-to-1115dp/raw/atom_number_wH_sort_1115-backbone-correction-newSMILES.csv',
)
df.head(n=5)

Unnamed: 0,Nickname,bandgap(eV),c_smiles,newSMILES,Ref.No
0,P3HT,1.93,CCCCCCc1cc(C)sc1C,Cc1cc(C)c(s1)-c1cc(C)c(s1)-c1cc(C)c(s1)-c1cc(C...,S10
1,P3HST,1.82,CCCCCCSc1cc(C)sc1C,CSc1cc(sc1C)-c1sc(cc1SC)-c1sc(cc1SC)-c1sc(cc1S...,S123
2,POPT,1.76,CCCCCCCCc1ccc(-c2cc(C)sc2C)cc1,Cc1cc(c(s1)-c1cc(c(s1)-c1cc(c(s1)-c1cc(c(s1)-c...,S126
3,PT-C1,1.92,CCCCC(CC)COC(=O)c1cc(C)sc1-c1ccc(C)s1,COC(=O)c1cc(C)sc1-c1ccc(s1)-c1cc(C(=O)OC)c(s1)...,S122
4,PT-C2,1.89,CCCCC(CC)COC(=O)c1cc(C)sc1-c1ccc(-c2ccc(C)s2)s1,COC(=O)c1cc(C)sc1-c1ccc(s1)-c1ccc(s1)-c1cc(C(=...,S122


In [3]:
# Keep only the measured band-gap column (a Series) as the experimental target.
df_exp = df.loc[:, 'bandgap(eV)']
df_exp

0       1.93
1       1.82
2       1.76
3       1.92
4       1.89
        ... 
1110    1.68
1111    1.65
1112    1.66
1113    1.52
1114    1.73
Name: bandgap(eV), Length: 1115, dtype: float64

# TDDFT

In [4]:
# Parse the whitespace-delimited TDDFT table. The file is read without a
# header so that row 0 (the column labels) and column 0 can be stripped by
# position below.
# The separator is a raw string: '\s' inside a plain literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# NOTE: this rebinds `df`, which previously held the experimental table.
df = pd.read_csv('./DFT/tddft-6-31gs.dat', sep=r'\s+', header=None)

# Drop the first row/column and promote the first row to column names.
# Values stay as parsed from the headerless read (strings, because the label
# row shares the columns with the numbers).
tddft = df.iloc[1:, 1:]
tddft.columns = df.iloc[0, 1:].values
tddft

Unnamed: 0,S1,S2,S3,S4,S5
1,2.4954,2.9847,3.2513,3.5650,3.6584
2,2.3866,2.8715,3.0195,3.2197,3.3292
3,2.7435,3.1042,3.1566,3.4591,3.4839
4,2.3922,2.8473,3.0424,3.1830,3.4274
5,2.3712,2.7813,3.0169,3.3282,3.4734
...,...,...,...,...,...
1111,1.6818,2.1482,2.5000,2.5551,2.6106
1112,1.6425,,,,
1113,1.8037,2.2365,2.5784,2.6196,2.6845
1114,1.7335,1.9401,2.1471,2.2354,2.4437


In [6]:
# Persist the relabelled TDDFT table as CSV (row index omitted).
tddft.to_csv('./DFT/tddft-6-31gs.csv', index=False)

# DFT HOMO-LUMO gap (B3LYP)

In [7]:
# Convert the whitespace-delimited 'scf-6-31gs.dat' table to CSV.
# The separator is a raw string: '\s' inside a plain literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# NOTE: this rebinds the notebook-level names `df` and `scf`.
df = pd.read_csv('./DFT/scf-6-31gs.dat', sep=r'\s+', header=None)

# Row 0 holds the column labels and column 0 is discarded.
scf = df.iloc[1:, 1:]
scf.columns = df.iloc[0, 1:].values

scf.to_csv('./DFT/scf-6-31gs.csv', index=False)

## wB97XD

In [5]:
# Convert the whitespace-delimited '<xc>-6-31gs.dat' table to CSV.
xc = 'wb97xd'

# The separator is a raw string: '\s' inside a plain literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# NOTE: this rebinds the notebook-level names `df` and `scf`.
df = pd.read_csv(f'./DFT/{xc}-6-31gs.dat', sep=r'\s+', header=None)

# Row 0 holds the column labels and column 0 is discarded.
scf = df.iloc[1:, 1:]
scf.columns = df.iloc[0, 1:].values

scf.to_csv(f'./DFT/{xc}-6-31gs.csv', index=False)

## CAM-B3LYP

In [6]:
# Convert the whitespace-delimited '<xc>-6-31gs.dat' table to CSV.
xc = 'camb3lyp'

# The separator is a raw string: '\s' inside a plain literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# NOTE: this rebinds the notebook-level names `df` and `scf`.
df = pd.read_csv(f'./DFT/{xc}-6-31gs.dat', sep=r'\s+', header=None)

# Row 0 holds the column labels and column 0 is discarded.
scf = df.iloc[1:, 1:]
scf.columns = df.iloc[0, 1:].values

scf.to_csv(f'./DFT/{xc}-6-31gs.csv', index=False)

## PBEPBE

In [7]:
# Convert the whitespace-delimited '<xc>-6-31gs.dat' table to CSV.
xc = 'pbepbe'

# The separator is a raw string: '\s' inside a plain literal is an invalid
# escape sequence and triggers a warning on current Python versions.
# NOTE: this rebinds the notebook-level names `df` and `scf`.
df = pd.read_csv(f'./DFT/{xc}-6-31gs.dat', sep=r'\s+', header=None)

# Row 0 holds the column labels and column 0 is discarded.
scf = df.iloc[1:, 1:]
scf.columns = df.iloc[0, 1:].values

scf.to_csv(f'./DFT/{xc}-6-31gs.csv', index=False)

# DFT multiple-MOs

In [2]:
def read_homo_lumo(xc_func, drop_list, data_dir='./DFT'):
    """Load a 'homo-lumo-all-<xc_func>' table and return consecutive-column deltas.

    The whitespace-delimited file ``<data_dir>/homo-lumo-all-<xc_func>.dat`` is
    read, its first row is used as column labels and its first column is
    discarded. The result is written next to it as
    ``homo-lumo-all-<xc_func>.csv`` (side effect) and read back so that the
    values are parsed as numbers.

    Parameters
    ----------
    xc_func : str
        Label inserted into the file name (e.g. ``'B3LYP'``).
    drop_list : list-like of int
        Zero-based row positions (after the label row is removed) to exclude.
    data_dir : str, optional
        Directory holding the ``.dat`` file and receiving the ``.csv`` file.
        Defaults to ``'./DFT'``, the location that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``Delta1 .. Delta{k-1}`` for a table with ``k`` data columns,
        where ``Delta{i}`` is column ``i`` minus column ``i-1``. Rows are
        re-indexed from 0 after the drop.
    """
    stem = os.path.join(data_dir, f'homo-lumo-all-{xc_func}')

    # Raw string for the regex separator: '\s' inside a plain literal is an
    # invalid escape sequence and triggers a warning on current Python versions.
    df = pd.read_csv(f'{stem}.dat', sep=r'\s+', header=None)
    scf = df.iloc[1:, 1:]
    scf.columns = df.iloc[0, 1:].values
    scf.to_csv(f'{stem}.csv', index=False)

    # The headerless read leaves every column as text (labels and numbers share
    # a column); re-reading the CSV lets pandas infer numeric dtypes.
    scf = pd.read_csv(f'{stem}.csv')
    df_dft = scf[~scf.index.isin(drop_list)].reset_index(drop=True)

    # Build all delta columns in one constructor call instead of growing an
    # empty frame column by column.
    return pd.DataFrame({
        f'Delta{i}': df_dft.iloc[:, i] - df_dft.iloc[:, i - 1]
        for i in range(1, df_dft.shape[1])
    })

# drop list

In [5]:
# Row positions to exclude from the DFT tables.
# NOTE(review): `sp3_N_list` presumably marks entries containing sp3 nitrogen;
# the reason for additionally dropping 691 is not recorded here - confirm.
sp3_N_list = [
    24, 44, 191, 201, 206, 209, 251, 317, 318,
    332, 374, 381, 388, 454, 913, 931, 936, 1006,
]
drop_list = [*sp3_N_list, 691]

# 1115 is the row count of the experimental table shown earlier in the notebook.
print('Total data points: ', 1115 - len(drop_list))

Total data points:  1096


### B3LYP

In [6]:
# Consecutive-column deltas of the 'homo-lumo-all-B3LYP' table,
# with the rows in `drop_list` removed.
xc_func = 'B3LYP'
df_b3lyp = read_homo_lumo(xc_func=xc_func, drop_list=drop_list)
df_b3lyp

Unnamed: 0,Delta1,Delta2,Delta3,Delta4,Delta5,Delta6,Delta7,Delta8,Delta9
0,0.07429,0.61171,0.71484,0.58233,2.78726,0.58640,0.619878,0.551031,0.408170
1,0.01361,0.18857,0.47402,0.51484,2.72032,0.54858,0.580420,0.554027,0.531711
2,0.14286,0.35810,0.41824,0.28518,3.19978,0.34205,0.420140,0.377153,0.217963
3,0.17469,0.46069,0.69389,0.55784,2.69964,0.46885,0.360280,0.878925,0.082995
4,0.09605,0.53634,0.77770,0.54831,2.66372,0.42450,0.754571,0.337965,0.573616
...,...,...,...,...,...,...,...,...,...
1091,0.46668,0.11918,0.36300,0.47702,1.93038,0.91185,0.139870,0.455240,0.340960
1092,0.07157,0.27048,0.24272,0.26504,1.93065,0.22340,0.930090,0.110480,0.358644
1093,0.11619,0.12463,0.33524,0.40055,2.04439,0.96383,0.523550,0.403271,0.220956
1094,0.54314,0.35947,0.61769,0.20735,1.94562,0.24436,0.651160,0.485460,0.358100


### wB97XD

In [7]:
# Consecutive-column deltas of the 'homo-lumo-all-wB97XD' table,
# with the rows in `drop_list` removed.
xc_func = 'wB97XD'
df_wb97xd = read_homo_lumo(xc_func=xc_func, drop_list=drop_list)
df_wb97xd

Unnamed: 0,Delta1,Delta2,Delta3,Delta4,Delta5,Delta6,Delta7,Delta8,Delta9
0,0.08680,0.69389,0.83974,0.68219,6.071678,0.691985,0.736617,0.643270,0.472390
1,0.00571,0.27103,0.63158,0.61688,6.012624,0.653345,0.685183,0.645178,0.600010
2,0.17224,0.39266,0.48192,0.34041,6.612367,0.400552,0.491165,0.436196,0.247350
3,0.21524,0.54233,0.82450,0.66097,5.979424,0.573888,0.454974,0.989954,0.122990
4,0.10857,0.63321,0.92328,0.64573,5.938608,0.517560,0.885459,0.417963,0.642460
...,...,...,...,...,...,...,...,...,...
1091,0.56164,0.12789,0.46913,0.55103,5.042540,1.126822,0.226126,0.437831,0.391572
1092,0.09715,0.35592,0.30749,0.29443,5.033020,0.386125,0.983419,0.044354,0.500418
1093,0.09661,0.12871,0.42966,0.49035,5.213430,1.083830,0.599738,0.513482,0.256600
1094,0.64001,0.43294,0.71811,0.24816,5.035470,0.260410,0.801650,0.584500,0.422049


### CAM-B3LYP

In [8]:
# Consecutive-column deltas of the 'homo-lumo-all-CAM-B3LYP' table,
# with the rows in `drop_list` removed.
xc_func = 'CAM-B3LYP'
df_camb3lyp = read_homo_lumo(xc_func=xc_func, drop_list=drop_list)
df_camb3lyp

Unnamed: 0,Delta1,Delta2,Delta3,Delta4,Delta5,Delta6,Delta7,Delta8,Delta9
0,0.09061,0.68926,0.82097,0.66641,4.983762,0.671576,0.718381,0.637291,0.476470
1,0.01388,0.27919,0.62559,0.60246,4.920090,0.633477,0.670488,0.638107,0.598648
2,0.17252,0.38776,0.47212,0.33388,5.500240,0.391300,0.480825,0.431844,0.249801
3,0.20708,0.54286,0.80573,0.64464,4.887980,0.554024,0.440280,0.988589,0.126533
4,0.10558,0.62859,0.90314,0.62940,4.848800,0.498782,0.869676,0.410076,0.653346
...,...,...,...,...,...,...,...,...,...
1091,0.55185,0.12681,0.45388,0.53906,3.971230,1.103420,0.199732,0.452525,0.387762
1092,0.10204,0.34695,0.30069,0.28980,3.953540,0.354560,0.990496,0.056600,0.477287
1093,0.09660,0.12463,0.42476,0.47947,4.123340,1.075666,0.587765,0.498512,0.258508
1094,0.62913,0.42341,0.71076,0.23647,3.963610,0.255240,0.776886,0.568990,0.416062


### PBEPBE

In [9]:
# Consecutive-column deltas of the 'homo-lumo-all-PBEPBE' table,
# with the rows in `drop_list` removed.
xc_func = 'PBEPBE'
df_pbepbe = read_homo_lumo(xc_func=xc_func, drop_list=drop_list)
df_pbepbe

Unnamed: 0,Delta1,Delta2,Delta3,Delta4,Delta5,Delta6,Delta7,Delta8,Delta9
0,0.05170,0.52246,0.62423,0.49933,1.76302,0.50532,0.53525,0.474019,0.339870
1,0.03537,0.10912,0.27320,0.41008,1.68792,0.46831,0.50232,0.481100,0.472659
2,0.10721,0.33117,0.36871,0.23511,2.08357,0.29606,0.36572,0.324360,0.183950
3,0.15184,0.31129,0.59702,0.47511,1.68303,0.39320,0.29824,0.760280,0.059050
4,0.08136,0.40246,0.66641,0.46803,1.65500,0.35756,0.64409,0.290070,0.480829
...,...,...,...,...,...,...,...,...,...
1091,0.39211,0.12599,0.27974,0.40490,1.02424,0.73226,0.16027,0.406270,0.302040
1092,0.04598,0.17089,0.18041,0.22939,1.03757,0.14314,0.84709,0.127890,0.297150
1093,0.17171,0.11510,0.22885,0.31538,1.11594,0.85226,0.46259,0.352390,0.148840
1094,0.34232,0.32436,0.55049,0.15755,1.04383,0.20953,0.55810,0.416060,0.305040


In [10]:
# Column labels for the combined table. The omega label is a raw string:
# '\o' inside a plain literal is an invalid escape sequence and triggers a
# warning on current Python versions; the resulting text is unchanged.
xc_func_list = ['B3LYP', r'$\omega$B97XD', 'CAM-B3LYP', 'PBEPBE']

# Collect the 'Delta5' column of each functional side by side.
# NOTE(review): Delta5 is presumably the HOMO-LUMO gap (it is by far the
# largest delta in the displayed tables) - confirm against the .dat column order.
feature = pd.concat(
    [
        df_b3lyp['Delta5'],
        df_wb97xd['Delta5'],
        df_camb3lyp['Delta5'],
        df_pbepbe['Delta5'],
    ],
    axis=1,
)
feature.columns = xc_func_list
feature



Unnamed: 0,B3LYP,$\omega$B97XD,CAM-B3LYP,PBEPBE
0,2.78726,6.071678,4.983762,1.76302
1,2.72032,6.012624,4.920090,1.68792
2,3.19978,6.612367,5.500240,2.08357
3,2.69964,5.979424,4.887980,1.68303
4,2.66372,5.938608,4.848800,1.65500
...,...,...,...,...
1091,1.93038,5.042540,3.971230,1.02424
1092,1.93065,5.033020,3.953540,1.03757
1093,2.04439,5.213430,4.123340,1.11594
1094,1.94562,5.035470,3.963610,1.04383
