### Avalon, Avalon Count, Layered, Pattern, RDKitFP

For each of them, generate the 4096-bit fingerprint and then reduce it to 256 or 512 bits using recursive feature elimination (RFE).

### Load Dataset

In [1]:
import numpy as np
import pandas as pd
import time
from rdkit import Chem
from rdkit.Chem import AllChem, rdForceFieldHelpers, rdmolops, rdMolDescriptors
from rdkit.Avalon import pyAvalonTools
from sklearn.feature_selection import RFE
import lightgbm as lgb

# Shared feature selector used by every fingerprint calculator below: recursive feature
# elimination around a default LightGBM regressor, dropping 64 columns per round until
# 512 columns remain.
rfe = RFE(
    estimator=lgb.LGBMRegressor(),
    n_features_to_select=512,
    step=64,
)

In [2]:
# Load the polymer table; later cells read its 'SMILES' and 'bandgap_chain' columns.
polymers = pd.read_excel(io='Bandgap_chain4209.xlsx')
print(polymers.shape)
polymers.head(5)

(4209, 2)


Unnamed: 0,SMILES,bandgap_chain
0,[*]C[*],6.8063
1,[*]CC([*])C,6.4609
2,[*]CC([*])CC,6.6228
3,[*]CC([*])CCC,6.738
4,[*]CC([*])CC(C)C,6.7268


### Using SMILES

In [3]:
# Parse every SMILES string into an RDKit Mol object, timing the whole pass.
start = time.time()
monomers = [Chem.MolFromSmiles(smiles) for smiles in polymers['SMILES']]
stop = time.time()

# MolFromSmiles signals a parse failure by returning None rather than raising, so check
# here and report the offending strings instead of failing later inside a fingerprint call.
unparsed = [smiles for smiles, mol in zip(polymers['SMILES'], monomers) if mol is None]
if unparsed:
    raise ValueError(
        f'{len(unparsed)} SMILES could not be parsed by RDKit, e.g. {unparsed[:5]}'
    )

print()
print('runtime:', stop - start, 's')
print()
# Preview only the first few molecules; echoing the whole list floods the cell output.
monomers[:5]


runtime: 0.6253659725189209 s



[<rdkit.Chem.rdchem.Mol at 0x1bc9085f520>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085f580>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085f5e0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085f640>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085f6a0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085f700>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085f760>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085f7c0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085f820>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085f880>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085f8e0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085f940>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085f9a0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085fa00>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085fa60>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085fac0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085fb20>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085fb80>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085fbe0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085fc40>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085fca0>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085fd00>,
 <rdkit.Chem.rdchem.Mol at 0x1bc9085fd60>,
 <rdkit.Che

### Avalon fingerprints

In [4]:
def avalon_calculator(bits, monomers):
    """Compute Avalon bit fingerprints and keep the columns picked by the shared RFE selector.

    Parameters
    ----------
    bits : int
        Fingerprint length requested from ``pyAvalonTools.GetAvalonFP`` (its ``nBits``).
    monomers : iterable of RDKit molecules
        Molecules to fingerprint. They are paired row-for-row with the notebook-level
        ``polymers`` frame, so the order and length are expected to match it.

    Returns
    -------
    pandas.DataFrame
        ``polymers`` with the selected ``Avalon_<k>`` columns appended on the right.

    Notes
    -----
    Uses the notebook globals ``rfe`` (refit on every call) and ``polymers`` (source of the
    ``bandgap_chain`` target). Column labels are 1-based, i.e. ``Avalon_1`` is the first
    fingerprint position. The elapsed wall-clock time is printed.
    """
    t0 = time.time()

    # One fingerprint row per molecule, labelled Avalon_1 ... Avalon_<bits>.
    fp_rows = [list(pyAvalonTools.GetAvalonFP(mol, nBits=bits)) for mol in monomers]
    column_labels = ['Avalon_' + str(k) for k in range(1, bits + 1)]
    ava_fp = pd.DataFrame(fp_rows, columns=column_labels)

    # Fit the selector on the full matrix, then read back which columns survived.
    target = polymers['bandgap_chain'].to_numpy()
    fitted = rfe.fit(ava_fp.to_numpy(), target)
    kept = np.array(ava_fp.columns)[fitted.get_support()]

    elapsed = time.time() - t0
    print('runtime:', elapsed, 's')

    return pd.concat([polymers, ava_fp[kept]], axis=1)

In [5]:
# Start from 4096-position Avalon fingerprints; the shared selector trims the columns.
ava_rfe = avalon_calculator(bits=4096, monomers=monomers)
ava_rfe

runtime: 46.52402639389038 s


Unnamed: 0,SMILES,bandgap_chain,Avalon_5,Avalon_15,Avalon_20,Avalon_24,Avalon_35,Avalon_37,Avalon_42,Avalon_48,...,Avalon_4016,Avalon_4018,Avalon_4029,Avalon_4033,Avalon_4047,Avalon_4049,Avalon_4060,Avalon_4071,Avalon_4075,Avalon_4090
0,[*]C[*],6.8063,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,[*]CC([*])C,6.4609,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,[*]CC([*])CC,6.6228,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,[*]CC([*])CCC,6.7380,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,[*]CC([*])CC(C)C,6.7268,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,[*]CCCCCC[N+](C)(C)CCCCC[N+]([*])(C)C,0.1118,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4205,[*]C#Cc1cc(OCCCCCCCCCCCC)c(C#Cc2ccc([*])c([N+]...,2.0462,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4206,[*]C#Cc1cc(C#Cc2cc(N)c([*])cc2[N+](=O)[O-])c(O...,2.8068,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4207,[*]C#Cc1cc(OCCOCCOC)c(C#Cc2ccc([*])c([N+](=O)[...,2.1853,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Persist the reduced Avalon table; the row index is not written to the file.
ava_rfe.to_csv(path_or_buf="Bandgap_avalon_512rfe.csv", index=False)

print("Files saved.")

Files saved.


### AvalonC

In [7]:
def avalon_count_calculator(bits, monomers):
    """Compute Avalon count fingerprints and keep the columns picked by the shared RFE selector.

    Parameters
    ----------
    bits : int
        Fingerprint length requested from ``pyAvalonTools.GetAvalonCountFP`` (its ``nBits``).
    monomers : iterable of RDKit molecules
        Molecules to fingerprint. They are paired row-for-row with the notebook-level
        ``polymers`` frame, so the order and length are expected to match it.

    Returns
    -------
    pandas.DataFrame
        ``polymers`` with the selected ``AvalonC_<k>`` columns appended on the right.

    Notes
    -----
    Uses the notebook globals ``rfe`` (refit on every call) and ``polymers`` (source of the
    ``bandgap_chain`` target). Column labels are 1-based, i.e. ``AvalonC_1`` is the first
    fingerprint position. The elapsed wall-clock time is printed.
    """
    t0 = time.time()

    # One count-fingerprint row per molecule, labelled AvalonC_1 ... AvalonC_<bits>.
    fp_rows = [list(pyAvalonTools.GetAvalonCountFP(mol, nBits=bits)) for mol in monomers]
    column_labels = ['AvalonC_' + str(k) for k in range(1, bits + 1)]
    avac_fp = pd.DataFrame(fp_rows, columns=column_labels)

    # Fit the selector on the full matrix, then read back which columns survived.
    target = polymers['bandgap_chain'].to_numpy()
    fitted = rfe.fit(avac_fp.to_numpy(), target)
    kept = np.array(avac_fp.columns)[fitted.get_support()]

    elapsed = time.time() - t0
    print('runtime:', elapsed, 's')

    return pd.concat([polymers, avac_fp[kept]], axis=1)

In [8]:
# Start from 4096-position Avalon count fingerprints; the shared selector trims the columns.
avac_rfe = avalon_count_calculator(bits=4096, monomers=monomers)
avac_rfe

runtime: 56.630770683288574 s


Unnamed: 0,SMILES,bandgap_chain,AvalonC_15,AvalonC_16,AvalonC_20,AvalonC_24,AvalonC_28,AvalonC_35,AvalonC_37,AvalonC_40,...,AvalonC_4033,AvalonC_4047,AvalonC_4049,AvalonC_4060,AvalonC_4063,AvalonC_4073,AvalonC_4075,AvalonC_4077,AvalonC_4086,AvalonC_4090
0,[*]C[*],6.8063,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,[*]CC([*])C,6.4609,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,[*]CC([*])CC,6.6228,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,[*]CC([*])CCC,6.7380,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,[*]CC([*])CC(C)C,6.7268,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,[*]CCCCCC[N+](C)(C)CCCCC[N+]([*])(C)C,0.1118,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4205,[*]C#Cc1cc(OCCCCCCCCCCCC)c(C#Cc2ccc([*])c([N+]...,2.0462,0,0,0,0,8,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4206,[*]C#Cc1cc(C#Cc2cc(N)c([*])cc2[N+](=O)[O-])c(O...,2.8068,0,0,0,0,8,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4207,[*]C#Cc1cc(OCCOCCOC)c(C#Cc2ccc([*])c([N+](=O)[...,2.1853,0,0,0,0,8,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Persist the reduced Avalon count table; the row index is not written to the file.
avac_rfe.to_csv(path_or_buf="Bandgap_avalonc_512rfe.csv", index=False)

print("Files saved.")

Files saved.


### Layered FP

In [10]:
def layeredfp_calculator(bits, monomers):
    """Compute RDKit layered fingerprints and keep the columns picked by the shared RFE selector.

    Parameters
    ----------
    bits : int
        Fingerprint length requested from ``rdmolops.LayeredFingerprint`` (its ``fpSize``).
    monomers : iterable of RDKit molecules
        Molecules to fingerprint. They are paired row-for-row with the notebook-level
        ``polymers`` frame, so the order and length are expected to match it.

    Returns
    -------
    pandas.DataFrame
        ``polymers`` with the selected ``Layered_<k>`` columns appended on the right.

    Notes
    -----
    Uses the notebook globals ``rfe`` (refit on every call) and ``polymers`` (source of the
    ``bandgap_chain`` target). Column labels are 1-based, i.e. ``Layered_1`` is the first
    fingerprint position. The elapsed wall-clock time is printed.
    """
    t0 = time.time()

    # One fingerprint row per molecule, labelled Layered_1 ... Layered_<bits>.
    fp_rows = [list(rdmolops.LayeredFingerprint(mol, fpSize=bits)) for mol in monomers]
    column_labels = ['Layered_' + str(k) for k in range(1, bits + 1)]
    lay_ = pd.DataFrame(fp_rows, columns=column_labels)

    # The features are plain 0/1 flags, so they go to the selector without any scaling.
    target = polymers['bandgap_chain'].to_numpy()
    fitted = rfe.fit(lay_.to_numpy(), target)
    kept = np.array(lay_.columns)[fitted.get_support()]

    elapsed = time.time() - t0
    print('runtime:', elapsed, 's')

    return pd.concat([polymers, lay_[kept]], axis=1)

In [11]:
# Start from 4096-position layered fingerprints; the shared selector trims the columns.
lay_rfe = layeredfp_calculator(bits=4096, monomers=monomers)
lay_rfe

runtime: 84.92422270774841 s


Unnamed: 0,SMILES,bandgap_chain,Layered_27,Layered_29,Layered_31,Layered_36,Layered_42,Layered_43,Layered_46,Layered_74,...,Layered_3963,Layered_3967,Layered_3974,Layered_4009,Layered_4036,Layered_4040,Layered_4042,Layered_4067,Layered_4079,Layered_4085
0,[*]C[*],6.8063,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,[*]CC([*])C,6.4609,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,[*]CC([*])CC,6.6228,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,[*]CC([*])CCC,6.7380,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,[*]CC([*])CC(C)C,6.7268,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,[*]CCCCCC[N+](C)(C)CCCCC[N+]([*])(C)C,0.1118,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
4205,[*]C#Cc1cc(OCCCCCCCCCCCC)c(C#Cc2ccc([*])c([N+]...,2.0462,0,1,1,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
4206,[*]C#Cc1cc(C#Cc2cc(N)c([*])cc2[N+](=O)[O-])c(O...,2.8068,0,1,1,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
4207,[*]C#Cc1cc(OCCOCCOC)c(C#Cc2ccc([*])c([N+](=O)[...,2.1853,0,1,1,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1


In [12]:
# Persist the reduced layered-fingerprint table; the row index is not written to the file.
lay_rfe.to_csv(path_or_buf="Bandgap_layered_512rfe.csv", index=False)

print("Files saved.")

Files saved.


### RDKit FP

In [13]:
def rdkitfp_calculator(bits, monomers):
    """Compute RDKit (Daylight-like) fingerprints and keep the columns picked by the shared RFE selector.

    Parameters
    ----------
    bits : int
        Fingerprint length requested from ``rdmolops.RDKFingerprint`` (its ``fpSize``).
    monomers : iterable of RDKit molecules
        Molecules to fingerprint. They are paired row-for-row with the notebook-level
        ``polymers`` frame, so the order and length are expected to match it.

    Returns
    -------
    pandas.DataFrame
        ``polymers`` with the selected ``RDKit_<k>`` columns appended on the right.

    Notes
    -----
    Uses the notebook globals ``rfe`` (refit on every call) and ``polymers`` (source of the
    ``bandgap_chain`` target). Column labels are 1-based, i.e. ``RDKit_1`` is the first
    fingerprint position. The elapsed wall-clock time is printed.
    """
    t0 = time.time()

    # One fingerprint row per molecule, labelled RDKit_1 ... RDKit_<bits>.
    fp_rows = [list(rdmolops.RDKFingerprint(mol, fpSize=bits)) for mol in monomers]
    column_labels = ['RDKit_' + str(k) for k in range(1, bits + 1)]
    RDK_ = pd.DataFrame(fp_rows, columns=column_labels)

    # The features are plain 0/1 flags, so they go to the selector without any scaling.
    target = polymers['bandgap_chain'].to_numpy()
    fitted = rfe.fit(RDK_.to_numpy(), target)
    kept = np.array(RDK_.columns)[fitted.get_support()]

    elapsed = time.time() - t0
    print('runtime:', elapsed, 's')

    return pd.concat([polymers, RDK_[kept]], axis=1)

In [14]:
# Start from 4096-position RDKit fingerprints; the shared selector trims the columns.
rdk_rfe = rdkitfp_calculator(bits=4096, monomers=monomers)
rdk_rfe

runtime: 80.3710355758667 s


Unnamed: 0,SMILES,bandgap_chain,RDKit_2,RDKit_9,RDKit_13,RDKit_17,RDKit_20,RDKit_28,RDKit_34,RDKit_35,...,RDKit_4035,RDKit_4050,RDKit_4051,RDKit_4070,RDKit_4080,RDKit_4083,RDKit_4084,RDKit_4087,RDKit_4091,RDKit_4095
0,[*]C[*],6.8063,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,[*]CC([*])C,6.4609,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,[*]CC([*])CC,6.6228,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,[*]CC([*])CCC,6.7380,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,[*]CC([*])CC(C)C,6.7268,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,[*]CCCCCC[N+](C)(C)CCCCC[N+]([*])(C)C,0.1118,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4205,[*]C#Cc1cc(OCCCCCCCCCCCC)c(C#Cc2ccc([*])c([N+]...,2.0462,0,1,0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0
4206,[*]C#Cc1cc(C#Cc2cc(N)c([*])cc2[N+](=O)[O-])c(O...,2.8068,0,1,0,0,0,0,1,0,...,1,1,1,1,0,0,0,1,0,0
4207,[*]C#Cc1cc(OCCOCCOC)c(C#Cc2ccc([*])c([N+](=O)[...,2.1853,0,1,0,0,0,0,1,0,...,1,0,1,1,0,0,0,0,0,0


In [15]:
# Persist the reduced RDKit-fingerprint table; the row index is not written to the file.
rdk_rfe.to_csv(path_or_buf="Bandgap_rdkitfp_512rfe.csv", index=False)

print("Files saved.")

Files saved.
