In [None]:
import pandas as pd                                              
import numpy as np                                       
import matplotlib.pyplot as plt                       
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import Lipinski
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import MACCSkeys
from rdkit.Avalon import pyAvalonTools
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, median_absolute_error

### QCD

In [None]:
# Load the QCD descriptor table: every column except 'IE (%)' is used as a
# feature and 'IE (%)' is the regression target.
data_qcd = pd.read_excel('hetero_qcd.xlsx')
x = data_qcd.drop(columns='IE (%)')
y = data_qcd['IE (%)']

# fit_transform() fits the scaler itself, so no separate fit() call is needed.
# NOTE(review): the scaler is fitted on all rows, including the ones later held
# out in each fold. RobustScaler is a per-feature affine map, which should not
# change tree-based splits, but move the scaling inside the fold loop if a
# scale-sensitive model is ever swapped in.
scaler = RobustScaler()
x = scaler.fit_transform(x)

# Best-fold trackers: highest R2 / lowest RMSE seen across the folds.
best_r2_train_qcd = -float('inf')
best_r2_test_qcd = -float('inf')
best_rmse_train_qcd = float('inf')
best_rmse_test_qcd = float('inf')

# Per-fold metric histories; the averages below are taken over these lists.
r2_train_values_qcd = []
r2_test_values_qcd = []
rmse_train_values_qcd = []
rmse_test_values_qcd = []

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
for train, test in kfold.split(x, y):
    x_train, x_test = x[train], x[test]
    # KFold yields positional indices, so select target rows by position
    # (.iloc) rather than by index label.
    y_train, y_test = y.iloc[train], y.iloc[test]

    # Fixed seed so that re-running the cell reproduces the same numbers.
    gbr = GradientBoostingRegressor(random_state=42)
    gbr.fit(x_train, y_train)

    y_pred_train_qcd = gbr.predict(x_train)
    y_pred_test_qcd = gbr.predict(x_test)

    r2_train_qcd = r2_score(y_train, y_pred_train_qcd)
    r2_test_qcd = r2_score(y_test, y_pred_test_qcd)

    # RMSE as the square root of the MSE: the `squared=False` keyword of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    rmse_train_qcd = mean_squared_error(y_train, y_pred_train_qcd) ** 0.5
    rmse_test_qcd = mean_squared_error(y_test, y_pred_test_qcd) ** 0.5

    r2_train_values_qcd.append(r2_train_qcd)
    r2_test_values_qcd.append(r2_test_qcd)
    rmse_train_values_qcd.append(rmse_train_qcd)
    rmse_test_values_qcd.append(rmse_test_qcd)

    best_r2_train_qcd = max(best_r2_train_qcd, r2_train_qcd)
    best_r2_test_qcd = max(best_r2_test_qcd, r2_test_qcd)
    best_rmse_train_qcd = min(best_rmse_train_qcd, rmse_train_qcd)
    best_rmse_test_qcd = min(best_rmse_test_qcd, rmse_test_qcd)

# Average over all folds (the per-fold lists), not over the last fold's scalar.
avg_r2_train_qcd = np.mean(r2_train_values_qcd)
avg_rmse_train_qcd = np.mean(rmse_train_values_qcd)
avg_r2_test_qcd = np.mean(r2_test_values_qcd)
avg_rmse_test_qcd = np.mean(rmse_test_values_qcd)

### FINGERPRINTS

In [None]:
# Read the SMILES table; each row supplies a "SMILES" string and its "IE (%)" value.
data_smile3 = pd.read_excel("hetero_smile.xlsx")

# One list per fingerprint type, filled in lockstep with `targets` so that
# position i in every list refers to the same molecule.
fingerprints_maccs, fingerprints_avalon = [], []
fingerprints_morgan, fingerprints_topology = [], []
targets = []

for _, entry in data_smile3.iterrows():
    molecule = Chem.MolFromSmiles(entry["SMILES"])
    if molecule is None:
        # No molecule object for this row: skip it in every list.
        continue

    fingerprints_maccs.append(MACCSkeys.GenMACCSKeys(molecule))
    fingerprints_avalon.append(pyAvalonTools.GetAvalonFP(molecule))
    fingerprints_morgan.append(
        AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048)
    )
    fingerprints_topology.append(
        AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(molecule)
    )
    targets.append(entry["IE (%)"])

In [None]:
# Scale each fingerprint matrix independently. fit_transform() refits the one
# scaler object on every call, so afterwards `scaler` holds the topology fit.
scaler = RobustScaler()
fingerprints_maccs = scaler.fit_transform(fingerprints_maccs)
fingerprints_avalon = scaler.fit_transform(fingerprints_avalon)
fingerprints_morgan = scaler.fit_transform(fingerprints_morgan)
fingerprints_topology = scaler.fit_transform(fingerprints_topology)

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Best-fold trackers: highest R2 / lowest RMSE seen across the folds.
best_r2_train_mac = -float('inf')
best_r2_train_av = -float('inf')
best_r2_train_mor = -float('inf')
best_r2_train_top = -float('inf')
best_r2_test_mac = -float('inf')
best_r2_test_av = -float('inf')
best_r2_test_mor = -float('inf')
best_r2_test_top = -float('inf')
best_rmse_train_mac = float('inf')
best_rmse_train_av = float('inf')
best_rmse_train_mor = float('inf')
best_rmse_train_top = float('inf')
best_rmse_test_mac = float('inf')
best_rmse_test_av = float('inf')
best_rmse_test_mor = float('inf')
best_rmse_test_top = float('inf')

# Per-fold metric histories; the averages at the end are taken over these lists.
r2_train_mac_values = []
r2_train_av_values = []
r2_train_mor_values = []
r2_train_top_values = []
r2_test_mac_values = []
r2_test_av_values = []
r2_test_mor_values = []
r2_test_top_values = []
rmse_train_mac_values = []
rmse_train_av_values = []
rmse_train_mor_values = []
rmse_train_top_values = []
rmse_test_mac_values = []
rmse_test_av_values = []
rmse_test_mor_values = []
rmse_test_top_values = []


def _fit_and_score(x_fit, y_fit, x_eval, y_eval):
    """Fit a fresh gradient-boosting model on one fold and score it.

    Parameters:
        x_fit, y_fit:   features and targets of the training part of the fold.
        x_eval, y_eval: features and targets of the held-out part of the fold.

    Returns:
        A 7-tuple ``(model, pred_fit, pred_eval, r2_fit, r2_eval, rmse_fit,
        rmse_eval)``: the fitted model, its predictions on both parts, and the
        R2 / RMSE of those predictions.
    """
    model = GradientBoostingRegressor(n_estimators=100, random_state=42)
    model.fit(x_fit, y_fit)
    pred_fit = model.predict(x_fit)
    pred_eval = model.predict(x_eval)
    # RMSE as the square root of the MSE: the `squared=False` keyword of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    return (
        model,
        pred_fit,
        pred_eval,
        r2_score(y_fit, pred_fit),
        r2_score(y_eval, pred_eval),
        mean_squared_error(y_fit, pred_fit) ** 0.5,
        mean_squared_error(y_eval, pred_eval) ** 0.5,
    )


# A single fold loop drives all four fingerprint types. Every matrix is split
# against the same `targets`, and KFold's indices depend only on the number of
# samples and the seed, so one set of indices is valid for all of them. Running
# one loop per matrix before training would leave only the last fold's split
# for the earlier matrices.
for train, test in kfold.split(fingerprints_maccs, targets):
    x_train_mac, x_test_mac = fingerprints_maccs[train], fingerprints_maccs[test]
    x_train_av, x_test_av = fingerprints_avalon[train], fingerprints_avalon[test]
    x_train_mor, x_test_mor = fingerprints_morgan[train], fingerprints_morgan[test]
    x_train_top, x_test_top = fingerprints_topology[train], fingerprints_topology[test]

    # Same rows for every fingerprint type, so the target lists are shared.
    y_train_mac = y_train_av = y_train_mor = y_train_top = [targets[i] for i in train]
    y_test_mac = y_test_av = y_test_mor = y_test_top = [targets[i] for i in test]

    (model_mac, y_pred_train_mac, y_pred_test_mac,
     r2_train_mac, r2_test_mac,
     rmse_train_mac, rmse_test_mac) = _fit_and_score(
        x_train_mac, y_train_mac, x_test_mac, y_test_mac)
    (model_av, y_pred_train_av, y_pred_test_av,
     r2_train_av, r2_test_av,
     rmse_train_av, rmse_test_av) = _fit_and_score(
        x_train_av, y_train_av, x_test_av, y_test_av)
    (model_mor, y_pred_train_mor, y_pred_test_mor,
     r2_train_mor, r2_test_mor,
     rmse_train_mor, rmse_test_mor) = _fit_and_score(
        x_train_mor, y_train_mor, x_test_mor, y_test_mor)
    (model_top, y_pred_train_top, y_pred_test_top,
     r2_train_top, r2_test_top,
     rmse_train_top, rmse_test_top) = _fit_and_score(
        x_train_top, y_train_top, x_test_top, y_test_top)

    r2_train_mac_values.append(r2_train_mac)
    r2_train_av_values.append(r2_train_av)
    r2_train_mor_values.append(r2_train_mor)
    r2_train_top_values.append(r2_train_top)
    r2_test_mac_values.append(r2_test_mac)
    r2_test_av_values.append(r2_test_av)
    r2_test_mor_values.append(r2_test_mor)
    r2_test_top_values.append(r2_test_top)

    rmse_train_mac_values.append(rmse_train_mac)
    rmse_train_av_values.append(rmse_train_av)
    rmse_train_mor_values.append(rmse_train_mor)
    rmse_train_top_values.append(rmse_train_top)
    rmse_test_mac_values.append(rmse_test_mac)
    rmse_test_av_values.append(rmse_test_av)
    rmse_test_mor_values.append(rmse_test_mor)
    rmse_test_top_values.append(rmse_test_top)

    best_r2_train_mac = max(best_r2_train_mac, r2_train_mac)
    best_r2_train_av = max(best_r2_train_av, r2_train_av)
    best_r2_train_mor = max(best_r2_train_mor, r2_train_mor)
    best_r2_train_top = max(best_r2_train_top, r2_train_top)
    best_r2_test_mac = max(best_r2_test_mac, r2_test_mac)
    best_r2_test_av = max(best_r2_test_av, r2_test_av)
    best_r2_test_mor = max(best_r2_test_mor, r2_test_mor)
    best_r2_test_top = max(best_r2_test_top, r2_test_top)

    best_rmse_train_mac = min(best_rmse_train_mac, rmse_train_mac)
    best_rmse_train_av = min(best_rmse_train_av, rmse_train_av)
    best_rmse_train_mor = min(best_rmse_train_mor, rmse_train_mor)
    best_rmse_train_top = min(best_rmse_train_top, rmse_train_top)
    best_rmse_test_mac = min(best_rmse_test_mac, rmse_test_mac)
    best_rmse_test_av = min(best_rmse_test_av, rmse_test_av)
    best_rmse_test_mor = min(best_rmse_test_mor, rmse_test_mor)
    best_rmse_test_top = min(best_rmse_test_top, rmse_test_top)

# Averages over all folds (suffix 1 = train, suffix 2 = test), taken from the
# per-fold lists rather than from the last fold's scalar values.
avg_r2_mac1 = np.mean(r2_train_mac_values)
avg_r2_av1 = np.mean(r2_train_av_values)
avg_r2_mor1 = np.mean(r2_train_mor_values)
avg_r2_top1 = np.mean(r2_train_top_values)
avg_rmse_mac1 = np.mean(rmse_train_mac_values)
avg_rmse_av1 = np.mean(rmse_train_av_values)
avg_rmse_mor1 = np.mean(rmse_train_mor_values)
avg_rmse_top1 = np.mean(rmse_train_top_values)
avg_r2_mac2 = np.mean(r2_test_mac_values)
avg_r2_av2 = np.mean(r2_test_av_values)
avg_r2_mor2 = np.mean(r2_test_mor_values)
avg_r2_top2 = np.mean(r2_test_top_values)
avg_rmse_mac2 = np.mean(rmse_test_mac_values)
avg_rmse_av2 = np.mean(rmse_test_av_values)
avg_rmse_mor2 = np.mean(rmse_test_mor_values)
avg_rmse_top2 = np.mean(rmse_test_top_values)

In [None]:
def _print_group(title, r2_values, rmse_values):
    """Print one titled group of results.

    Parameters:
        title:       heading line naming the statistic and the data split.
        r2_values:   R2 values ordered as QCD, SMILE, MAC, AV, MOR, TOP.
        rmse_values: RMSE values in the same order.
    """
    labels = ("QCD", "SMILE", "MAC", "AV", "MOR", "TOP")
    print(title)
    # The padding widths reproduce the fixed-width labels, e.g. "R2 QCD     :".
    for label, value in zip(labels, r2_values):
        print(f"R2 {label:<8}:", value)
    for label, value in zip(labels, rmse_values):
        print(f"RMSE {label:<6}:", value)


# NOTE(review): the *_sml names are not assigned in the cells visible here;
# presumably a SMILES-descriptor cell defines them -- confirm it exists and runs
# before this cell, otherwise this cell raises NameError on a fresh kernel.

# The QCD R2 entries use the best-fold trackers, like every other entry in the
# "best" groups, instead of re-scoring whatever fold happened to run last.
_print_group(
    "--- Best fold: train ---",
    [best_r2_train_qcd, best_r2_train_sml, best_r2_train_mac,
     best_r2_train_av, best_r2_train_mor, best_r2_train_top],
    [best_rmse_train_qcd, best_rmse_train_sml, best_rmse_train_mac,
     best_rmse_train_av, best_rmse_train_mor, best_rmse_train_top],
)
_print_group(
    "--- Best fold: test ---",
    [best_r2_test_qcd, best_r2_test_sml, best_r2_test_mac,
     best_r2_test_av, best_r2_test_mor, best_r2_test_top],
    [best_rmse_test_qcd, best_rmse_test_sml, best_rmse_test_mac,
     best_rmse_test_av, best_rmse_test_mor, best_rmse_test_top],
)
print('=========================================================')
_print_group(
    "--- Average: train ---",
    [avg_r2_train_qcd, avg_r2_train_sml, avg_r2_mac1,
     avg_r2_av1, avg_r2_mor1, avg_r2_top1],
    [avg_rmse_train_qcd, avg_rmse_train_sml, avg_rmse_mac1,
     avg_rmse_av1, avg_rmse_mor1, avg_rmse_top1],
)
_print_group(
    "--- Average: test ---",
    [avg_r2_test_qcd, avg_r2_test_sml, avg_r2_mac2,
     avg_r2_av2, avg_r2_mor2, avg_r2_top2],
    [avg_rmse_test_qcd, avg_rmse_test_sml, avg_rmse_mac2,
     avg_rmse_av2, avg_rmse_mor2, avg_rmse_top2],
)