In [1]:
from pathlib import Path

import pandas as pd
from adapt.instance_based import TrAdaBoostR2
from sklearn.ensemble import HistGradientBoostingRegressor as hgb
from sklearn.metrics import mean_absolute_error

In [2]:
# Transfer learning with source dataset S1: fit TrAdaBoostR2 on the source
# reactions together with the labelled target reactions, then rank the
# unlabelled target candidates by predicted yield and save the ranking.
source = 'S1'

# Source-domain data. 'Name' and 'ID' are identifiers, not model features.
data_s = pd.read_csv(f'data/DFT_FE/data_s_{source}.csv').drop(columns=['Name', 'ID'])

# Target-domain data: labelled rows (with 'Yield') and unlabelled candidates.
data_t_lab = pd.read_csv('data/DFT_FE/data_t_lab_iso.csv').drop(columns=['Name', 'ID'])
data_t_unlab = pd.read_csv('data/DFT_FE/data_t_unlab_iso.csv')
# Keep the identifiers so each prediction can be traced back to its compound.
ID_t_unlab = data_t_unlab[['Name', 'ID']]
data_t_unlab = data_t_unlab.drop(columns=['Name', 'ID'])

# Split features from the regression target. Double-bracket selection keeps
# 'Yield' as a one-column DataFrame.
ys = data_s[['Yield']]
Xs = data_s.drop(columns=['Yield'])
yt_lab = data_t_lab[['Yield']]
# NOTE(review): 'Reaction_2+2' is presumably dropped so the target feature
# columns line up with those of source S1 -- confirm against the CSV headers.
Xt_lab = data_t_lab.drop(columns=['Yield', 'Reaction_2+2'])
Xt_unlab = data_t_unlab.drop(columns=['Reaction_2+2'])

# NOTE(review): n_estimators=23 is hard-coded with no visible justification --
# presumably tuned; confirm. random_state is fixed for reproducibility.
model = TrAdaBoostR2(hgb(), n_estimators=23, Xt=Xt_lab, yt=yt_lab, random_state=0, verbose=0)
model.fit(Xs, ys)

y_pred = model.predict(Xt_unlab)
y_pred = pd.DataFrame(y_pred, columns=['Pred_yield'])
# Both frames carry a default 0..n-1 index, so concat pairs rows positionally.
prediction = pd.concat([ID_t_unlab, y_pred], axis=1, join='inner')
prediction = prediction.sort_values('Pred_yield', ascending=False)

# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
out_path = Path(f'result/DFT_FE/prediction_iso_{source}.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
prediction.to_csv(out_path, index=False)
prediction.head()

Unnamed: 0,Name,ID,Pred_yield
28,2PhDPABP,OPS36,89.693562
5,4DPAIPN,OPS7,81.481278
19,1ClDPATRZ,OPS24,76.089545
20,1BrDPATRZ,OPS25,75.077163
8,4ClDPAIPN,OPS11,72.778674


In [3]:
# Experimentally measured yields for the five top-ranked candidates, keyed by
# compound ID so each measurement stays attached to the right compound even
# if the predicted ranking changes on a re-run.
exp = pd.DataFrame({
    'ID': ['OPS36', 'OPS7', 'OPS24', 'OPS25', 'OPS11'],
    'Exp_yield': [77, 92, 16, 31, 96],
})
pred = pd.read_csv(f'result/DFT_FE/prediction_iso_{source}.csv')

# Inner merge on ID keeps only compounds that have a measurement and
# preserves the (descending predicted yield) row order of `pred`.
df = pred.merge(exp, on='ID', how='inner')
missing_ids = sorted(set(exp['ID']) - set(df['ID']))
if missing_ids:
    raise ValueError(f'No prediction found for experimental IDs: {missing_ids}')

# Predictions are rounded to whole percent before scoring against the
# integer experimental yields.
mae = mean_absolute_error(df['Exp_yield'], df['Pred_yield'].round().astype(int))
print(f'MAE when using {source}:', round(mae, 1))
df

MAE when using S1: 30.2


Unnamed: 0,Name,ID,Pred_yield,Exp_yield
0,2PhDPABP,OPS36,89.693562,77
1,4DPAIPN,OPS7,81.481278,92
2,1ClDPATRZ,OPS24,76.089545,16
3,1BrDPATRZ,OPS25,75.077163,31
4,4ClDPAIPN,OPS11,72.778674,96


In [4]:
# Transfer learning with source dataset S6: fit TrAdaBoostR2 on the source
# reactions together with the labelled target reactions, then rank the
# unlabelled target candidates by predicted yield and save the ranking.
source = 'S6'

# Source-domain data. 'Name' and 'ID' are identifiers, not model features.
data_s = pd.read_csv(f'data/DFT_FE/data_s_{source}.csv').drop(columns=['Name', 'ID'])

# Target-domain data: labelled rows (with 'Yield') and unlabelled candidates.
data_t_lab = pd.read_csv('data/DFT_FE/data_t_lab_iso.csv').drop(columns=['Name', 'ID'])
data_t_unlab = pd.read_csv('data/DFT_FE/data_t_unlab_iso.csv')
# Keep the identifiers so each prediction can be traced back to its compound.
ID_t_unlab = data_t_unlab[['Name', 'ID']]
data_t_unlab = data_t_unlab.drop(columns=['Name', 'ID'])

# Split features from the regression target. Double-bracket selection keeps
# 'Yield' as a one-column DataFrame.
ys = data_s[['Yield']]
Xs = data_s.drop(columns=['Yield'])
yt_lab = data_t_lab[['Yield']]
# NOTE(review): 'Reaction_CO_Cl' is presumably dropped so the target feature
# columns line up with those of source S6 -- confirm against the CSV headers.
Xt_lab = data_t_lab.drop(columns=['Yield', 'Reaction_CO_Cl'])
Xt_unlab = data_t_unlab.drop(columns=['Reaction_CO_Cl'])

# NOTE(review): n_estimators=23 is hard-coded with no visible justification --
# presumably tuned; confirm. random_state is fixed for reproducibility.
model = TrAdaBoostR2(hgb(), n_estimators=23, Xt=Xt_lab, yt=yt_lab, random_state=0, verbose=0)
model.fit(Xs, ys)

y_pred = model.predict(Xt_unlab)
y_pred = pd.DataFrame(y_pred, columns=['Pred_yield'])
# Both frames carry a default 0..n-1 index, so concat pairs rows positionally.
prediction = pd.concat([ID_t_unlab, y_pred], axis=1, join='inner')
prediction = prediction.sort_values('Pred_yield', ascending=False)

# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
out_path = Path(f'result/DFT_FE/prediction_iso_{source}.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
prediction.to_csv(out_path, index=False)
prediction.head()

Unnamed: 0,Name,ID,Pred_yield
8,4ClDPAIPN,OPS11,85.315402
25,2tBuDPABP,OPS32,80.304264
9,4BrDPAIPN,OPS12,78.993539
5,4DPAIPN,OPS7,78.55962
28,2PhDPABP,OPS36,76.265642


In [5]:
# Experimentally measured yields for the five top-ranked candidates, keyed by
# compound ID so each measurement stays attached to the right compound even
# if the predicted ranking changes on a re-run.
exp = pd.DataFrame({
    'ID': ['OPS11', 'OPS32', 'OPS12', 'OPS7', 'OPS36'],
    'Exp_yield': [96, 71, 95, 92, 77],
})
pred = pd.read_csv(f'result/DFT_FE/prediction_iso_{source}.csv')

# Inner merge on ID keeps only compounds that have a measurement and
# preserves the (descending predicted yield) row order of `pred`.
df = pred.merge(exp, on='ID', how='inner')
missing_ids = sorted(set(exp['ID']) - set(df['ID']))
if missing_ids:
    raise ValueError(f'No prediction found for experimental IDs: {missing_ids}')

# Predictions are rounded to whole percent before scoring against the
# integer experimental yields.
mae = mean_absolute_error(df['Exp_yield'], df['Pred_yield'].round().astype(int))
print(f'MAE when using {source}:', round(mae, 1))
df

MAE when using S6: 10.0


Unnamed: 0,Name,ID,Pred_yield,Exp_yield
0,4ClDPAIPN,OPS11,85.315402,96
1,2tBuDPABP,OPS32,80.304264,71
2,4BrDPAIPN,OPS12,78.993539,95
3,4DPAIPN,OPS7,78.55962,92
4,2PhDPABP,OPS36,76.265642,77
