In [1]:
import pandas as pd
from adapt.instance_based import TrAdaBoostR2
from sklearn.ensemble import HistGradientBoostingRegressor as hgb
from sklearn.metrics import mean_absolute_error

In [2]:
# Source domain: S1
source = 'S1'

# Source-domain table; the identifier columns are dropped before training.
data_s = pd.read_csv(f'data/DFT/data_s_{source}.csv').drop(columns=['Name', 'ID'])

# Target-domain table that carries a 'Yield' column, identifiers dropped likewise.
data_t_lab = pd.read_csv('data/DFT/data_t_lab_iso.csv').drop(columns=['Name', 'ID'])

# Target-domain table to predict on: set the identifiers aside first so they
# can be re-attached to the predictions further down.
data_t_unlab = pd.read_csv('data/DFT/data_t_unlab_iso.csv')
ID_t_unlab = data_t_unlab.loc[:, ['Name', 'ID']]
data_t_unlab = data_t_unlab.drop(columns=['Name', 'ID'])

# Feature matrices and one-column 'Yield' target frames.
# 'Reaction_2+2' is removed from both target feature tables
# (presumably absent from the S1 source features -- TODO confirm).
Xs = data_s.drop(columns=['Yield'])
ys = data_s[['Yield']]
Xt_lab = data_t_lab.drop(columns=['Yield', 'Reaction_2+2'])
yt_lab = data_t_lab[['Yield']]
Xt_unlab = data_t_unlab.drop(columns=['Reaction_2+2'])

# Fit on the source data, with the target rows that have yields passed as Xt / yt.
model = TrAdaBoostR2(
    hgb(),
    n_estimators=23,
    Xt=Xt_lab,
    yt=yt_lab,
    random_state=0,
    verbose=0,
)
model.fit(Xs, ys)

# Predict the remaining target rows, attach identifiers, rank by predicted yield.
y_pred = pd.DataFrame(model.predict(Xt_unlab), columns=['Pred_yield'])
prediction = (
    pd.concat([ID_t_unlab, y_pred], axis=1, join='inner')
    .sort_values('Pred_yield', ascending=False)
)
prediction.to_csv(f'result/DFT/prediction_iso_{source}.csv', index=False)
prediction.head()

Unnamed: 0,Name,ID,Pred_yield
8,4ClDPAIPN,OPS11,77.350623
25,2tBuDPABP,OPS32,77.269112
9,4BrDPAIPN,OPS12,76.735472
5,4DPAIPN,OPS7,75.340203
7,4FDPAIPN,OPS10,72.968465


In [3]:
# Experimental yields keyed by compound ID instead of by row position, so each
# value stays attached to its own compound even if the predicted ranking in
# the saved CSV changes on a re-run. The ID/yield pairs are the ones shown in
# this cell's displayed table.
exp = pd.DataFrame({
    'ID': ['OPS11', 'OPS32', 'OPS12', 'OPS7', 'OPS10'],
    'Exp_yield': [96, 71, 95, 92, 95],
})
pred = pd.read_csv(f'result/DFT/prediction_iso_{source}.csv')

# Fail loudly rather than silently scoring fewer compounds than were measured.
missing = sorted(set(exp['ID']) - set(pred['ID']))
if missing:
    raise ValueError(f'No prediction found for measured IDs: {missing}')

# Inner merge keeps only the measured compounds, in the row order of `pred`.
df = pred.merge(exp, on='ID', how='inner')
mae = mean_absolute_error(df['Exp_yield'], df['Pred_yield'])
print(f'MAE when using {source}:', round(mae, 1))
df

MAE when using S1: 16.4


Unnamed: 0,Name,ID,Pred_yield,Exp_yield
0,4ClDPAIPN,OPS11,77.350623,96
1,2tBuDPABP,OPS32,77.269112,71
2,4BrDPAIPN,OPS12,76.735472,95
3,4DPAIPN,OPS7,75.340203,92
4,4FDPAIPN,OPS10,72.968465,95


In [4]:
# Source domain: S6
source = 'S6'

# Source-domain table; the identifier columns are dropped before training.
data_s = pd.read_csv(f'data/DFT/data_s_{source}.csv').drop(columns=['Name', 'ID'])

# Target-domain table that carries a 'Yield' column, identifiers dropped likewise.
data_t_lab = pd.read_csv('data/DFT/data_t_lab_iso.csv').drop(columns=['Name', 'ID'])

# Target-domain table to predict on: set the identifiers aside first so they
# can be re-attached to the predictions further down.
data_t_unlab = pd.read_csv('data/DFT/data_t_unlab_iso.csv')
ID_t_unlab = data_t_unlab.loc[:, ['Name', 'ID']]
data_t_unlab = data_t_unlab.drop(columns=['Name', 'ID'])

# Feature matrices and one-column 'Yield' target frames.
# 'Reaction_CO_Cl' is removed from both target feature tables
# (presumably absent from the S6 source features -- TODO confirm).
Xs = data_s.drop(columns=['Yield'])
ys = data_s[['Yield']]
Xt_lab = data_t_lab.drop(columns=['Yield', 'Reaction_CO_Cl'])
yt_lab = data_t_lab[['Yield']]
Xt_unlab = data_t_unlab.drop(columns=['Reaction_CO_Cl'])

# Fit on the source data, with the target rows that have yields passed as Xt / yt.
model = TrAdaBoostR2(
    hgb(),
    n_estimators=23,
    Xt=Xt_lab,
    yt=yt_lab,
    random_state=0,
    verbose=0,
)
model.fit(Xs, ys)

# Predict the remaining target rows, attach identifiers, rank by predicted yield.
y_pred = pd.DataFrame(model.predict(Xt_unlab), columns=['Pred_yield'])
prediction = (
    pd.concat([ID_t_unlab, y_pred], axis=1, join='inner')
    .sort_values('Pred_yield', ascending=False)
)
prediction.to_csv(f'result/DFT/prediction_iso_{source}.csv', index=False)
prediction.head()

Unnamed: 0,Name,ID,Pred_yield
8,4ClDPAIPN,OPS11,89.187209
5,4DPAIPN,OPS7,88.439041
9,4BrDPAIPN,OPS12,87.589469
0,4CzIPN,OPS1,87.094989
28,2PhDPABP,OPS36,81.459774


In [5]:
# Experimental yields keyed by compound ID instead of by row position, so each
# value stays attached to its own compound even if the predicted ranking in
# the saved CSV changes on a re-run. The ID/yield pairs are the ones shown in
# this cell's displayed table.
exp = pd.DataFrame({
    'ID': ['OPS11', 'OPS7', 'OPS12', 'OPS1', 'OPS36'],
    'Exp_yield': [96, 92, 95, 91, 77],
})
pred = pd.read_csv(f'result/DFT/prediction_iso_{source}.csv')

# Fail loudly rather than silently scoring fewer compounds than were measured.
missing = sorted(set(exp['ID']) - set(pred['ID']))
if missing:
    raise ValueError(f'No prediction found for measured IDs: {missing}')

# Inner merge keeps only the measured compounds, in the row order of `pred`.
df = pred.merge(exp, on='ID', how='inner')
mae = mean_absolute_error(df['Exp_yield'], df['Pred_yield'])
print(f'MAE when using {source}:', round(mae, 1))
df

MAE when using S6: 5.2


Unnamed: 0,Name,ID,Pred_yield,Exp_yield
0,4ClDPAIPN,OPS11,89.187209,96
1,4DPAIPN,OPS7,88.439041,92
2,4BrDPAIPN,OPS12,87.589469,95
3,4CzIPN,OPS1,87.094989,91
4,2PhDPABP,OPS36,81.459774,77
