In [1]:
import pandas as pd
from adapt.instance_based import TrAdaBoostR2
from sklearn.ensemble import HistGradientBoostingRegressor as hgb
from sklearn.metrics import mean_absolute_error

In [2]:
# Read the yield table that serves as training data and show it in full
# (one row per catalyst, one Yield_* column per reaction).
data_A = pd.read_csv(filepath_or_buffer='data/yields_iso.csv')
data_A

Unnamed: 0,Name,ID,Yield_CO_1.5h,Yield_CO_7.5h,Yield_CO_biphenyl,Yield_CO_ortho,Yield_CO_Cl,Yield_CS,Yield_CN,Yield_2+2,Yield_iso
0,4OMeCzIPN,OPS5,33,87,52,31,24,0,33,0,4
1,1CzTRZ,OPS20,16,79,40,29,33,0,7,1,1
2,1tBuDPATRZ,OPS23,16,73,28,20,30,97,82,39,56
3,1PhDPATRZ,OPS27,38,79,45,26,34,97,85,26,41
4,2DPABP,OPS31,1,39,14,9,27,9,99,47,59
5,2ClDPABP,OPS34,1,34,13,13,22,5,99,47,49
6,2DPAAQ,OPS59,0,0,0,0,0,0,0,0,0
7,C_498,OPS83,0,2,2,0,0,5,18,0,5


In [3]:
# Transfer learning run with source domain S1.
source = 'S1'

# --- Source domain: strip identifier columns, then split features / target.
data_s = pd.read_csv(f'data/data_s_{source}.csv').drop(columns=['Name', 'ID'])
Xs = data_s.drop(columns=['Yield'])
ys = data_s['Yield'].to_frame(name='Yield')

# --- Labelled target domain: same treatment; the 'Reaction_2+2' feature is
# excluded for this source (presumably so the target features match the
# source feature set -- TODO confirm against data_s columns).
data_t_lab = pd.read_csv('data/data_t_lab_iso.csv').drop(columns=['Name', 'ID'])
Xt_lab = data_t_lab.drop(columns=['Yield', 'Reaction_2+2'])
yt_lab = data_t_lab['Yield'].to_frame(name='Yield')

# --- Unlabelled target domain: keep Name/ID aside so predictions can be
# labelled afterwards, then build the feature matrix.
data_t_unlab = pd.read_csv('data/data_t_unlab_iso.csv')
ID_t_unlab = data_t_unlab[['Name', 'ID']]
data_t_unlab = data_t_unlab.drop(columns=['Name', 'ID'])
Xt_unlab = data_t_unlab.drop(columns=['Reaction_2+2'])

# --- TrAdaBoostR2 around a HistGradientBoostingRegressor base estimator:
# fitted on the source data, with the labelled target data passed as Xt/yt.
model = TrAdaBoostR2(
    hgb(),
    n_estimators=23,
    Xt=Xt_lab,
    yt=yt_lab,
    random_state=0,
    verbose=0,
)
model.fit(Xs, ys)

# --- Predict the unlabelled compounds, attach their identifiers, rank by
# predicted yield (highest first) and persist the ranking for later cells.
y_pred = pd.DataFrame(model.predict(Xt_unlab), columns=['Pred_yield'])
prediction = (
    pd.concat([ID_t_unlab, y_pred], axis=1, join='inner')
    .sort_values(by='Pred_yield', ascending=False)
)
prediction.to_csv(f'result/prediction_iso_{source}.csv', index=False)
prediction.head()

Unnamed: 0,Name,ID,Pred_yield
10,4BrDPAIPN,OPS12,85.881857
9,4ClDPAIPN,OPS11,85.726207
5,4DPAIPN,OPS7,79.005388
8,4FDPAIPN,OPS10,74.370661
0,4CzIPN,OPS1,69.886234


In [4]:
# Measured yields (presumably experimental, in %) for the compounds that were
# validated, keyed by catalyst ID. Keying by ID instead of relying on the row
# order of the ranked prediction file keeps every measurement attached to the
# right compound even if the predicted ranking changes on a re-run.
exp = pd.DataFrame({
    'ID': ['OPS12', 'OPS11', 'OPS7', 'OPS10', 'OPS1'],
    'Exp_yield': [95, 96, 92, 95, 91],
})

# Ranked predictions written by the prediction cell for the current `source`.
pred = pd.read_csv(f'result/prediction_iso_{source}.csv')

# Inner merge on ID: keeps the prediction order and only the measured compounds.
df = pred.merge(exp, on='ID', how='inner')

# Fail loudly rather than silently scoring a subset of the measurements.
missing_ids = sorted(set(exp['ID']) - set(df['ID']))
if missing_ids:
    raise ValueError(f'No prediction found for measured compounds: {missing_ids}')

# MAE between measured yields and predictions rounded to whole numbers.
mae = mean_absolute_error(df['Exp_yield'], df['Pred_yield'].round().astype(int))
print(f'MAE when using {source}:', round(mae, 1))
df

MAE when using S1: 14.8


Unnamed: 0,Name,ID,Pred_yield,Exp_yield
0,4BrDPAIPN,OPS12,85.881857,95
1,4ClDPAIPN,OPS11,85.726207,96
2,4DPAIPN,OPS7,79.005388,92
3,4FDPAIPN,OPS10,74.370661,95
4,4CzIPN,OPS1,69.886234,91


In [5]:
# Transfer learning run with source domain S6.
source = 'S6'

# --- Source domain: strip identifier columns, then split features / target.
data_s = pd.read_csv(f'data/data_s_{source}.csv').drop(columns=['Name', 'ID'])
Xs = data_s.drop(columns=['Yield'])
ys = data_s['Yield'].to_frame(name='Yield')

# --- Labelled target domain: same treatment; the 'Reaction_CO_Cl' feature is
# excluded for this source (presumably so the target features match the
# source feature set -- TODO confirm against data_s columns).
data_t_lab = pd.read_csv('data/data_t_lab_iso.csv').drop(columns=['Name', 'ID'])
Xt_lab = data_t_lab.drop(columns=['Yield', 'Reaction_CO_Cl'])
yt_lab = data_t_lab['Yield'].to_frame(name='Yield')

# --- Unlabelled target domain: keep Name/ID aside so predictions can be
# labelled afterwards, then build the feature matrix.
data_t_unlab = pd.read_csv('data/data_t_unlab_iso.csv')
ID_t_unlab = data_t_unlab[['Name', 'ID']]
data_t_unlab = data_t_unlab.drop(columns=['Name', 'ID'])
Xt_unlab = data_t_unlab.drop(columns=['Reaction_CO_Cl'])

# --- TrAdaBoostR2 around a HistGradientBoostingRegressor base estimator:
# fitted on the source data, with the labelled target data passed as Xt/yt.
model = TrAdaBoostR2(
    hgb(),
    n_estimators=23,
    Xt=Xt_lab,
    yt=yt_lab,
    random_state=0,
    verbose=0,
)
model.fit(Xs, ys)

# --- Predict the unlabelled compounds, attach their identifiers, rank by
# predicted yield (highest first) and persist the ranking for later cells.
y_pred = pd.DataFrame(model.predict(Xt_unlab), columns=['Pred_yield'])
prediction = (
    pd.concat([ID_t_unlab, y_pred], axis=1, join='inner')
    .sort_values(by='Pred_yield', ascending=False)
)
prediction.to_csv(f'result/prediction_iso_{source}.csv', index=False)
prediction.head()

Unnamed: 0,Name,ID,Pred_yield
9,4ClDPAIPN,OPS11,88.322381
10,4BrDPAIPN,OPS12,87.460791
0,4CzIPN,OPS1,81.781706
8,4FDPAIPN,OPS10,78.947952
59,5CzBN,OPS67,77.237016


In [6]:
# Measured yields (presumably experimental, in %) for the compounds that were
# validated, keyed by catalyst ID. Keying by ID instead of relying on the row
# order of the ranked prediction file keeps every measurement attached to the
# right compound even if the predicted ranking changes on a re-run.
exp = pd.DataFrame({
    'ID': ['OPS11', 'OPS12', 'OPS1', 'OPS10', 'OPS67'],
    'Exp_yield': [96, 95, 91, 95, 84],
})

# Ranked predictions written by the prediction cell for the current `source`.
pred = pd.read_csv(f'result/prediction_iso_{source}.csv')

# Inner merge on ID: keeps the prediction order and only the measured compounds.
df = pred.merge(exp, on='ID', how='inner')

# Fail loudly rather than silently scoring a subset of the measurements.
missing_ids = sorted(set(exp['ID']) - set(df['ID']))
if missing_ids:
    raise ValueError(f'No prediction found for measured compounds: {missing_ids}')

# MAE between measured yields and predictions rounded to whole numbers.
mae = mean_absolute_error(df['Exp_yield'], df['Pred_yield'].round().astype(int))
print(f'MAE when using {source}:', round(mae, 1))
df

MAE when using S6: 9.6


Unnamed: 0,Name,ID,Pred_yield,Exp_yield
0,4ClDPAIPN,OPS11,88.322381,96
1,4BrDPAIPN,OPS12,87.460791,95
2,4CzIPN,OPS1,81.781706,91
3,4FDPAIPN,OPS10,78.947952,95
4,5CzBN,OPS67,77.237016,84
