In [1]:
import os

import numpy as np
import pandas as pd
from adapt.instance_based import TrAdaBoostR2
from scipy.stats import kendalltau
from scipy.stats import pearsonr
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor as rf
from sklearn.model_selection import train_test_split

In [2]:
# Target reactions to evaluate. Each name is used both as a folder under
# data/ and as the row label looked up in the yield correlation matrix.
All_data_names = ["Reaction_Si"]

# Labelled-sample budgets; each value is passed as `train_size` when the
# target data is split into labelled and held-out rows.
train_size_list = [15, 20, 30]

In [3]:
# Evaluate TrAdaBoostR2 (random-forest base estimator) for every target
# reaction and every labelled-sample budget.
#
# For each budget, 100 seeded splits are run. In each split:
#   1. the labelled rows of Yield_list.csv are used to rank the other
#      reactions by their correlation with the target reaction,
#   2. the yields of the three best-correlated reactions are stacked and
#      joined column-wise onto the source descriptors (data_s_woyields),
#   3. a TrAdaBoostR2 model is fitted and R^2, Pearson r and Kendall tau are
#      recorded on the labelled and on the held-out target rows.
# One CSV per (train_size, target) is written under ../../results_reg/TrAB/.

# Labels assigned *by position* to the target's correlation row; the same
# labels select the per-reaction files under data/data_yield/.
# NOTE(review): assumes the yield columns of Yield_list.csv (after dropping
# 'Name' and 'ID') appear in exactly this order - confirm against the data.
columns = ['Reaction_CO_1.5h', 'Reaction_CO_biphenyl', 'Reaction_CO_ortho', 'Reaction_CO_Cl',
           'Reaction_CS', 'Reaction_CN', 'Reaction_2+2',
           'Reaction_CF3', 'Reaction_CH2CF3', 'Reaction_CH2F', 'Reaction_Cy', 'Reaction_SCF3', 'Reaction_Si']

# The per-reaction yield files do not change between splits, so each one is
# read from disk once and reused (the frames are only ever concatenated,
# never modified in place).
source_yield_cache = {}

for target in All_data_names:
    yields = pd.read_csv(f'data/{target}/Yield_list.csv')
    data_s_woyields = pd.read_csv(f'data/{target}/data_s_woyields.csv')
    data_t = pd.read_csv(f'data/{target}/data_t.csv')
    yt = pd.DataFrame(data_t['Yield'], columns=['Yield'])
    Xt = data_t.drop(columns=['Name', 'ID', 'Yield'])

    for train_size in train_size_list:
        r2_train = []
        r2_test = []
        pearson_train = []
        pearson_test = []
        kendalltau_train = []
        kendalltau_test = []
        selected_SD = []
        for i in range(100):
            # NOTE(review): `yields` and `data_t` are split separately with the
            # same seed and train_size; the labelled rows coincide only if both
            # tables have the same number of rows in the same order - confirm.
            yield_lab, yield_unlab = train_test_split(yields, train_size=train_size, random_state=i)
            yield_lab_values = yield_lab.drop(columns=['Name', 'ID'])

            # Correlation of every reaction with the target on the labelled
            # rows only, relabelled positionally with `columns`.
            corr_matrix = yield_lab_values.corr()
            target_corr = pd.Series(np.asarray(corr_matrix.loc[[target], :])[0], index=columns)
            top_3 = target_corr.drop(target).nlargest(3).index.tolist()
            selected_SD.append(top_3)

            Yield = []
            for dataset in top_3:
                if dataset not in source_yield_cache:
                    source_yield_cache[dataset] = pd.read_csv(f'data/data_yield/Yield_{dataset}.csv')
                Yield.append(source_yield_cache[dataset])

            # Stack the selected yield tables row-wise, then attach them to the
            # source descriptors column-wise; join='inner' keeps only row
            # labels present in both frames.
            concat_yield = pd.concat(Yield, axis=0, ignore_index=True)
            data_s = pd.concat([data_s_woyields, concat_yield], axis=1, join='inner')
            ys = pd.DataFrame(data_s['Yield'], columns=['Yield'])
            Xs = data_s.drop(columns=['Name', 'ID', 'Yield'])

            Xt_lab, Xt_unlab, yt_lab, yt_unlab = train_test_split(Xt, yt, train_size=train_size, random_state=i)
            model = TrAdaBoostR2(rf(), n_estimators=5, Xt=Xt_lab, yt=yt_lab, random_state=0, verbose=0)
            model.fit(Xs, ys)
            y_pred1 = model.predict(Xt_lab)
            y_pred2 = model.predict(Xt_unlab)

            # ravel() flattens both the single-column frames and the
            # predictions before they are handed to the scipy statistics.
            yt_lab_flat = yt_lab.values.ravel()
            yt_unlab_flat = yt_unlab.values.ravel()
            r2_train.append(metrics.r2_score(yt_lab, y_pred1))
            r2_test.append(metrics.r2_score(yt_unlab, y_pred2))
            pearson_train.append(pearsonr(yt_lab_flat, y_pred1.ravel())[0])
            pearson_test.append(pearsonr(yt_unlab_flat, y_pred2.ravel())[0])
            kendalltau_train.append(kendalltau(yt_lab_flat, y_pred1.ravel())[0])
            kendalltau_test.append(kendalltau(yt_unlab_flat, y_pred2.ravel())[0])

        # One row per split: the selected source reactions followed by the
        # train/test scores, in the same column order as before.
        selectedSD = pd.DataFrame(selected_SD, columns=['SD1', 'SD2', 'SD3'])
        scores = pd.DataFrame({
            'r2_train': r2_train,
            'r2_test': r2_test,
            'pr_train': pearson_train,
            'pr_test': pearson_test,
            'kt_train': kendalltau_train,
            'kt_test': kendalltau_test,
        })
        result = pd.concat([selectedSD, scores], axis=1, join='inner')

        # Create the output folder first so a missing directory cannot throw
        # away the results of all the fits above.
        out_dir = f'../../results_reg/TrAB/train_size_{train_size}'
        os.makedirs(out_dir, exist_ok=True)
        result.to_csv(f'{out_dir}/result_{target}.csv')