In [1]:
import os

import numpy as np
import pandas as pd
from adapt.instance_based import TrAdaBoostR2
from lightgbm import LGBMRegressor as lgbm
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
# Transfer-learning experiment: for 10 seeds, sample `size` rows from each of
# the three source-domain tables, then fit TrAdaBoostR2 (LightGBM base
# regressor) against 100 different 50/50 labelled/unlabelled splits of the
# target table and report R2 on the unlabelled half.
size = 70  # rows drawn from each source-domain table per seed

# The input tables do not depend on the seed, so read them once instead of
# re-reading the same files on every outer iteration.
data_coe = pd.read_csv('data/data_s_S1FE_COe.csv')
data_cs = pd.read_csv('data/data_s_S1FE_CS.csv')
data_cn = pd.read_csv('data/data_s_S1FE_CN.csv')
data_t = pd.read_csv('data/data_tFE.csv')

# to_csv does not create missing parent directories; make sure the output
# folders exist so the cell survives Restart & Run All on a fresh checkout.
os.makedirs('result/data_notused', exist_ok=True)

for seed in range(10):
    print('Seed:', seed)
    os.makedirs(f'result/result_{seed}', exist_ok=True)

    # --- Source domain: sample `size` rows from every source table ---------
    data_coe_used, data_coe_notused = train_test_split(data_coe, train_size=size, random_state=seed)
    data_cs_used, data_cs_notused = train_test_split(data_cs, train_size=size, random_state=seed)
    data_cn_used, data_cn_notused = train_test_split(data_cn, train_size=size, random_state=seed)

    # NOTE(review): the used / not-used ID lists are taken from the COe split
    # only; this presumably relies on the CS and CN tables holding the same IDs
    # in the same row order so the identical seed selects the same IDs - confirm.
    data_notused_id = data_coe_notused['ID']
    data_used_id = data_coe_used['ID']
    data_notused_id.to_csv(f'result/data_notused/data notused_{seed}.csv', index=False)

    data_s = pd.concat([data_coe_used, data_cs_used, data_cn_used])
    Xs = data_s.drop(columns=['Name', 'ID', 'Yield'])
    ys = data_s[['Yield']]

    # --- Target domain: 100 labelled/unlabelled splits ---------------------
    r2_unlab = []
    for i in range(100):
        data_lab, data_unlab = train_test_split(data_t, test_size=0.5, random_state=i)
        Xt_lab = data_lab.drop(columns=['Name', 'ID', 'Yield'])
        Xt_unlab = data_unlab.drop(columns=['Name', 'ID', 'Yield'])
        yt_lab = data_lab[['Yield']]
        yt_unlab = data_unlab[['Yield']]
        unlab_id = data_unlab[['ID']]

        # The labelled target half is handed to the model at construction
        # time; fit() then receives the pooled source sample.
        model = TrAdaBoostR2(lgbm(), n_estimators=7, Xt=Xt_lab, yt=yt_lab, random_state=0, verbose=0)
        model.fit(Xs, ys)
        y_pred = model.predict(Xt_unlab)

        # Indices are reset so ID, true yield and prediction line up by position.
        unlab = pd.concat(
            [
                unlab_id.reset_index(drop=True),
                yt_unlab.reset_index(drop=True),
                pd.DataFrame(y_pred, columns=['Yield_pred']),
            ],
            axis=1,
            join='inner',
        )
        unlab.to_csv(f'result/result_{seed}/pred_{i}.csv', index=False)
        r2_unlab.append(metrics.r2_score(yt_unlab, y_pred))

    print('Avg_R2 (test):', np.mean(r2_unlab))
    print('Max_R2 (test):', np.max(r2_unlab))
    print('Std_R2 (test):', np.std(r2_unlab, ddof=0))

    # --- Re-score the saved predictions on two ID-based subsets ------------
    exclude_list1 = data_notused_id.tolist()  # IDs left out of the source sample
    exclude_list2 = data_used_id.tolist()     # IDs that were in the source sample

    r2_included = []
    r2_excluded = []
    for j in range(100):
        df_unlab = pd.read_csv(f'result/result_{seed}/pred_{j}.csv')
        # "included": drop rows whose ID was NOT sampled into the source set.
        # "excluded": drop rows whose ID WAS sampled into the source set.
        # A row whose ID appears in neither list stays in both subsets.
        filtered_df_included = df_unlab[~df_unlab['ID'].isin(exclude_list1)]
        filtered_df_excluded = df_unlab[~df_unlab['ID'].isin(exclude_list2)]
        r2_included.append(metrics.r2_score(filtered_df_included['Yield'], filtered_df_included['Yield_pred']))
        r2_excluded.append(metrics.r2_score(filtered_df_excluded['Yield'], filtered_df_excluded['Yield_pred']))

    print('Avg_R2 (contained in source domain):', np.mean(r2_included))
    print('Avg_R2 (out of sample):', np.mean(r2_excluded))
    print('============================================================')
# The result directory was removed before upload; it is recreated when this cell runs.

Seed: 0
Avg_R2 (test): 0.5712585854096135
Max_R2 (test): 0.7427209337949369
Std_R2 (test): 0.10414232119431147
Avg_R2 (contained in source domain): 0.7103382389595012
Avg_R2 (out of sample): 0.11392816054001552
Seed: 1
Avg_R2 (test): 0.6089662125511269
Max_R2 (test): 0.8072640071484577
Std_R2 (test): 0.12625016558370908
Avg_R2 (contained in source domain): 0.6974370470754029
Avg_R2 (out of sample): -0.01714313598830036
Seed: 2
Avg_R2 (test): 0.5972852921400934
Max_R2 (test): 0.8164396181871374
Std_R2 (test): 0.1060128656985705
Avg_R2 (contained in source domain): 0.7035414532058156
Avg_R2 (out of sample): 0.21744951312100974
Seed: 3
Avg_R2 (test): 0.5931912954160394
Max_R2 (test): 0.8252963465484957
Std_R2 (test): 0.10430765744842571
Avg_R2 (contained in source domain): 0.7095957080974317
Avg_R2 (out of sample): 0.24113270675530768
Seed: 4
Avg_R2 (test): 0.5917623698535531
Max_R2 (test): 0.8315213939249353
Std_R2 (test): 0.14161567883472628
Avg_R2 (contained in source domain): 0.768224