### 1. Pretrain using the Barlow Twins method

In [1]:
import yaml

In [None]:
from modules.pretraining import main

In [None]:
# Load the pretraining hyperparameters from config.yaml.
# The file is opened in a context manager so the handle is closed as soon as
# parsing finishes instead of lingering until garbage collection.
# NOTE(review): yaml.FullLoader is kept as in the original; yaml.safe_load
# would be enough if the config only holds plain scalars, lists and dicts.
with open("config.yaml", "r") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)
print(config)

# Start pretraining with the loaded configuration.
main(config)

### 2. Finetune the model by loading in the pretrained model

In [None]:
import numpy as np
import os
from utils.plots import plot_srcc_MAE
from modules.finetune import main_finetune
from Modules.finetune import run

In [None]:

# Per-task dataset settings: CSV path, the column list stored in
# config['dataset']['target'], and the factor that turns a train fraction into
# the x-axis value of the plots (presumably the number of samples in the
# dataset -- TODO confirm).
_TASK_DATASETS = {
    'visc': (
        'GNN_BT_Data/visc_data.csv',
        ['Viscosity_1', 'Viscosity_2', 'Viscosity_3', 'Viscosity_4', 'Viscosity_5', 'T1', 'T2', 'T3', 'T4', 'T5', 'Ln(A)', 'Ea/R', 'smiles'],
        477,
    ),
    'cond': (
        'GNN_BT_Data/cond_data.csv',
        ['K1', 'K2', 'K3', 'K4', 'K5', 'T1', 'T2', 'T3', 'T4', 'T5', 'Intercept', 'Coefficients', 'SMILES'],
        1222,
    ),
    'visc_hc': (
        'GNN_BT_Data/visc_hc_data.csv',
        ['Viscosity_1', 'Viscosity_2', 'Viscosity_3', 'Viscosity_4', 'Viscosity_5', 'T1', 'T2', 'T3', 'T4', 'T5', 'Intercept (A)', 'Coefficient (B)', 'smiles'],
        182,
    ),
}

# Train fractions run from 0.7 down to 0.1 in steps of 0.1, so at most seven
# splits exist.
_MAX_TARGETS = 7

# Every metric row returned by main_finetune is stored in 5 columns, and one
# plot is produced per column.
_N_OUTPUTS = 5


def run(task, main_folder, tune_from, runs, targets):
    '''Run the fine-tuning experiments for the given task.

    For every run and every train-set size the model is fine-tuned twice with
    main_finetune: once starting from the pre-trained model `tune_from` and
    once from scratch (config['fine_tune_from'] = 'None'). The two values
    returned by each call are stored as SRCC and MAE rows, plotted per run
    with plot_srcc_MAE and, when runs > 1, plotted again as mean/std over all
    runs. All raw numbers are finally written to <main_folder>/results.yaml.

    task: str, the name of the task ('visc', 'cond' or 'visc_hc')
    main_folder: str, the folder to save the results; one sub-folder
        test_<k> is created per run
    tune_from: str, the name of the pre-trained model
    runs: int, the number of runs (>= 1)
    targets: int, the number of train splits to fine-tune on (1..7); the
        splits are taken in the order 0.7, 0.6, ... of the data

    Raises ValueError if runs < 1, if targets is outside 1..7, or if the task
    is unknown. Raises FileExistsError if a run folder already exists
    (os.makedirs is called with exist_ok=False, so earlier results are never
    overwritten).
    '''
    # Fail fast on sizes that would otherwise yield non-positive train
    # fractions or a division by zero when averaging.
    if runs < 1:
        raise ValueError(f'runs must be at least 1, got {runs}')
    if not 1 <= targets <= _MAX_TARGETS:
        raise ValueError(
            f'targets must be between 1 and {_MAX_TARGETS}, got {targets}')

    # Context manager so the config file handle is closed after parsing.
    with open("config_finetune.yaml", "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    config['task_name'] = task

    if task not in _TASK_DATASETS:
        raise ValueError('Undefined downstream task!')
    data_path, target_columns, axis_scale = _TASK_DATASETS[task]
    config['dataset']['task'] = 'regression'
    config['dataset']['data_path'] = data_path

    print(config)

    # Train fractions, largest first: 0.7, 0.6, ... (one per requested split).
    train_fractions = [(i + 1) * 0.1 for i in range(6, 6 - targets, -1)]
    # x-axis of the plots: train fraction scaled by the per-task factor and
    # truncated to an integer (stored as float, as np.zeros would).
    axis = np.array([int(frac * axis_scale) for frac in train_fractions],
                    dtype=float)

    # All metrics, indexed as [run, split, output column].
    shape = (targets, _N_OUTPUTS)
    results = {
        key: np.zeros((runs,) + shape)
        for key in ('scratch', 'BT', 'scratch_mae', 'BT_mae')
    }
    # Per-run plots have no spread to show, so they get all-zero error bars.
    zero_std = np.zeros(shape)

    for j in range(runs):
        folder = os.path.join(main_folder, f'test_{j+1}')
        os.makedirs(folder, exist_ok=False)

        # Views into the result arrays: writing a row here stores it for
        # run j directly.
        scratch = results['scratch'][j]
        BT = results['BT'][j]
        scratch_mae = results['scratch_mae'][j]
        BT_mae = results['BT_mae'][j]

        # Each split gets its own row. The original kept the row index fixed
        # at 0, so with targets > 1 every split overwrote the first row.
        for row, train_size in enumerate(train_fractions):
            print("start")
            config['dataset']['target'] = target_columns
            config['dataset']['train_size'] = train_size
            config['save_folder'] = folder

            # Fine-tune starting from the pre-trained model.
            config['fine_tune_from'] = tune_from
            config['name'] = f'BT_{train_size:.1f}'
            BT[row], BT_mae[row] = main_finetune(config)

            # Same split, trained from scratch as the baseline.
            config['fine_tune_from'] = 'None'
            config['name'] = f'Scratch_{train_size:.1f}'
            scratch[row], scratch_mae[row] = main_finetune(config)

            print("done")

        for i in range(_N_OUTPUTS):
            plot_srcc_MAE(scratch[:, i], BT[:, i], zero_std[:, i], zero_std[:, i], axis, f'{i+1}', folder, tag2='srcc')
            plot_srcc_MAE(scratch_mae[:, i], BT_mae[:, i], zero_std[:, i], zero_std[:, i], axis, f'{i+1}', folder, tag2='mae')

    # Mean/std plots over the runs are only meaningful with several runs.
    if runs > 1:
        s_avg = results['scratch'].mean(axis=0)
        m_avg = results['scratch_mae'].mean(axis=0)
        BT_s_avg = results['BT'].mean(axis=0)
        BT_m_avg = results['BT_mae'].mean(axis=0)

        s_std = results['scratch'].std(axis=0)
        m_std = results['scratch_mae'].std(axis=0)
        BT_s_std = results['BT'].std(axis=0)
        BT_m_std = results['BT_mae'].std(axis=0)

        for i in range(_N_OUTPUTS):
            plot_srcc_MAE(s_avg[:, i], BT_s_avg[:, i], s_std[:, i], BT_s_std[:, i], axis, f'{i+1}', main_folder, tag2='srcc', tag3 = 'avg')
            plot_srcc_MAE(m_avg[:, i], BT_m_avg[:, i], m_std[:, i], BT_m_std[:, i], axis, f'{i+1}', main_folder, tag2='mae', tag3 = 'avg')

    # Save results to YAML file (plain nested lists, not numpy arrays).
    serializable = {key: values.tolist() for key, values in results.items()}
    serializable['axis'] = axis.tolist()
    with open(f"{main_folder}/results.yaml", "w") as file:
        yaml.dump(serializable, file)


In [None]:
# Batch driver for fine-tuning. All other hyperparameters are read from
# config_finetune.yaml; run() overrides the task-specific entries using the
# values listed here.

# One entry per experiment: position k of every list describes experiment k,
# so all five lists must have the same length.
task = ['visc']   # one of 'cond', 'visc', 'visc_hc'
tune_from = ['BT1']  # BT1: 15% subgraph, BT2: 15% subgraph, 20% node/edge
runs = [1]     # how many repeated tests to run with identical hyperparameters
targets = [1]      # how many train-set sizes to use (max is 7), taken from 0.7 downwards: 0.7, 0.6 ... 0.1
main_folder = ['results']  # sub-folder of finetune/ that receives the results

for i, folder_name in enumerate(main_folder):
    run(task[i], f"finetune/{folder_name}", tune_from[i], runs[i], targets[i])