# Invoke Task Command Generator

Generate `invoke` task command strings that can be copied into a shell to launch experiment runs.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
folds = ['Fold1', 'Fold2', 'Fold3']

# num_queries[k] is the number of distinct `qid` values found in the
# train/validation rankings CSV of folds[k].
num_queries = []

for fold in folds:
    rankings_csv = f'../build/simulation/{fold}/sim_exp_train_vali_rankings.csv'
    qid_column = pd.read_csv(rankings_csv)['qid']
    num_queries.append(qid_column.nunique())

In [4]:
num_queries

[381]

### Sweep Tasks

In [5]:
# Sweep grid. One `invoke` command is generated for every combination of
# (sweep task, fold, average clicks, number of queries, batch size,
# regularisation scale).
# NOTE(review): 'sweep_x_learner' uses underscores while the other task names
# use hyphens -- confirm this matches the task name invoke actually exposes.
sweep_targets = [['sweep-causal-forests', 'cf_ltr'], ['sweep-cpbm-ltr', 'cpbm_ltr'], ['sweep_x_learner', 'xlearner']]
click_levels = [5, 10, 25, 50]
query_divisors = [100, 10, 2, 1]
batch_sizes = [32]
reg_scales = [0.01, 0.02, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 20.0, 30.0]

tasks = []
for func_name, algo_name in sweep_targets:
    for fold_idx, fold in enumerate(folds):
        tot_num_queries = num_queries[fold_idx]
        # Query budgets are integer fractions of the fold's total query count.
        query_budgets = [tot_num_queries // divisor for divisor in query_divisors]
        for avg_clicks in click_levels:
            for nq in query_budgets:
                for bs in batch_sizes:
                    for reg_scale in reg_scales:
                        command_parts = [
                            f'invoke {func_name}',
                            f'--avg-clicks={avg_clicks}',
                            f'--nq={nq}',
                            f'--total-nqueries={tot_num_queries}',
                            f'--fold={fold}',
                            f'--model-dir=build/simulation/{fold}/sweep',
                            f'--algorithm-name={algo_name}',
                            '--train-weights-feature-name=137',
                            '--eval-weights-feature-name=137',
                            f'--batch-size={bs}',
                            f'--reg-scale={reg_scale}',
                        ]
                        tasks.append(' '.join(command_parts))

In [6]:
def schedule_tasks(n_parallel_groups, tasks, shuffle=True):
    """Build a single shell line that runs ``tasks`` in parallel groups.

    The tasks are split into at most ``n_parallel_groups`` groups of
    near-equal size. Commands inside a group are chained with ``&&`` (run
    sequentially, stopping at the first failure); each group is wrapped in
    ``{ ...;}`` and sent to the background with ``&`` so the groups run
    concurrently.

    Args:
        n_parallel_groups: Number of groups to split the tasks into.
        tasks: Sequence of shell command strings. It is not modified.
        shuffle: If true, randomly permute the tasks (using the global
            ``np.random`` state) before splitting them into groups.

    Returns:
        The shell command line as a string, or an empty string when
        ``tasks`` is empty.
    """
    # Work on a copy: np.random.shuffle permutes its argument in place and
    # would otherwise silently reorder the caller's list.
    tasks = list(tasks)
    if shuffle:
        np.random.shuffle(tasks)
    groups = np.array_split(tasks, n_parallel_groups)
    # array_split yields empty groups when there are more groups than tasks;
    # an empty group would render as `{ ;}`, which is a shell syntax error.
    seq_exe_cmds = [
        f'{{ {" && ".join(g)};}}'
        for g in groups
        if len(g) > 0
    ]
    if not seq_exe_cmds:
        return ''
    return ' & '.join(seq_exe_cmds) + ' &'

In [7]:
schedule_tasks(2, ['2', '1', '3', '5'])

'{ 3 && 2;} & { 5 && 1;} &'

In [8]:
len(tasks)

480

In [9]:
schedule_tasks(10, tasks)

'{ invoke sweep-causal-forests --avg-clicks=50 --nq=381 --total-nqueries=381 --fold=Fold1 --model-dir=build/simulation/Fold1/sweep --algorithm-name=cf_ltr --train-weights-feature-name=137 --eval-weights-feature-name=137 --batch-size=32 --reg-scale=0.5 && invoke sweep-causal-forests --avg-clicks=10 --nq=190 --total-nqueries=381 --fold=Fold1 --model-dir=build/simulation/Fold1/sweep --algorithm-name=cf_ltr --train-weights-feature-name=137 --eval-weights-feature-name=137 --batch-size=32 --reg-scale=0.1 && invoke sweep-cpbm-ltr --avg-clicks=5 --nq=190 --total-nqueries=381 --fold=Fold1 --model-dir=build/simulation/Fold1/sweep --algorithm-name=cpbm_ltr --train-weights-feature-name=137 --eval-weights-feature-name=137 --batch-size=32 --reg-scale=5.0 && invoke sweep-cpbm-ltr --avg-clicks=5 --nq=3 --total-nqueries=381 --fold=Fold1 --model-dir=build/simulation/Fold1/sweep --algorithm-name=cpbm_ltr --train-weights-feature-name=137 --eval-weights-feature-name=137 --batch-size=32 --reg-scale=0.1 && i

### Train and Test tasks

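Read the sweep results, pick the best-performing parameters (highest MRR) for each setting, and generate the corresponding train-and-test commands.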

In [41]:
folds

['Fold1', 'Fold2', 'Fold3']

In [48]:
# Build one train-and-test command per (algorithm, avg_clicks, nqueries)
# setting, using the sweep row with the highest 'metric/mrr' for that setting.
tasks = []
for func_name, algo_name in [['train-and-test-cpbm-ltr', 'cpbm_clipped_ips_ltr']]:
    for i, fold in enumerate(folds):
        tot_num_queries = num_queries[i]
        # Sweep results are read via a '../build' path, while the generated
        # commands use 'build/...' paths.
        # NOTE(review): presumably the commands are run from the repository
        # root while this notebook lives one level below it -- confirm.
        model_dir = f'../build/simulation/{fold}/sweep'
        learning_rate = 0.1
        regularizer = 'l2'
        sweep_results = []
        for avg_clicks in [5, 10, 25, 50]:
            # `j` (not `i`) so the divisor does not shadow the fold index.
            for nq in [tot_num_queries // j for j in [100, 10, 2, 1]]:
                for batch_size in [32]:
                    for reg_scale in [0.01, 0.02, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 20.0, 30.0]:
                        fpath = (Path(model_dir) / algo_name / f'avg_clicks_{avg_clicks}_nq_{nq}' /
                                 f'params_{learning_rate}_{batch_size}_{regularizer}_{reg_scale}' / 'sweep_results.csv')
                        res = pd.read_csv(fpath)
                        # Tag each row with the setting it came from; these
                        # values are only encoded in the directory name.
                        res['avg_clicks'] = avg_clicks
                        res['nqueries'] = nq
                        sweep_results.append(res)
        # ignore_index gives every row a unique label; the per-file frames
        # would otherwise all carry overlapping index labels.
        sweep_results = pd.concat(sweep_results, ignore_index=True)
        setting_cols = ['algorithm', 'avg_clicks', 'nqueries']
        sweep_results['rank'] = sweep_results.groupby(setting_cols)['metric/mrr'].rank(method='min', ascending=False)
        best_results = sweep_results[sweep_results['rank'] == 1]
        # method='min' assigns rank 1 to every row tied for the best MRR.
        # Keep only the first such row per setting (rows are in sweep order)
        # so that ties do not emit several commands for the same setting.
        best_results = best_results.drop_duplicates(subset=setting_cols)
        best_results = best_results[['avg_clicks', 'nqueries', 'epoch', 'batch_size', 'reg_scale']].values
        for param in best_results:
            avg_clicks, nq, epoch, batch_size, reg_scale = param
            # The values come back from a NumPy array, so the integer-valued
            # fields are cast back to int before formatting.
            task = f'invoke {func_name} --avg-clicks={int(avg_clicks)} --nq={int(nq)} --total-nqueries={tot_num_queries} --fold={fold} --model-dir=build/simulation/{fold}/train_test/models --eval-result-dir=build/simulation/{fold}/train_test/test_results --algorithm-name={algo_name} --test-data-path=build/simulation/{fold}/sim_test.txt --train-weights-feature-name=137 --epochs={int(epoch)} --batch-size={int(batch_size)} --reg-scale={reg_scale}'
            tasks.append(task)

In [49]:
len(tasks)

48

In [50]:
schedule_tasks(16, tasks)

'{ invoke train-and-test-cpbm-ltr --avg-clicks=50 --nq=15896 --total-nqueries=15896 --fold=Fold1 --model-dir=build/simulation/Fold1/train_test/models --eval-result-dir=build/simulation/Fold1/train_test/test_results --algorithm-name=cpbm_clipped_ips --test-data-path=build/simulation/Fold1/sim_test.txt --train-weights-feature-name=137 --epochs=11 --batch-size=32 --reg-scale=0.05 && invoke train-and-test-cpbm-ltr --avg-clicks=5 --nq=15992 --total-nqueries=15992 --fold=Fold3 --model-dir=build/simulation/Fold3/train_test/models --eval-result-dir=build/simulation/Fold3/train_test/test_results --algorithm-name=cpbm_clipped_ips --test-data-path=build/simulation/Fold3/sim_test.txt --train-weights-feature-name=137 --epochs=24 --batch-size=32 --reg-scale=0.05 && invoke train-and-test-cpbm-ltr --avg-clicks=50 --nq=1596 --total-nqueries=15966 --fold=Fold2 --model-dir=build/simulation/Fold2/train_test/models --eval-result-dir=build/simulation/Fold2/train_test/test_results --algorithm-name=cpbm_clipp