# Get ROC AUC

This notebook generalizes the executable `get_rocauc-nonparallel` into something that can be run on swarm.

## Edits

- 2024-05-14: Added Llama 3

## Create config files

In [None]:
# Previous model selection, kept for reference:
#   bert_base_uncased, llama_7b, mental_bert, mental_longformer
#   (llama_13b was commented out in that selection)

# Models covered by this run.
models = ['llama3_8b', 'llama3_8b_instruct', 'llama_13b']

# Data variants covered by this run.
datas = ['all', 'pt_noshort', 'turns']

# Item-level features are named '<category><index>'. The counts below leave
# out a8, e4 and e5, which do not converge due to <3 subjects.
features = [
    f'{category}{index}'
    for category, n_items in (
        ('a', 7), ('b', 3), ('c', 4), ('d', 6), ('e', 3), ('f', 3), ('g', 2),
    )
    for index in range(1, n_items + 1)
]
# Category-level features, followed by the catch-all 'any'.
features += list('abcdefg')
features.append('any')

k_fold = 4

# Grid of C values stored in every config file.
C_list = [
    0.0001, 0.0005, 0.001, 0.005,
    0.01, 0.05, 0.1, 0.5,
    1.0, 5.0, 10.0, 50.0,
    100.0, 500.0, 1000.0, 5000.0,
]

trn_types = ['baseline', 'perm_trn']

# Directories stored in every config file as read_dir / write_dir.
read_dir = '/data/MLDSST/xinaw/impactme/results/raw'
write_dir = '/data/MLDSST/xinaw/impactme/results/roc_auc'

In [None]:
from pathlib import Path
import json
import itertools

# Directory that receives one JSON config file per ROC AUC job.
config_dir = Path('/data/MLDSST/xinaw/impactme/config/roc_auc')
# parents=True: do not fail when the intermediate config/ directory is missing.
config_dir.mkdir(parents=True, exist_ok=True)
# Paths of all config files written by this cell.
config_paths = []

# One job per (data, feature, model, training type) combination.
params = list(itertools.product(datas, features, models, trn_types))

for data, feature, model, trn_type in params:
    cfg_dat = dict(
        data=data,
        feature=feature,
        k_fold=k_fold,
        model=model,
        C_list=C_list,
        read_dir=read_dir,
        write_dir=write_dir,
        trn_type=trn_type,
    )
    # The file name has no extension; the swarm cells address configs as
    # '<config_path>/<trn_type>-<model>-<data>-<feature>'.
    cfg_path = config_dir / f'{trn_type}-{model}-{data}-{feature}'
    cfg_path.write_text(json.dumps(cfg_dat, indent=2))
    config_paths.append(cfg_path)

## Check for existing files

In [None]:
import os
import numpy as np
import itertools

# Job names ('<trn_type>-<model>-<data>-<feature>') that already have a raw
# .pkl input, and job names that already have a .csv ROC AUC result.
pkl_exists = []
res_f = []

# Every job name this notebook is configured to produce.
params = list(itertools.product(datas, features, models, trn_types))
need_rocauc = {
    f'{trn_type}-{model}-{data}-{feature}'
    for data, feature, model, trn_type in params
}

for trn_type in trn_types:
    # Strip the extension only when it is the real suffix; other files in the
    # directory are ignored.
    pkl_exists += [
        f'{trn_type}-{name[:-4]}'
        for name in os.listdir(f'results/raw/{trn_type}')
        if name.endswith('.pkl')
    ]

    # The output directory may not exist before the first result is written;
    # treat that as "no results yet" instead of raising.
    out_dir = f'results/roc_auc/{trn_type}'
    out_names = os.listdir(out_dir) if os.path.isdir(out_dir) else []
    res_f += [
        f'{trn_type}-{name[:-4]}'
        for name in out_names
        if name.endswith('.csv')
    ]

# Pending = configured AND raw input present AND no result yet.
# Sorted so that need_rocauc[0] and the swarm file order are reproducible.
need_rocauc = sorted(need_rocauc.intersection(pkl_exists) - set(res_f))
print(f'ROC AUC calculations to be run: {len(need_rocauc)}')

### Test a single instance (optional)

We can run the first config file to do a quick pass on viability. 

In [None]:
# Display the first pending job name; raises IndexError when nothing is pending.
need_rocauc[0]

In [None]:
# Directory holding the per-job config files (same path as config_dir above).
config_path = '/data/MLDSST/xinaw/impactme/config/roc_auc'
# Run to test
# IPython shell escape: {config_path} and {need_rocauc[0]} are expanded from
# the notebook namespace before the command is handed to the shell.
! python /data/MLDSST/xinaw/impactme/roc_auc.py {config_path}/{need_rocauc[0]}

## Write to swarm

In [None]:
config_path = '/data/MLDSST/xinaw/impactme/config/roc_auc'

# One swarm line (one roc_auc.py invocation) per pending config file.
swarm_lines = [
    f'python /data/MLDSST/xinaw/impactme/roc_auc.py {config_path}/{cfg_name}\n'
    for cfg_name in need_rocauc
]

# Overwrites any previous swarm file (path is relative to the working directory).
with open('swarm/roc_auc.swarm', 'w') as swarm_file:
    swarm_file.writelines(swarm_lines)

## Run swarm

Because each calculation is very short, a large batch is fine. 

This should be a `swarm` command. It is fine to start with a low wall-time and capture the timeouts later. For example, the first command may be:

```
SBATCH_PARTITION=quick swarm --verbose 2 -g 2 -t 2 -b 32 --time=1 /data/MLDSST/xinaw/impactme/swarm/roc_auc.swarm
```

### Check for missing files

For this step, things are only really likely to fail due to some strange exit rules in the batched processes. 

In [None]:
import os
import numpy as np
import itertools

# Job names ('<trn_type>-<model>-<data>-<feature>') that have a raw .pkl
# input, and job names that now have a .csv ROC AUC result.
pkl_exists = []
res_f = []

# Every job name this notebook is configured to produce.
params = list(itertools.product(datas, features, models, trn_types))
need_rocauc = {
    f'{trn_type}-{model}-{data}-{feature}'
    for data, feature, model, trn_type in params
}

for trn_type in trn_types:
    # Strip the extension only when it is the real suffix; other files in the
    # directory are ignored.
    pkl_exists += [
        f'{trn_type}-{name[:-4]}'
        for name in os.listdir(f'results/raw/{trn_type}')
        if name.endswith('.pkl')
    ]

    # A training type whose jobs all failed may have no output directory yet;
    # treat that as "no results" instead of raising.
    out_dir = f'results/roc_auc/{trn_type}'
    out_names = os.listdir(out_dir) if os.path.isdir(out_dir) else []
    res_f += [
        f'{trn_type}-{name[:-4]}'
        for name in out_names
        if name.endswith('.csv')
    ]

# Still missing = configured AND raw input present AND no result yet.
# Sorted so that the regenerated swarm file has a reproducible order.
need_rocauc = sorted(need_rocauc.intersection(pkl_exists) - set(res_f))
print(f'ROC AUC calculations to be run: {len(need_rocauc)}')

In [None]:
config_path = '/data/MLDSST/xinaw/impactme/config/roc_auc'

# Build the whole swarm file in memory: one roc_auc.py invocation per job
# name left in need_rocauc.
swarm_text = ''.join(
    f'python /data/MLDSST/xinaw/impactme/roc_auc.py {config_path}/{job_name}\n'
    for job_name in need_rocauc
)

# Overwrites the swarm file (path is relative to the working directory).
with open('swarm/roc_auc.swarm', 'w') as swarm_file:
    swarm_file.write(swarm_text)

An example swarm call is below:

```
SBATCH_PARTITION=quick swarm --verbose 2 -g 1 -t 1 -b 18 --time=1 /data/MLDSST/xinaw/impactme/swarm/roc_auc.swarm
```