# Parallel logistic regression

Performed on the NIH Biowulf cluster. 

## Create config files

Below are all the parameters to be generated in config files. 

In [None]:
# Values for the `data` field of the generated configs.
datas = ['all', 'pt_noshort', 'turns']

# Feature labels, in order: the numbered sub-codes of each family, the
# family-level labels themselves, and finally 'any'.
# 'a8' and 'e5' are left out: they do not converge due to <3 subjects.
features = [
    f'{family}{idx}'
    for family, count in [
        ('a', 7), ('b', 3), ('c', 4), ('d', 6), ('e', 4), ('f', 3), ('g', 2),
    ]
    for idx in range(1, count + 1)
]
features += ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'any']

# Model names written to the `model_f` field ('roberta' is disabled here).
models = [
    'bert_base_uncased',
    'llama_7b',
    'llama_13b',
    'mental_bert',
    'mental_longformer',
]
embed = 'last_avg'

# Scalar settings copied as-is into every config.
seed = 116
k_fold = 4
n_parallel = 60
max_iter = 5000  # Use 1000 for testing

# Values written to the `C_list` field: 1 and 5 per decade, 1e-4 .. 5e3.
C_list = [
    1e-4, 5e-4, 1e-3, 5e-3,
    1e-2, 5e-2, 1e-1, 5e-1,
    1e0, 5e0, 1e1, 5e1,
    1e2, 5e2, 1e3, 5e3,
]

In [None]:
from pathlib import Path
import json

# Directory that receives one JSON config per (data, feature, model) fit.
config_dir = Path('/data/MLDSST/xinaw/impactme/config/baseline')
# parents=True: also create a missing intermediate `config/` directory
# instead of failing with FileNotFoundError on a fresh checkout.
config_dir.mkdir(parents=True, exist_ok=True)

# Paths of every config written below, in data -> feature -> model order.
config_paths = []
for data in datas:
    for feature in features:
        for m in models:
            cfg_dat = dict(
                data=data,
                feature=feature,
                seed=seed,
                n_parallel=n_parallel,
                max_iter=max_iter,
                k_fold=k_fold,
                model_f=m,
                embed=embed,
                C_list=C_list,
            )
            cfg_path = config_dir / f'{m}-{data}-{feature}-config'
            cfg_path.write_text(json.dumps(cfg_dat, indent=2))
            # Every generated config is queued to run. Comment this line out
            # to only (re)write the files without queueing them.
            config_paths.append(cfg_path)

### Select config paths (optional)

Although we could run every config generated in the 'Create config files' section, often we only want to regenerate a small set of the logistic regression fits. The cells below rebuild `config_paths` for just the selected subset.

In [None]:
# Subset of the parameter grid to (re)run.
datas = 'all pt_noshort turns'.split()

# Excluded features do not converge due to <3 subjects
features = (
    'a1 a2 a3 a4 a5 a6 a7 '  # 'a8' excluded
    'b1 b2 b3 '
    'c1 c2 c3 c4 '
    'd1 d2 d3 d4 d5 d6 '
    'e1 e2 e3 '  # 'e4' and 'e5' excluded
    'f1 f2 f3 '
    'g1 g2 '
    'a b c d e f g '
    'any'
).split()

# 'llama_7b' is disabled for this selection.
models = ['bert_base_uncased', 'mental_bert', 'mental_longformer', 'roberta']

In [None]:
import itertools
from pathlib import Path

# Configs are written into config/baseline by the config-generation cells,
# so the selected paths must point there (the bare config/ directory is
# never populated by this notebook).
config_dir = Path('/data/MLDSST/xinaw/impactme/config/baseline')

# Selected grid, in model -> data -> feature order.
params = list(itertools.product(models, datas, features))

# One config path per selected combination; nothing is written here.
config_paths = [
    config_dir / f'{model}-{data}-{feature}-config'
    for model, data, feature in params
]

### Test the fit (optional)

In [None]:
# Show the first selected config path; it is the one used by the test run below
config_paths[0]

In [None]:
# Run to test
# Takes around 1 min
# IPython shell escape: {config_paths[0]} is substituted with the first
# selected config path before the command is run.
# NOTE(review): the swarm files in this notebook call baseline.py, not
# lr.py -- confirm lr.py is still the intended script for this check.
! python /data/MLDSST/xinaw/impactme/lr.py {config_paths[0]}

In [None]:
# Confirm that the results look fine
# Loads one pickled result into `res` for inspection in the next cell.
# NOTE(review): the missing-file checks read results from
# results/raw/baseline/ -- confirm this file is still written directly
# under results/raw/.
import pickle

with open('results/raw/bert_base_uncased-all-a1.pkl', 'rb') as f:
    res = pickle.load(f)

In [None]:
# Check the structure
# (displays the object unpickled in the previous cell)
res

### Write to swarm

In [None]:
# Emit one swarm command per selected config, each running baseline.py.
with open('swarm/baseline.swarm', 'w') as f:
    f.writelines(
        f'python /data/MLDSST/xinaw/impactme/baseline.py {cfg}\n'
        for cfg in config_paths
    )

## Run swarm

This should be a `swarm` command. It is fine to start with a low wall-time and capture the timeouts later. For example, the first command may be

```
SBATCH_PARTITION=quick swarm --verbose 4 -g 32 -t 64 -b 6 --time=15 /data/MLDSST/xinaw/impactme/swarm/baseline.swarm
```

As a rule of thumb, for non Llama models (which range from hidden dimension 512 for Longformer to 768 for BERT), the amount of GB needed is 1/2 the number of threads.

### Check for missing files

Results can fail to process for a variety of reasons, including

- Timeout (or slow convergence)
- Exceeding allocated memory or CPUs, causing job termination
- Misspecification of a column (this has to be fixed earlier in the pipeline)

Although most jobs stay within 6 GB of memory, a few will start to creep into requiring 8 GB. 

Allocating more memory, more threads, and more walltime will often be enough to recover the missing results. 

Additionally, failure to fit may occur. This occurs with A8, which often yields combinations of folds with no positive labels. 

In [None]:
# Find missing results
import os
import itertools

# Result files already on disk vs. the full grid of expected result names.
res_f = os.listdir('results/raw/baseline')
res_combos = [
    f'{model}-{data}-{feature}.pkl'
    for model, data, feature in itertools.product(models, datas, features)
]

# Walk the grid in its own order rather than iterating a set difference:
# set order for strings changes between interpreter sessions, which would
# shuffle the swarm file on every regeneration.
res_done = set(res_f)
res_miss = [name[:-len('.pkl')] for name in res_combos if name not in res_done]

# Write the config paths to swarm.
# Configs are generated into config/baseline, so that is where the re-run
# commands must look for them.
with open('swarm/baseline-2.swarm', 'w') as f:
    for stem in res_miss:
        f.write(f'python /data/MLDSST/xinaw/impactme/baseline.py /data/MLDSST/xinaw/impactme/config/baseline/{stem}-config\n')

# Count the number of missing files
print(f'Missing {len(res_miss)} results.')

For features that do not converge, manual removal from the `timeout.swarm` file is needed. E5, for example, only has 2 subjects, and is therefore eliminated. 

For files that continue to time out, it may be helpful to increase the number of parallel jobs and the thread allotment. Increasing from 16 to 32 often provides enough CPU hours to converge.

The number of threads should roughly correspond to the number of parallel jobs. Occasionally, some overlap between batched jobs can occur, leading to the job taking more cores than allocated. Unlike memory overhead, this does not appear to terminate the entire job.

```
SBATCH_PARTITION=quick swarm --verbose 4 -g 32 -t 32 --time=30 -b 4 /data/MLDSST/xinaw/impactme/swarm/timeout.swarm
SBATCH_PARTITION=quick swarm --verbose 4 -g 64 -t 64 --time=60 /data/MLDSST/xinaw/impactme/swarm/timeout.swarm
```

## Llama instructions 

*WARNING*: Do not run Llama alongside non-Llama jobs in swarm. 


### Parameters

In [None]:
# Llama models to fit ('llama_70b' is disabled).
models = ['llama3_8b', 'llama3_8b_instruct']

datas = ['all', 'pt_noshort', 'turns']

# Numbered sub-codes per family, then the family-level labels, then 'any'.
# 'a8', 'e4' and 'e5' are left out of this grid.
features = [
    f'{family}{idx}'
    for family, count in [
        ('a', 7), ('b', 3), ('c', 4), ('d', 6), ('e', 3), ('f', 3), ('g', 2),
    ]
    for idx in range(1, count + 1)
]
features += ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'any']

# Directory holding the Llama configs, as a Path.
llama_path = Path('/data/MLDSST/xinaw/impactme/config/baseline')

### Increase parallel jobs

The max number of parallelizations is 256, but at this level, the amount of memory needed (256 GB) exceeds that available on a node on the quick partition. 

As a rule of thumb, the amount of memory, in GB, needed during swarm is 1:1 with the number of threads and, if possible, it might be safer to use 1.5:1.

Additionally, it might be prudent to run 1 or 2 fewer parallel jobs than the number of threads allocated. 

In [None]:
from pathlib import Path
import json
import itertools

# Directory that receives the Llama configs.
llama_path = Path('/data/MLDSST/xinaw/impactme/config/baseline')
# parents=True: also create a missing intermediate `config/` directory
# instead of failing with FileNotFoundError.
llama_path.mkdir(parents=True, exist_ok=True)

# Paths of every config written below.
llama_paths = []

seed = 116
n_parallel = 126
max_iter = 5000 # Use 1000 for testing
k_fold = 4
embed = 'last_avg'

C_list = [
    0.0001, 0.0005,
    0.001, 0.005,
    0.01, 0.05,
    0.1, 0.5,
    1., 5.,
    10., 50.,
    100., 500.,
    1000., 5000.
]

# One JSON config per (model, data, feature) combination of the Llama grid
# defined in the parameter cell.
params = list(itertools.product(models, datas, features))
for model, data, feature in params:
    cfg_dat = dict(
        data=data,
        feature=feature,
        seed=seed,
        n_parallel=n_parallel,
        max_iter=max_iter,
        k_fold=k_fold,
        model_f=model,
        embed=embed,
        C_list=C_list
    )
    cfg_path = llama_path / f'{model}-{data}-{feature}-config'
    cfg_path.write_text(json.dumps(cfg_dat, indent=2))
    llama_paths.append(cfg_path)

### Check for missing files

Use the same parameters. 

In [None]:
import os
import itertools
from pathlib import Path

# Result files already on disk vs. the full Llama grid of expected names.
res_f = os.listdir('results/raw/baseline')
res_combos = [
    f'{model}-{data}-{feature}.pkl'
    for model, data, feature in itertools.product(models, datas, features)
]

# Walk the grid in its own order rather than iterating a set difference:
# set order for strings changes between interpreter sessions, which would
# shuffle the swarm file on every regeneration.
res_done = set(res_f)
res_miss = [name[:-len('.pkl')] for name in res_combos if name not in res_done]

# `llama_path` comes from an earlier cell and must still point at
# config/baseline here; the partition-training cell rebinds it to
# config/baseline_parts, so re-run the Llama parameter cell first if needed.
with open('swarm/baseline-llama3.swarm', 'w') as f:
    for stem in res_miss:
        f.write(f'python /data/MLDSST/xinaw/impactme/baseline.py {llama_path}/{stem}-config\n')

print(f'Missing {len(res_miss)} Llama files.')

An example call to run Llama in swarm:

```
SBATCH_PARTITION=quick swarm --verbose 4 -g  128 -t 128 -b 2 --time=120 /data/MLDSST/xinaw/impactme/swarm/baseline-llama3.swarm
```

### (Optional) Partition training

In some cases, it may not be possible for the fit to converge, even with 128 parallel threads. In this case, partitioning the training is suggested. 

Dividing the `C_list` into four parts, each fit with 64 parallel jobs, allows for use of essentially 256 parallel jobs. 

Try this as a second option. 

In [None]:
import itertools
from pathlib import Path
from os.path import exists
import json

# Directory that receives the per-partition configs.
# NOTE: this rebinds `llama_path`, which earlier cells use for config/baseline.
llama_path = Path('/data/MLDSST/xinaw/impactme/config/baseline_parts')
# parents=True: also create a missing intermediate `config/` directory
# instead of failing with FileNotFoundError.
llama_path.mkdir(parents=True, exist_ok=True)

seed = 116
n_parallel = 64
max_iter = 4000 # Use 1000 for testing
k_fold = 4
embed = 'last_avg'

# The full C grid split into four consecutive chunks; each chunk becomes
# its own config (and therefore its own job).
C_lists = [
    [0.0001, 0.0005, 0.001, 0.005],
    [0.01, 0.05, 0.1, 0.5],
    [1., 5., 10., 50.],
    [100., 500., 1000., 5000.]
]

# Partition indices 0..3, used in config/result file names.
C_parts = range(len(C_lists))

datas = [
    'all',
    'pt_noshort',
    'turns'
]

features = [
    'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', # 'a8',
    'b1', 'b2', 'b3',
    'c1', 'c2', 'c3', 'c4',
    'd1', 'd2', 'd3', 'd4', 'd5', 'd6',
    'e1', 'e2', 'e3', # 'e4', # 'e5',
    'f1', 'f2', 'f3',
    'g1', 'g2',
    'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'any'
]

models = [
    'llama3_8b',
    'llama3_8b_instruct',
    # 'llama_70b'
]


In [None]:
res_path = Path('results/raw/baseline')
res_parts_path = Path('results/raw/baseline_parts')
parts_paths = []

# Rebinds `models` for this cell and every later one; note that
# 'llama_13b' is included here in addition to the two Llama 3 models.
models = ['llama3_8b', 'llama3_8b_instruct', 'llama_13b']

params = list(itertools.product(models, datas, features, C_parts))
for model, data, feature, C_part in params:
    stem = f'{model}-{data}-{feature}'

    # Nothing to do when either the full result or this partition's
    # result is already on disk.
    already_done = (
        exists(res_path / f'{stem}.pkl')
        or exists(res_parts_path / f'{stem}-{C_part}.pkl')
    )
    if already_done:
        continue

    # Same fields as the full config, restricted to one chunk of C values.
    cfg_dat = {
        'data': data,
        'feature': feature,
        'seed': seed,
        'n_parallel': n_parallel,
        'max_iter': max_iter,
        'k_fold': k_fold,
        'model_f': model,
        'embed': embed,
        'C_list': C_lists[C_part],
        'C_part': C_part,
    }
    cfg_path = llama_path / f'{stem}-{C_part}-config'
    cfg_path.write_text(json.dumps(cfg_dat, indent=2))
    parts_paths.append(cfg_path)

print(f'Missing {len(parts_paths)} partitioned files.')

In [None]:
# One swarm command per outstanding partition config.
with open('swarm/baseline_parts.swarm', 'w') as f:
    f.writelines(
        f'python /data/MLDSST/xinaw/impactme/baseline_parts.py {cfg}\n'
        for cfg in parts_paths
    )

#### Concatenate partitions

WARNING: ONLY RUN THIS CODE IF THERE ARE NO MISSING FILES ANYMORE. 

In [None]:
from pathlib import Path
import pandas as pd
import pickle

import itertools  # not imported in this cell originally; keeps it self-contained

res_path = Path('results/raw/baseline')
res_parts_path = Path('results/raw/baseline_parts')
parts_paths = []

n_parts = len(C_lists)
# Combinations skipped because at least one partition result is absent.
incomplete = []

# One pass per (model, data, feature). The partitions are gathered inside
# the loop, so C_parts is not part of the outer grid.
for model, data, feature in itertools.product(models, datas, features):
    stem = f'{model}-{data}-{feature}'
    full_file = res_path / f'{stem}.pkl'

    # A full result already exists; never overwrite it.
    if full_file.exists():
        continue

    # Only merge complete sets, so a still-running or failed partition
    # cannot crash the cell midway or yield a truncated result.
    part_files = [res_parts_path / f'{stem}-{k}.pkl' for k in range(n_parts)]
    if not all(p.exists() for p in part_files):
        incomplete.append(stem)
        continue

    res_dict = None
    outputs_list = []
    for part_file in part_files:
        with open(part_file, 'rb') as f:
            part = pickle.load(f)
        outputs_list.append(part['outputs'])
        if res_dict is None:
            # Use the 0th partition as template.
            # NOTE(review): every key other than 'outputs' is taken from
            # partition 0 unchanged -- confirm none of them is
            # partition-specific.
            res_dict = part

    # Assumes each 'outputs' entry is a pandas object that pd.concat can
    # stack row-wise -- TODO confirm against baseline_parts.py.
    res_dict['outputs'] = pd.concat(outputs_list, ignore_index=True)

    # Write the full file
    with open(full_file, 'wb') as f:
        pickle.dump(res_dict, f)

    # Release the merged result before loading the next combination.
    del res_dict, outputs_list

if incomplete:
    print(f'Skipped {len(incomplete)} combinations with missing partitions:')
    print(incomplete)

## Permutation tests

### Test 1: Permuting all labels at the start

When loading the data from the features:

1. All labels are permuted
2. The outer k fold cross-validation is performed
3. The inner k-1 fold cross-validation is performed

This type of permutation tests how effective training on uninformative, stable inputs is for the prediction of uninformative, stable outputs. That is, it should illustrate the peak modeling performance on an arbitrary labeling system with the same positive/negative ratio as our original data. 

We can use the same config files but a different executable. 

We will delay executing Llama for now, as its compute requirements are slightly different from the BERT models. 

In [None]:
# Grid for the all-label permutation test.
datas = 'all pt_noshort turns'.split()

# Excluded features do not converge due to <3 subjects
features = (
    'a1 a2 a3 a4 a5 a6 a7 '  # 'a8' excluded
    'b1 b2 b3 '
    'c1 c2 c3 c4 '
    'd1 d2 d3 d4 d5 d6 '
    'e1 e2 e3 '  # 'e4' and 'e5' excluded
    'f1 f2 f3 '
    'g1 g2 '
    'a b c d e f g '
    'any'
).split()

# Llama ('llama_7b') is left out of this run.
models = ['bert_base_uncased', 'mental_bert']

In [None]:
import itertools
from pathlib import Path

# This test reuses the baseline config files, which are generated into
# config/baseline.
config_dir = Path('/data/MLDSST/xinaw/impactme/config/baseline')

# Grid for this permutation test, in model -> data -> feature order.
params = list(itertools.product(models, datas, features))

# One existing config path per combination; nothing is written here.
perm_all_paths = [
    config_dir / f'{model}-{data}-{feature}-config'
    for model, data, feature in params
]

# Write the config paths to swarm.
# Iterate perm_all_paths (built above), not the stale `config_paths` left
# over from the baseline section, so the swarm matches this cell's grid.
with open('swarm/perm_all.swarm', 'w') as f:
    for cfg_path in perm_all_paths:
        f.write(f'python /data/MLDSST/xinaw/impactme/perm_all.py {cfg_path}\n')

### Test 2: Permutation on training sets

1. Labels are read as normal
2. During inner cross-validation, the training y are permuted. 
3. During outer cross-validation, the training y are permuted.


In [None]:
import itertools
from pathlib import Path

# Training-set permutation test: reuse the baseline configs.
config_dir = Path('/data/MLDSST/xinaw/impactme/config/baseline')

datas = ['all', 'pt_noshort', 'turns']

# Excluded features do not converge due to <3 subjects
features = (
    'a1 a2 a3 a4 a5 a6 a7 '  # 'a8' excluded
    'b1 b2 b3 '
    'c1 c2 c3 c4 '
    'd1 d2 d3 d4 d5 d6 '
    'e1 e2 e3 '  # 'e4' and 'e5' excluded
    'f1 f2 f3 '
    'g1 g2 '
    'a b c d e f g '
    'any'
).split()

models = [
    'bert_base_uncased',
    'llama_7b',
    'llama_13b',
    'mental_bert',
    'mental_longformer',
]

# One existing config path per (model, data, feature) combination.
params = list(itertools.product(models, datas, features))
perm_trn_paths = [
    config_dir / f'{model}-{data}-{feature}-config'
    for model, data, feature in params
]

# Write the config paths to swarm
with open('swarm/perm_trn-llama.swarm', 'w') as f:
    f.writelines(
        f'python /data/MLDSST/xinaw/impactme/perm_trn.py {cfg}\n'
        for cfg in perm_trn_paths
    )


#### Check for missing files

The first portion checks for non-Llama models. 

In [None]:
# Find missing results
import os
import itertools

# Non-Llama models only; `datas` and `features` come from the cell above.
models = [
    'bert_base_uncased',
    'mental_bert',
    'mental_longformer',
]

# Result files already on disk vs. the full grid of expected result names.
res_f = os.listdir('results/raw/perm_trn')
res_combos = [
    f'{model}-{data}-{feature}.pkl'
    for model, data, feature in itertools.product(models, datas, features)
]

# Walk the grid in its own order rather than iterating a set difference:
# set order for strings changes between interpreter sessions, which would
# shuffle the swarm file on every regeneration.
res_done = set(res_f)
res_miss = [name[:-len('.pkl')] for name in res_combos if name not in res_done]

# Write the config paths to swarm
with open('swarm/perm_trn.swarm', 'w') as f:
    for stem in res_miss:
        f.write(f'python /data/MLDSST/xinaw/impactme/perm_trn.py /data/MLDSST/xinaw/impactme/config/baseline/{stem}-config\n')

# Count the number of missing files
print(f'Missing {len(res_miss)} results.')

An example call to run Llama in swarm:

```
SBATCH_PARTITION=quick swarm --verbose 4 -g  128 -t 128 -b 4 --time=60 /data/MLDSST/xinaw/impactme/swarm/perm_trn-llama.swarm
```

#### Missing Llama files

Missing Llama results should be allocated more walltime and run separately. 

In [None]:
# Find missing results
import os
import itertools

datas = [
    'all',
    'pt_noshort',
    'turns'
]

features = [
    'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', # 'a8',
    'b1', 'b2', 'b3',
    'c1', 'c2', 'c3', 'c4',
    'd1', 'd2', 'd3', 'd4', 'd5', 'd6',
    'e1', 'e2', 'e3', # 'e4', # 'e5',
    'f1', 'f2', 'f3',
    'g1', 'g2',
    'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'any'
]

models = [
    'llama3_8b',
    'llama3_8b_instruct'
]

# Result files already on disk vs. the full Llama grid of expected names.
res_f = os.listdir('results/raw/perm_trn')
res_combos = [
    f'{model}-{data}-{feature}.pkl'
    for model, data, feature in itertools.product(models, datas, features)
]

# Walk the grid in its own order rather than iterating a set difference:
# set order for strings changes between interpreter sessions, which would
# shuffle the swarm file on every regeneration.
res_done = set(res_f)
res_miss = [name[:-len('.pkl')] for name in res_combos if name not in res_done]

# Write the config paths to swarm
# NOTE(review): the "(Optional) Parts" cell writes perm_trn_parts.py
# commands to this same file name and will overwrite it -- confirm the
# shared name is intended.
with open('swarm/perm_trn_parts-llama3.swarm', 'w') as f:
    for stem in res_miss:
        f.write(f'python /data/MLDSST/xinaw/impactme/perm_trn.py /data/MLDSST/xinaw/impactme/config/baseline/{stem}-config\n')

# NOT ALL
# Count the number of missing files
print(f'Missing {len(res_miss)} results.')
print(res_miss)

##### (Optional) Parts

Given a couple of permutation runs, it may be necessary to partition the remaining permutations. 

In [None]:
import itertools
from pathlib import Path
from os.path import exists
import json

# Directory that receives the per-partition permutation configs.
# NOTE: this rebinds `llama_path` used by earlier cells.
llama_path = Path('/data/MLDSST/xinaw/impactme/config/perm_trn_parts')
# parents=True: also create a missing intermediate `config/` directory
# instead of failing with FileNotFoundError.
llama_path.mkdir(parents=True, exist_ok=True)

seed = 116
n_parallel = 64
max_iter = 4000 # Use 4000 for parts
k_fold = 4
embed = 'last_avg'

# The full C grid split into four consecutive chunks; each chunk becomes
# its own config (and therefore its own job).
C_lists = [
    [0.0001, 0.0005, 0.001, 0.005],
    [0.01, 0.05, 0.1, 0.5],
    [1., 5., 10., 50.],
    [100., 500., 1000., 5000.]
]

# Partition indices 0..3, used in config/result file names.
C_parts = range(len(C_lists))

# `datas` and `features` are taken from the previous cell.
models = [
    'llama3_8b',
    'llama3_8b_instruct'
]


In [None]:
from pathlib import Path
import itertools

res_path = Path('results/raw/perm_trn')
res_parts_path = Path('results/raw/perm_trn_parts')
parts_paths = []

params = list(itertools.product(models, datas, features, C_parts))
for model, data, feature, C_part in params:
    stem = f'{model}-{data}-{feature}'

    # Nothing to do when either the full permutation result or this
    # partition's result is already on disk.
    already_done = (
        exists(res_path / f'{stem}.pkl')
        or exists(res_parts_path / f'{stem}-{C_part}.pkl')
    )
    if already_done:
        continue

    # Same fields as the full config, restricted to one chunk of C values.
    cfg_dat = {
        'data': data,
        'feature': feature,
        'seed': seed,
        'n_parallel': n_parallel,
        'max_iter': max_iter,
        'k_fold': k_fold,
        'model_f': model,
        'embed': embed,
        'C_list': C_lists[C_part],
        'C_part': C_part,
    }
    cfg_path = llama_path / f'{stem}-{C_part}-config'
    cfg_path.write_text(json.dumps(cfg_dat, indent=2))
    parts_paths.append(cfg_path)

print(f'Missing {len(parts_paths)} files.')

In [None]:
# One swarm command per outstanding partitioned permutation config.
with open('swarm/perm_trn_parts-llama3.swarm', 'w') as f:
    f.writelines(
        f'python /data/MLDSST/xinaw/impactme/perm_trn_parts.py {cfg}\n'
        for cfg in parts_paths
    )