# Logistic regression tuning

Data were originally tuned in `curium`. 

## Parameters

In [None]:
# Earlier model selection ('llama_13b' was disabled in it). It is kept for
# reference only: the reassignment right below replaces it.
models = ['bert_base_uncased', 'llama_7b', 'mental_bert', 'mental_longformer']

# Active model selection, i.e. the value every later cell sees.
models = ['llama3_8b', 'llama3_8b_instruct', 'llama_13b']

datas = ['all', 'pt_noshort', 'turns']

# How many numbered features are kept per letter. 'a8', 'e4' and 'e5' are
# left out because they do not converge due to <3 subjects.
_kept_per_letter = {'a': 7, 'b': 3, 'c': 4, 'd': 6, 'e': 3, 'f': 3, 'g': 2}

# Numbered features first ('a1' ... 'g2'), then the bare letters, then 'any'.
features = [
    f'{letter}{number}'
    for letter, count in _kept_per_letter.items()
    for number in range(1, count + 1)
]
features.extend(_kept_per_letter)
features.append('any')

k = 4

# Candidate C values: 1 and 5 times each power of ten from 1e-4 to 1e3.
C_list = [
    1e-4, 5e-4,
    1e-3, 5e-3,
    1e-2, 5e-2,
    1e-1, 5e-1,
    1e0, 5e0,
    1e1, 5e1,
    1e2, 5e2,
    1e3, 5e3,
]

# Full set of training types, immediately narrowed to the baseline only.
trn_types = ['baseline', 'perm_trn']
trn_types = ['baseline']

read_dir = '/data/MLDSST/xinaw/impactme/results/raw'
write_dir = '/data/MLDSST/xinaw/impactme/results/tuned'

## Tuning

Tuning selects the best C value for each outer fold: the C with the highest mean ROC AUC across that fold's inner folds.

In [None]:
import itertools
import numpy as np
import pandas as pd

# For every (trn_type, model, data, feature) combination and every outer fold,
# pick the C with the highest mean inner-fold ROC AUC, then record the ROC AUC
# stored for that C on the row with inner_fold == -1.
results = []

params = list(itertools.product(trn_types, models, datas, features))

for trn_type, model, data, feature in params:
    try:
        df = pd.read_csv(f'results/roc_auc/{trn_type}/{model}-{data}-{feature}.csv')
    # If file isn't found, continue
    except FileNotFoundError:
        print(f'File not found: {trn_type}, {model}-{data}-{feature}')
        continue

    # Mean ROC AUC per (outer_fold, C) over the inner folds only; rows with
    # inner_fold == -1 are excluded from the average.
    inner_mean = (
        df[df['inner_fold'] > -1]
        .groupby(['outer_fold', 'C'])['roc_auc']
        .mean()
    )

    # idxmax returns the (outer_fold, C) index label of the best mean score,
    # so best_C comes from the data itself. Indexing C_list by position is
    # only right when every value of C_list is present in the file, in order.
    best_pairs = inner_mean.groupby(level='outer_fold').idxmax().tolist()

    # Rows with inner_fold == -1 supply the ROC AUC reported for each fold.
    final_rows = df[df['inner_fold'] == -1]
    outer_df = pd.DataFrame({
        'outer_fold': [fold for fold, _ in best_pairs],
        'roc_auc': [
            final_rows.loc[
                (final_rows['outer_fold'] == fold) & (final_rows['C'] == best_c),
                'roc_auc',
            ].values[0]
            for fold, best_c in best_pairs
        ],
        'best_C': [best_c for _, best_c in best_pairs],
    })
    outer_df['model'] = model
    outer_df['feature'] = feature
    outer_df['data'] = data
    outer_df['trn_type'] = trn_type

    results.append(outer_df)

tuned_df = pd.concat(results)
tuned_df.head(10)

### Save results

In [None]:
from pathlib import Path

# Output folder for tuned results; created (with parents) if it is missing.
tuned_dir = Path('results/tuned')
tuned_dir.mkdir(exist_ok=True, parents=True)

# One row per outer fold and parameter combination, written without the index.
tuned_csv = tuned_dir / 'roc_auc-kfold-0515.csv'
tuned_df.to_csv(tuned_csv, index=False)

## Calculate ROC AUC values

### Create the concatenated file

First, a file with only the best results from each outer fold is generated. The following takes around 5 minutes to run, mostly due to the `pickle` reads.

In [None]:
import itertools
from tqdm import tqdm
from pathlib import Path
import pickle
import pandas as pd
from os.path import exists

# Don't bother with perm_trn for now
trn_types = ['baseline']
params = list(itertools.product(datas, features, models, trn_types))
tuned = pd.read_csv('results/tuned/roc_auc-kfold-0515.csv')
res_dir = Path('results/raw')
k = 4

# One frame per parameter combination, holding only the rows produced with
# the tuned C of each outer fold.
concat_best = []

for data, feature, model, trn_type in tqdm(params):

    pkl_path = res_dir / f'{trn_type}/{model}-{data}-{feature}.pkl'
    if not exists(pkl_path):
        continue

    tuned_rows = tuned[
        (tuned['model'] == model) &
        (tuned['data'] == data) &
        (tuned['feature'] == feature) &
        (tuned['trn_type'] == trn_type)
    ]

    # A pickle can exist for a combination that has no tuned rows. Report it
    # and move on instead of failing with an IndexError further down. This is
    # checked before the pickle is read because the read is the slow part.
    if tuned_rows.empty:
        print(f'No tuned results: {trn_type}, {model}-{data}-{feature}')
        continue

    # NOTE(review): pickle.load can execute arbitrary code; only read files
    # produced by this project.
    with open(pkl_path, 'rb') as f:
        res = pickle.load(f)
    res = res['outputs']

    temp = []

    for i in range(k):
        best_C = tuned_rows.loc[tuned_rows['outer_fold'] == i, 'best_C'].values[0]
        # Keep the rows for this outer fold at its tuned C, taken from the
        # entries with inner_fold == -1.
        temp.append(
            res[
                (res['C'] == best_C) &
                (res['outer_fold'] == i) &
                (res['inner_fold'] == -1)
            ]
        )

    temp = pd.concat(temp, ignore_index=True)
    temp['model'] = model
    temp['trn_type'] = trn_type

    concat_best.append(temp)

In [None]:
# Spot-check the most recently collected frame. The number of frames depends
# on the parameter lists and on which result files exist, so a fixed position
# such as 810 can be out of range.
concat_best[-1].tail()

In [None]:
# Stack the per-combination frames into one table with a fresh 0..n-1 index,
# then write it out without that index.
concat_best_df = pd.concat(objs=concat_best, ignore_index=True)
concat_best_df.to_csv(path_or_buf='results/tuned/concat-0515.csv', index=False)

### Calculations

This step proceeds like the one above, but runs within swarm, since each ROC AUC calculation takes a few seconds. Because reading the concatenated file takes a while, we should not parallelize the ROC AUC calculations too finely: it would be uneconomical for each job to spend most of its time reading the file.

In [None]:
from pathlib import Path
import json
import itertools

# Write one JSON config per (data, feature) pair and remember each path so
# the swarm file can reference it.
config_dir = Path('/data/MLDSST/xinaw/impactme/config/roc_auc-concat')
# parents=True so a missing intermediate config directory is created too.
config_dir.mkdir(parents=True, exist_ok=True)
config_paths = []

params = list(itertools.product(datas, features))

for data, feature in params:
    cfg_dat = dict(
        data=data,
        feature=feature,
        models=models,
        C_list=C_list,
        read_dir=read_dir,
        write_dir=write_dir,
        trn_types=trn_types,
    )
    # Config files are named '<data>-<feature>' with no extension.
    cfg_path = config_dir / f'{data}-{feature}'
    cfg_path.write_text(json.dumps(cfg_dat, indent=2))
    config_paths.append(cfg_path)

In [None]:
# One command line per config file, each running roc_auc-concat.py on it.
swarm_commands = [
    f'python /data/MLDSST/xinaw/impactme/roc_auc-concat.py {cfg_path}\n'
    for cfg_path in config_paths
]

with open('swarm/roc_auc-concat.swarm', 'w') as swarm_file:
    swarm_file.writelines(swarm_commands)

We can then concatenate the ROC AUC outputs. 

In [None]:
import itertools
import pandas as pd

params = list(itertools.product(datas, features))
res_dir = Path('results/tuned/roc_auc-concat')

# Read the per-(data, feature) ROC AUC CSVs in parameter order and stack them
# into one table with a fresh index.
roc_auc_df = pd.concat(
    [pd.read_csv(res_dir / f'{data}-{feature}.csv') for data, feature in params],
    ignore_index=True,
)
roc_auc_df.head()

In [None]:
# Write the combined ROC AUC table without the row index.
roc_auc_df.to_csv(path_or_buf='results/tuned/roc_auc-concat.csv', index=False)