# Inner Split Function

In [32]:
def confoundSplit(p_pos_train_z1, p_pos_train_z0, p_mix_z1, alpha_test):
    """
    Derive the test-set positive rates for the two groups z = 0 and z = 1.

    The mixing proportion p(z = 1) = p_mix_z1 is used for both train and test.
    The overall positive rate C_y is computed from the training rates, and the
    test rates are chosen so that

        p_test(y=1) = C_y    and    p_test(y=1|z=1) = alpha_test * p_test(y=1|z=0).

    Parameters
    ----------
    p_pos_train_z1, p_pos_train_z0 : positive rate in the training set for
        z = 1 and z = 0; each must lie in [0, 1].
    p_mix_z1 : share of z = 1 samples; must lie in [0, 1].
    alpha_test : ratio of the z = 1 to the z = 0 positive rate in the test set;
        must be >= 0.

    Returns
    -------
    dict with the inputs, the derived rates and the constants C_y / C_z.

    Raises
    ------
    AssertionError if an argument is outside the range stated above.

    Note: the derived test rates are NOT checked against [0, 1]; a large
    alpha_test can produce p_pos_test_z1 > 1.
    """
    # Range checks on the three probabilities and on the ratio.
    for probability in (p_pos_train_z1, p_pos_train_z0, p_mix_z1):
        assert 0 <= probability <= 1
    assert alpha_test >= 0

    share_z1 = p_mix_z1
    share_z0 = 1 - p_mix_z1

    # C_y = p(z=0) * p_train(y=1|z=0) + p(z=1) * p_train(y=1|z=1)
    overall_pos_rate = share_z0 * p_pos_train_z0 + share_z1 * p_pos_train_z1

    # Solve C_y = p(z=0) * q + p(z=1) * alpha * q for q = p_test(y=1|z=0).
    test_rate_z0 = overall_pos_rate / (1 - (1-alpha_test) * share_z1)
    test_rate_z1 = alpha_test * test_rate_z0

    summary = {}
    summary["p_pos_train_z0"] = p_pos_train_z0
    summary["p_pos_train_z1"] = p_pos_train_z1
    summary["p_pos_train"] = overall_pos_rate
    summary["p_pos_test"] = overall_pos_rate
    summary["p_mix_z0"] = share_z0
    summary["p_mix_z1"] = p_mix_z1
    summary["alpha_test"] = alpha_test
    summary["p_pos_test_z0"] = test_rate_z0
    summary["p_pos_test_z1"] = test_rate_z1
    summary["C_y"] = overall_pos_rate
    summary["C_z"] = share_z1
    return summary
    

In [3]:
# Sanity check: with alpha_test = 1 both test-set positive rates equal the overall rate C_y.
confoundSplit(p_pos_train_z0=0.6, p_pos_train_z1 = 0.2, p_mix_z1 = 0.5, alpha_test = 1)

{'p_pos_train_z0': 0.6,
 'p_pos_train_z1': 0.2,
 'p_pos_train': 0.4,
 'p_pos_test': 0.4,
 'p_mix_z0': 0.5,
 'p_mix_z1': 0.5,
 'alpha': 1,
 'p_pos_test_z0': 0.4,
 'p_pos_test_z1': 0.4,
 'C_y': 0.4,
 'C_z': 0.5}

In [4]:
# Same check with an unbalanced mix (20% of samples from z = 1); alpha_test = 1 still gives equal test rates.
confoundSplit(p_pos_train_z0=0.8, p_pos_train_z1 = 0.2, p_mix_z1 = 0.2, alpha_test = 1)

{'p_pos_train_z0': 0.8,
 'p_pos_train_z1': 0.2,
 'p_pos_train': 0.6800000000000002,
 'p_pos_test': 0.6800000000000002,
 'p_mix_z0': 0.8,
 'p_mix_z1': 0.2,
 'alpha': 1,
 'p_pos_test_z0': 0.6800000000000002,
 'p_pos_test_z1': 0.6800000000000002,
 'C_y': 0.6800000000000002,
 'C_z': 0.2}

In [5]:
# Lower training positive rate for z = 1 (0.1): C_y changes, the test rates still coincide because alpha_test = 1.
confoundSplit(p_pos_train_z0=0.8, p_pos_train_z1 = 0.1, p_mix_z1 = 0.2, alpha_test = 1)

{'p_pos_train_z0': 0.8,
 'p_pos_train_z1': 0.1,
 'p_pos_train': 0.6600000000000001,
 'p_pos_test': 0.6600000000000001,
 'p_mix_z0': 0.8,
 'p_mix_z1': 0.2,
 'alpha': 1,
 'p_pos_test_z0': 0.6600000000000001,
 'p_pos_test_z1': 0.6600000000000001,
 'C_y': 0.6600000000000001,
 'C_z': 0.2}

In [6]:
# alpha_test = 2 doubles the z = 1 test rate relative to z = 0.
# NOTE: for these inputs p_pos_test_z1 comes out as 1.1, which is not a valid probability;
# confoundSplit does not reject such results.
confoundSplit(p_pos_train_z0=0.8, p_pos_train_z1 = 0.1, p_mix_z1 = 0.2, alpha_test = 2)

{'p_pos_train_z0': 0.8,
 'p_pos_train_z1': 0.1,
 'p_pos_train': 0.6600000000000001,
 'p_pos_test': 0.6600000000000001,
 'p_mix_z0': 0.8,
 'p_mix_z1': 0.2,
 'alpha': 2,
 'p_pos_test_z0': 0.5500000000000002,
 'p_pos_test_z1': 1.1000000000000003,
 'C_y': 0.6600000000000001,
 'C_z': 0.2}

# Outer Wrapper accepting two datasets and some extra parameters

In [7]:
import pandas as pd
import numpy as np

### WLS

In [8]:

# Load the WLS transcripts and the WLS label sheet, inner-join them on
# `file` == `idtlkbnk`, and reduce the cutoff column to a numeric `label`
# (1.0 = 'Y', 0.0 = 'N'). Rows without a usable label are dropped.
df_wls = pd.read_csv("/edata/TRESTLE/testWLS.tsv", sep='\t')

df_wls_label = pd.read_csv("/edata/TRESTLE/WLS-labels.csv")

df_wls_merge = df_wls.merge(df_wls_label, left_on='file', right_on='idtlkbnk', how='inner')

# rename() returns a new frame, so the merge result is not mutated in place.
df_wls_merge = df_wls_merge.rename(
    columns={"> 1 sd below mean for normals ages 60-79 (Tombaugh, Kozak, & Rees, 1999) -- normal cutoff = 12+ for 9-12 yrs eductation, 14+ for 13-21 yrs education":
             "label",
            },
)

# Explicit encoding table. Series.map turns every value that is not a key
# (including NaN) into NaN. The previous np.select call had no `default`, so
# any unexpected label value was silently coded as 0 (negative); such rows are
# now dropped below instead of being mislabelled.
# Lower-case 'y'/'n' are accepted as spelling variants of 'Y'/'N'.
label_codes = {'Y': 1.0, 'y': 1.0, 'N': 0.0, 'n': 0.0}
df_wls_merge['label'] = df_wls_merge['label'].map(label_codes)

df_wls_merge = df_wls_merge.loc[df_wls_merge['label'].notna(), :].reset_index(drop=True)

In [9]:
# Class balance of the encoded WLS labels; dropna=False would surface any NaN label that survived the filter.
df_wls_merge.groupby('label', dropna=False).size()

label
0.0    1167
1.0     110
dtype: int64

### ADReSS

In [10]:
# Read the ADReSS train and test frames, stack them into one frame with a fresh
# 0..n-1 index, and expose the transcript column under the name `text`.
df_adress_train = pd.read_csv("/edata/ADReSS-IS2020-data/dataframes/adre_train.csv")
df_adress_test = pd.read_csv("/edata/ADReSS-IS2020-data/dataframes/adre_test.csv")

df_adress = (
    pd.concat([df_adress_train, df_adress_test], ignore_index=True)
    .rename(columns={"sentence": "text"})
)

In [11]:
# Inspect the combined ADReSS frame (columns after the rename: sentence_source, label, text, plus two index-like columns).
df_adress

Unnamed: 0.1,Unnamed: 0,sentence_source,label,text
0,6,S094.txt,1,oh yes a little girl and the little boy is ...
1,0,S138.txt,1,the the water's flowing on the floor and sh...
2,30,S118.txt,1,oh there's a cookie jar and a youngster with ...
3,37,S114.txt,1,mhm well the kids is robbing a cookie jar ...
4,49,S143.txt,1,well little boy reaching out for the cookie j...
...,...,...,...,...
151,43,S198.txt,1,you mean like the woman doing the dishes and ...
152,44,S180.txt,0,well the boy is taking cookies outof the cook...
153,45,S194.txt,1,well the mother has water spilling all over t...
154,46,S176.txt,1,whew do i hafta use my my my personal descr...


## Wrapper Function

In [12]:
# Number of negative (label == 0) WLS rows; confoundSplitNumbers counts this the same way as N_df0_neg.
(df_wls_merge['label'] == 0).sum()

1167

In [13]:
# Class balance of the WLS labels (same query as in the WLS loading section).
df_wls_merge.groupby('label', dropna=False).size()

label
0.0    1167
1.0     110
dtype: int64

In [14]:
# Reference parameters for the wrapper below. NOTE: p_pos_test_z1 is 1.1 here, i.e. not a valid probability.
confoundSplit(p_pos_train_z0=0.8, p_pos_train_z1 = 0.1, p_mix_z1 = 0.2, alpha_test = 2)

{'p_pos_train_z0': 0.8,
 'p_pos_train_z1': 0.1,
 'p_pos_train': 0.6600000000000001,
 'p_pos_test': 0.6600000000000001,
 'p_mix_z0': 0.8,
 'p_mix_z1': 0.2,
 'alpha': 2,
 'p_pos_test_z0': 0.5500000000000002,
 'p_pos_test_z1': 1.1000000000000003,
 'C_y': 0.6600000000000001,
 'C_z': 0.2}

In [15]:
import math


def confoundSplitNumbers(df0, df1, 
                    df0_label, df1_label,
                    p_pos_train_z1, p_pos_train_z0, p_mix_z1, alpha_test, 
                    train_test_ratio = 4,
                   ):
    
    """
    Find integer sample counts for a confounded train/test split of two datasets.

    df0 supplies the z = 0 samples and df1 the z = 1 samples. The target rates
    come from confoundSplit(). All counts are obtained with math.floor, so the
    realised proportions only approximate the targets.

    The search starts with floor(N_df0 / (train_test_ratio + 1)) positive df0
    test rows (N_df0 = number of rows in df0) and decreases that number by one
    until all eight cells are non-empty and neither dataset is asked for more
    positive or more negative rows than it contains.

    Parameters
    ----------
    df0, df1 : frames holding the label columns named below.
    df0_label, df1_label : label column names; values must be 0/1, or True/False coded.
    p_pos_train_z1, p_pos_train_z0, p_mix_z1, alpha_test : forwarded unchanged
        to confoundSplit().
    train_test_ratio : size of the train split divided by size of the test split.

    Returns
    -------
    dict with the keys n_df{0,1}_{train,test}_{pos,neg}, or None when no
    feasible split exists.

    Raises
    ------
    AssertionError if a label column contains values other than 0/1, or if
    confoundSplit() rejects the parameters.
    """
    assert df0[df0_label].isin([0,1]).all(axis=0)
    assert df1[df1_label].isin([0,1]).all(axis=0)
    
    mix_param_dict = confoundSplit(p_pos_train_z0=p_pos_train_z0, p_pos_train_z1 = p_pos_train_z1, p_mix_z1 = p_mix_z1, alpha_test = alpha_test)
    
    # Targets are loop-invariant: look them up once.
    rate_train_z0 = mix_param_dict['p_pos_train_z0']
    rate_train_z1 = mix_param_dict['p_pos_train_z1']
    rate_test_z0 = mix_param_dict['p_pos_test_z0']
    rate_test_z1 = mix_param_dict['p_pos_test_z1']
    share_z1 = mix_param_dict['C_z']
    
    # Every cell must end up with at least one sample. That is impossible when a
    # rate (or the z = 1 share) is not strictly between 0 and 1: the matching
    # floor() below is then <= 0 for every candidate. Returning None here gives
    # the same answer as running the search, and also covers share_z1 == 1 and
    # rate_test_z0 == 0, which previously ended in a division by zero.
    targets = (rate_train_z0, rate_train_z1, rate_test_z0, rate_test_z1, share_z1)
    if not all(0 < target < 1 for target in targets):
        return None
    
    N_df0_pos = (df0[df0_label] == 1).sum()
    N_df0_neg = (df0[df0_label] == 0).sum()
    
    N_df1_pos = (df1[df1_label] == 1).sum()
    N_df1_neg = (df1[df1_label] == 0).sum()
    
    N_df0 = N_df0_pos + N_df0_neg
    
    # Number of z = 1 samples per z = 0 sample implied by the mixing share.
    z1_per_z0 = share_z1 / (1 - share_z1)
    
    start = math.floor(N_df0 / (train_test_ratio + 1))
    
    for n_df0_test_pos in range(start, 0, -1):
        
        # df0 test negatives that give the target positive rate in the df0 test cell.
        n_df0_test_neg = math.floor(n_df0_test_pos / rate_test_z0 * (1 - rate_test_z0))
        
        # df0 train split is train_test_ratio times the df0 test split.
        n_df0_train_size = (n_df0_test_pos + n_df0_test_neg) * train_test_ratio
        n_df0_train_pos = math.floor(n_df0_train_size * rate_train_z0)
        n_df0_train_neg = math.floor(n_df0_train_size * (1 - rate_train_z0))
        
        # df1 train split is sized from the (floored) df0 train split.
        n_df1_train = math.floor(z1_per_z0 * (n_df0_train_pos + n_df0_train_neg))
        n_df1_train_pos = math.floor(n_df1_train * rate_train_z1)
        n_df1_train_neg = math.floor(n_df1_train * (1 - rate_train_z1))
        
        n_df1_test = math.floor(n_df1_train/train_test_ratio)
        n_df1_test_pos = math.floor(n_df1_test * rate_test_z1)
        n_df1_test_neg = math.floor(n_df1_test * (1 - rate_test_z1))
        
        counts = {"n_df0_train_pos": n_df0_train_pos,
                  "n_df0_test_pos": n_df0_test_pos,
                  "n_df0_train_neg": n_df0_train_neg,
                  "n_df0_test_neg": n_df0_test_neg,
                  
                  "n_df1_train_pos": n_df1_train_pos,
                  "n_df1_test_pos": n_df1_test_pos,
                  "n_df1_train_neg": n_df1_train_neg,
                  "n_df1_test_neg": n_df1_test_neg,
                 }
        
        # No cell may be empty ...
        all_cells_filled = all(count > 0 for count in counts.values())
        
        # ... and neither dataset may be asked for more rows of a class than it has.
        enough_samples = (
            n_df0_train_pos + n_df0_test_pos <= N_df0_pos
            and n_df0_train_neg + n_df0_test_neg <= N_df0_neg
            and n_df1_train_pos + n_df1_test_pos <= N_df1_pos
            and n_df1_train_neg + n_df1_test_neg <= N_df1_neg
        )
        
        if all_cells_filled and enough_samples:
            return counts
    
    # Search exhausted without finding a feasible split.
    return None

In [16]:
# Rates that the two wrapper calls below use. p_pos_test_z1 = 1.1 (> 1), so no feasible split can exist for them.
confoundSplit(p_pos_train_z0=0.8, p_pos_train_z1 = 0.1, p_mix_z1 = 0.2, alpha_test = 2)

{'p_pos_train_z0': 0.8,
 'p_pos_train_z1': 0.1,
 'p_pos_train': 0.6600000000000001,
 'p_pos_test': 0.6600000000000001,
 'p_mix_z0': 0.8,
 'p_mix_z1': 0.2,
 'alpha': 2,
 'p_pos_test_z0': 0.5500000000000002,
 'p_pos_test_z1': 1.1000000000000003,
 'C_y': 0.6600000000000001,
 'C_z': 0.2}

In [17]:

# Returns None (hence no output): with these parameters p_pos_test_z1 is 1.1, so the
# df1 test-negative count floor(n * (1 - 1.1)) can never be positive.
confoundSplitNumbers(df0=df_wls_merge, df1=df_adress, 
                    df0_label='label', df1_label='label',
                    
                    p_pos_train_z0=0.8, p_pos_train_z1 = 0.1, p_mix_z1 = 0.2, alpha_test = 2,
                    
                    train_test_ratio = 5,
                   )

In [18]:

# Same infeasible rates with train_test_ratio = 1; also returns None because the
# ratio does not affect p_pos_test_z1 (still 1.1).
confoundSplitNumbers(df0=df_wls_merge, df1=df_adress, 
                    df0_label='label', df1_label='label',
                    
                    p_pos_train_z0=0.8, p_pos_train_z1 = 0.1, p_mix_z1 = 0.2, alpha_test = 2,
                    
                    train_test_ratio = 1,
                   )



In [21]:
# Preview of a 0.1-spaced grid; the stop value 1 is excluded by np.arange.
np.arange(0, 1, 0.1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [22]:
import itertools

In [23]:

# Enumerate the parameter grid and keep every point for which
# confoundSplitNumbers finds a feasible split (i.e. does not return None).
# Tuple layout: (p_pos_train_z0, p_pos_train_z1, p_mix_z1, alpha_test, train_test_ratio).
param_grid = itertools.product(
    [0.2],
    [0.4],
    np.arange(0.1, 0.999, 0.1),   # 0.1 ... 0.9
    np.arange(0,10,0.1),          # 0.0 ... 9.9
    [4],
)

valid_combinations = []

for combination in param_grid:
    grid_p_z0, grid_p_z1, grid_mix_z1, grid_alpha, grid_ratio = combination

    ret = confoundSplitNumbers(
        df0=df_wls_merge, df1=df_adress,
        df0_label='label', df1_label='label',
        p_pos_train_z0=grid_p_z0, p_pos_train_z1=grid_p_z1,
        p_mix_z1=grid_mix_z1, alpha_test=grid_alpha,
        train_test_ratio=grid_ratio,
    )

    if ret is None:
        continue
    valid_combinations.append(combination)
        
    

In [24]:
# Number of grid points with a feasible split.
len(valid_combinations)

638

In [25]:
# Tuple layout: (p_pos_train_z0, p_pos_train_z1, p_mix_z1, alpha_test, train_test_ratio).
# NOTE: the grid above fixes c[1] (p_pos_train_z1) at 0.4, so the 0.6-0.65 condition can never match.
len([c for c in valid_combinations if (0.1 <= c[0] <= 0.2) and (0.6 <= c[1] <= 0.65) and (c[4] == 4)])

0

In [26]:
# Wider window, additionally restricted to alpha_test (c[3]) in [1, 1.5].
# NOTE: still empty for the grid above, because c[1] (p_pos_train_z1) is always 0.4 there.
len([c for c in valid_combinations if (0.1 <= c[0] <= 0.3) and (0.6 <= c[1] <= 0.8) and (c[4] == 4) and (1 <= c[3] <= 1.5)])

0

In [27]:
# Keep only the feasible grid points whose split has at least 10 positive df0
# training samples (and a train/test ratio of 4).
# Tuple layout: (p_pos_train_z0, p_pos_train_z1, p_mix_z1, alpha_test, train_test_ratio).
valid_high_combinations = []

for combination in valid_combinations:
    sel_p_z0, sel_p_z1, sel_mix_z1, sel_alpha, sel_ratio = combination

    # The counts were not stored during the grid search, so they are recomputed here.
    ret = confoundSplitNumbers(
        df0=df_wls_merge, df1=df_adress,
        df0_label='label', df1_label='label',
        p_pos_train_z0=sel_p_z0, p_pos_train_z1=sel_p_z1,
        p_mix_z1=sel_mix_z1, alpha_test=sel_alpha,
        train_test_ratio=sel_ratio,
    )

    if ret['n_df0_train_pos'] < 10:
        continue
    if sel_ratio != 4:
        continue
    valid_high_combinations.append(combination)
    
    
    

In [28]:
# Number of feasible grid points that also have >= 10 positive df0 training samples.
len(valid_high_combinations)

530

In [29]:
# Displays the full list; each tuple is (p_pos_train_z0, p_pos_train_z1, p_mix_z1, alpha_test, train_test_ratio).
# NOTE(review): this prints every entry - consider a slice such as valid_high_combinations[:10].
valid_high_combinations

[(0.2, 0.4, 0.1, 0.4, 4),
 (0.2, 0.4, 0.1, 0.5, 4),
 (0.2, 0.4, 0.1, 0.6000000000000001, 4),
 (0.2, 0.4, 0.1, 0.7000000000000001, 4),
 (0.2, 0.4, 0.1, 0.8, 4),
 (0.2, 0.4, 0.1, 0.9, 4),
 (0.2, 0.4, 0.1, 1.0, 4),
 (0.2, 0.4, 0.1, 1.1, 4),
 (0.2, 0.4, 0.1, 1.2000000000000002, 4),
 (0.2, 0.4, 0.1, 1.3, 4),
 (0.2, 0.4, 0.1, 1.4000000000000001, 4),
 (0.2, 0.4, 0.1, 1.5, 4),
 (0.2, 0.4, 0.1, 1.6, 4),
 (0.2, 0.4, 0.1, 1.7000000000000002, 4),
 (0.2, 0.4, 0.1, 1.8, 4),
 (0.2, 0.4, 0.1, 1.9000000000000001, 4),
 (0.2, 0.4, 0.1, 2.0, 4),
 (0.2, 0.4, 0.1, 2.1, 4),
 (0.2, 0.4, 0.1, 2.2, 4),
 (0.2, 0.4, 0.1, 2.3000000000000003, 4),
 (0.2, 0.4, 0.1, 2.4000000000000004, 4),
 (0.2, 0.4, 0.1, 2.5, 4),
 (0.2, 0.4, 0.1, 2.6, 4),
 (0.2, 0.4, 0.1, 2.7, 4),
 (0.2, 0.4, 0.1, 2.8000000000000003, 4),
 (0.2, 0.4, 0.1, 2.9000000000000004, 4),
 (0.2, 0.4, 0.1, 3.0, 4),
 (0.2, 0.4, 0.1, 3.1, 4),
 (0.2, 0.4, 0.1, 3.2, 4),
 (0.2, 0.4, 0.1, 3.3000000000000003, 4),
 (0.2, 0.4, 0.1, 3.4000000000000004, 4),
 (0.2, 0.4, 0.

In [30]:

# One hand-picked parameter set, in the tuple layout
# (p_pos_train_z0, p_pos_train_z1, p_mix_z1, alpha_test, train_test_ratio).
# NOTE: p_pos_train_z1 = 0.6 is not a point of the grid searched above, which fixes it at 0.4.
combination= (0.2, 0.6, 0.6, 4.800000000000001, 4)

ret = confoundSplitNumbers(df0=df_wls_merge, df1=df_adress, 
                            df0_label='label', df1_label='label',

                            p_pos_train_z0=combination[0], p_pos_train_z1 = combination[1], p_mix_z1 = combination[2], alpha_test = combination[3],

                            train_test_ratio = combination[4],
                           )

In [31]:
# Per-cell sample counts found for the hand-picked parameter set.
ret

{'n_df0_train_pos': 11,
 'n_df0_test_pos': 2,
 'n_df0_train_neg': 44,
 'n_df0_test_neg': 12,
 'n_df1_train_pos': 49,
 'n_df1_test_pos': 12,
 'n_df1_train_neg': 32,
 'n_df1_test_neg': 7}

In [1]:
from sacred import Experiment


In [5]:
# Create a sacred Experiment named "Test Exp>>>>".
# NOTE(review): interactive=True is presumably needed because this runs in a notebook - confirm against the sacred docs.
ex = Experiment("Test Exp>>>>",interactive=True)

In [6]:
# Show the Experiment object's repr.
ex

<sacred.experiment.Experiment at 0x7f1e67040ee0>

In [9]:
# Exploration only: list the attributes and methods available on the Experiment object.
dir(ex)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_named_config',
 '_check_command',
 '_create_config_dict',
 '_create_run',
 '_handle_help',
 '_is_traversing',
 'add_artifact',
 'add_config',
 'add_named_config',
 'add_package_dependency',
 'add_resource',
 'add_source_file',
 'additional_cli_options',
 'additional_host_info',
 'all_cli_options',
 'automain',
 'base_dir',
 'capture',
 'captured_functions',
 'captured_out_filter',
 'command',
 'commands',
 'config',
 'config_hook',
 'config_hooks',
 'configurations',
 'current_run',
 'default_command',
 'dependencies',
 'doc',
 'gather_commands',
 'gather_named_configs',
 'get_default_options',
 'get_experiment_info

In [14]:
# For this experiment, `path` evaluates to the name that was passed to Experiment(...).
ex.path

'Test Exp>>>>'