In [56]:
%load_ext autoreload
%autoreload 2

import robi
import numpy as np
import pandas as pd
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [53]:
def create_new_synthetic_dataset():
    """Draw random generation settings and build one synthetic dataset.

    All settings are sampled from NumPy's global random state, so call
    ``np.random.seed(...)`` beforehand if a reproducible draw is needed.

    Returns
    -------
    tuple
        ``(df, coef, params)``: ``df`` and ``coef`` are the two values
        returned by ``robi.utils.new_synthetic_dataset(**params)`` and
        ``params`` is the dict of keyword arguments that was passed to it.
    """
    # Keep whole tens only. Clamp to 10 because a raw draw below 10 would
    # otherwise round down to a dataset with zero samples.
    n_samples = max(10, int(np.random.randint(2, 1000) / 10) * 10)

    params = {
        'n_samples': n_samples,
        'censoring': np.round(np.random.uniform(0.1, 1.), 1),
        # Lower bound is 3 (not 2): randint(2, nb_features) below needs a
        # non-empty range and raises ValueError when nb_features == 2.
        'nb_features': np.random.randint(3, 50),
        'noise': np.round(np.random.uniform(0., 20.), 0),
    }
    # randint's upper bound is exclusive, so 2 <= n_informative < nb_features
    # and 1 <= effective_rank < n_informative.
    params['n_informative'] = np.random.randint(2, params['nb_features'])
    params['effective_rank'] = np.random.randint(1, params['n_informative'])

    df, coef = robi.utils.new_synthetic_dataset(**params)
    return df, coef, params


def do_trial(n_workers, n_uni_pval, n_fp_estimate, device, max_corr_values=(0.5, 1)):
    """Run feature selection on one fresh synthetic dataset and tally the outcome.

    A new dataset is generated with ``create_new_synthetic_dataset`` and
    ``robi.make_selection`` is run on it once per value in
    ``max_corr_values``. For every row of the selection result, the selected
    feature names are mapped back to their entries in ``coef`` to count how
    many have a zero coefficient and how many a non-zero one.

    Parameters
    ----------
    n_workers, n_uni_pval, n_fp_estimate, device
        Forwarded unchanged to ``robi.make_selection``.
    max_corr_values : iterable, optional
        Values passed as ``max_corr_cluster``, one selection run each.
        Defaults to ``(0.5, 1)``.

    Returns
    -------
    pandas.DataFrame
        The concatenated selection results (original index moved into a
        column, ``selected`` and ``target`` dropped) with the added columns
        ``actual_nfp``, ``actual_ntp``, one column per dataset parameter and
        ``max_corr``. The per-run row labels are not renumbered.
    """
    df, coef, params = create_new_synthetic_dataset()

    # Candidate features are named '0', '1', ... - one per coefficient.
    candidates = np.arange(len(coef)).astype('str')

    per_corr = []
    for max_corr in max_corr_values:
        # The second return value is not needed for the tally.
        res, _ = robi.make_selection(df,
                                     candidates=candidates,
                                     targets={
                                         'time': ('time', 'event'),
                                     },
                                     n_workers=n_workers,
                                     n_uni_pval=n_uni_pval,
                                     n_fp_estimate=n_fp_estimate,
                                     verbose=False,
                                     max_corr_cluster=max_corr,
                                     device=device)

        # Feature names are stringified positions, so casting them back to
        # int64 indexes straight into `coef`.
        selected_coefs = [coef[np.array(names).astype('int64')] for names in res['selected']]

        res = (
            res.assign(
                actual_nfp=[int(np.count_nonzero(c == 0)) for c in selected_coefs],
                actual_ntp=[int(np.count_nonzero(c != 0)) for c in selected_coefs],
            )
            .reset_index()
            .drop(columns=['selected', 'target'])
            # Tag every row with the settings that produced it.
            .assign(**params, max_corr=max_corr)
        )
        per_corr.append(res)
    return pd.concat(per_corr)

In [68]:
# Settings forwarded to every `do_trial` call.
n_workers=6
n_uni_pval=1e4
n_fp_estimate=1000
device = 'cuda'
n_trials = 2

# `n_FP` cells look like "0.1 (0.0-0.78)". Capture the number after the dash
# inside the parentheses; a numeric pattern (rather than splitting on '-')
# keeps signs and exponents such as "1e-05" intact. Cells that do not match
# become NaN.
n_fp_number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
n_fp_upper_re = r'\(\s*' + n_fp_number + r'\s*-\s*(' + n_fp_number + r')\s*\)'

all_res = []
for _ in tqdm(range(n_trials)):
    all_res.append(do_trial(n_workers, n_uni_pval, n_fp_estimate, device))

    # Rebuilt after each trial, so `dfr` reflects every trial completed so far
    # even if a later one raises.
    dfr = pd.concat(all_res).reset_index(drop=True)
    dfr['95p_n_fp_time'] = dfr['n_FP'].str.extract(n_fp_upper_re, expand=False).astype('float32')

  0%|          | 0/2 [00:30<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.68 GiB (GPU 0; 8.00 GiB total capacity; 13.45 GiB already allocated; 0 bytes free; 13.45 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [61]:
# Show the accumulated per-trial results.
# NOTE(review): this cell's execution count (61) is lower than the trial-loop
# cell's (68), so the table displayed here may come from an earlier run -
# re-run the notebook top to bottom to confirm.
dfr

Unnamed: 0,permissiveness,n_selected,n_FP,P_only_FP,actual_nfp,actual_ntp,n_samples,censoring,nb_features,noise,n_informative,effective_rank,max_corr
0,0.01,5,0.0 (0.0-0.0),0.0,0,5,160,0.8,22,1.0,9,2,0.5
1,0.02,5,0.0 (0.0-0.0),0.0,0,5,160,0.8,22,1.0,9,2,0.5
2,0.03,5,0.0 (0.0-0.0),0.0,0,5,160,0.8,22,1.0,9,2,0.5
3,0.04,5,0.1 (0.0-0.78),0.0,0,5,160,0.8,22,1.0,9,2,0.5
4,0.05,7,0.1 (0.0-0.78),0.0,1,6,160,0.8,22,1.0,9,2,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0.96,8,2.2 (0.0-7.78),0.1,5,3,280,0.6,8,1.0,3,1,1.0
396,0.97,8,2.2 (0.0-7.78),0.1,5,3,280,0.6,8,1.0,3,1,1.0
397,0.98,8,2.2 (0.0-7.78),0.1,5,3,280,0.6,8,1.0,3,1,1.0
398,0.99,8,2.2 (0.0-7.78),0.1,5,3,280,0.6,8,1.0,3,1,1.0


In [63]:
# `n_FP` cells look like "0.1 (0.0-0.78)". Capture the number after the dash
# inside the parentheses; a numeric pattern (rather than splitting on '-')
# keeps signs and exponents such as "1e-05" intact. Cells that do not match
# become NaN.
n_fp_number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
n_fp_upper_re = r'\(\s*' + n_fp_number + r'\s*-\s*(' + n_fp_number + r')\s*\)'
dfr['95p_n_fp_time'] = dfr['n_FP'].str.extract(n_fp_upper_re, expand=False).astype('float32')

In [66]:
# Share of rows whose parsed '95p_n_fp_time' value is strictly greater than
# the observed false-positive count.
# NOTE(review): rows where the two are equal (e.g. 0.0 vs 0) count as False;
# confirm that strict `>` rather than `>=` is intended.
dfr['95p_n_fp_time'].gt(dfr['actual_nfp']).mean()

0.375