In [9]:
#hide
#default_exp utils.organize_experiments
from nbdev.showdoc import *
from dsblocks.utils.nbdev_utils import nbdev_setup, TestRunner

# Notebook-level setup provided by dsblocks (presumably configures the nbdev
# environment for this notebook — TODO confirm against dsblocks.utils.nbdev_utils).
nbdev_setup ()
# Runner used at the end of the notebook to execute the tests; the test below is
# run with tag 'dummy', which matches the single target declared here.
tst = TestRunner (targets=['dummy'])

# Organize experiments

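> Routines for organizing experiment folders: joining two sets of experiments, removing default parameter values from the experiment data, and removing experiments.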

In [10]:
#export
import pandas as pd
import numpy as np
import shutil

from hpsearch.utils import experiment_utils

In [11]:
#for tests
import os
import pandas as pd
import pytest

from hpsearch.examples.dummy_experiment_manager import generate_data
from hpsearch.config.hpconfig import get_path_experiments

## join_experiments

In [12]:
#export
def join_experiments (path_source, path_destination, key_score=None):
    """
    Merge the experiments stored under `path_source` into `path_destination`.

    Both folders are read as holding an `experiments_data.pk` pickled DataFrame and
    one `experiments/NNNNN` sub-folder per DataFrame row. Each experiment folder
    contains a `parameters.pk` file and integer-named sub-folders (called class ids
    in the code; presumably one per run — TODO confirm).

    For every source experiment, the matching destination row is looked up with
    `experiment_utils.find_rows_with_parameters_dict`:

    - If that call reports that the destination DataFrame changed (presumably
      because a new row was created — TODO confirm), the whole source experiment
      folder is moved to the destination and the source row is copied over.
    - Otherwise the integer-named sub-folders are merged. Ids that only exist in
      the source are moved; ids that exist on both sides are moved only if their
      scores differ. Moved sub-folders get fresh consecutive ids after the largest
      destination id, and their score columns (`<id>_<name>`) are copied under the
      new id.

    Parameters
    ----------
    path_source : str
        Root folder of the experiments to be moved.
    path_destination : str
        Root folder that receives the experiments.
    key_score : str or None
        If given, ids present on both sides are compared only through the column
        `<id>_<key_score>`. If None, every non-missing source score column of that
        id is compared.

    Notes
    -----
    The destination DataFrame is written to `experiments_data.csv` and
    `experiments_data.pk` after each source experiment has been processed.
    """
    # Imported locally: the exported module does not import these at file level.
    import os
    import pickle

    experiment_data_source = pd.read_pickle ('%s/experiments_data.pk' %path_source)
    experiment_data_destination = pd.read_pickle ('%s/experiments_data.pk' %path_destination)
    experiment_data_source, _ = remove_defaults_from_experiment_data (experiment_data_source)
    experiment_data_destination, _ = remove_defaults_from_experiment_data (experiment_data_destination)

    for experiment_number_source in range(experiment_data_source.shape[0]):
        path_experiment_source = '%s/experiments/%05d' %(path_source, experiment_number_source)
        # NOTE(review): pickle.load can execute arbitrary code; only join folders you trust.
        with open('%s/parameters.pk' %path_experiment_source, 'rb') as parameters_file:
            parameters_source, _ = pickle.load(parameters_file)
        experiment_number_destination, changed_dataframe, _ = experiment_utils.find_rows_with_parameters_dict (experiment_data_destination, parameters_source)
        path_experiment_destination = '%s/experiments/%05d' %(path_destination, experiment_number_destination)
        if changed_dataframe:
            # move folders
            os.rename (path_experiment_source, path_experiment_destination)
            # copy results to dataframe
            missing_cols = [col for col in experiment_data_source.columns if col not in experiment_data_destination.columns]
            for column in missing_cols:
                experiment_data_destination[column] = None
            experiment_data_destination.loc[experiment_number_destination] = experiment_data_source.loc[experiment_number_source]
        else:
            class_ids_source = [int(x) for x in os.listdir(path_experiment_source) if os.path.isdir('%s/%s' %(path_experiment_source, x))]
            class_ids_destination = [int(x) for x in os.listdir(path_experiment_destination) if os.path.isdir('%s/%s' %(path_experiment_destination, x))]
            # default=-1 so that numbering starts at 0 when the destination has no sub-folder yet
            last_id_destination = max(class_ids_destination, default=-1)

            class_ids_both = [x for x in class_ids_source if x in class_ids_destination]
            # ids that exist only in the source are always moved
            class_ids_to_move = [x for x in class_ids_source if x not in class_ids_both]
            # ids that exist on both sides are moved only if their scores differ; the
            # selection is built as a new list instead of deleting while iterating
            for class_id_source in class_ids_both:
                if key_score is not None:
                    scores_to_compare = ['%d_%s' %(class_id_source, key_score)]
                else:
                    scores_to_compare = [x for x in experiment_data_source.columns if x.startswith('%d_' %class_id_source)]
                    # pd.isna also copes with None / object-typed cells
                    scores_to_compare = [x for x in scores_to_compare if not pd.isna(experiment_data_source.loc[experiment_number_source, x])]
                is_new = any(
                    experiment_data_source.loc[experiment_number_source, scores_name] != experiment_data_destination.loc[experiment_number_destination, scores_name]
                    for scores_name in scores_to_compare
                )
                if is_new:
                    class_ids_to_move.append(class_id_source)

            # one fresh id per sub-folder to move, right after the largest destination id
            first_new_id = last_id_destination + 1
            new_ids_destination = range(first_new_id, first_new_id + len(class_ids_to_move))
            for (new_id_destination, class_id_source) in zip(new_ids_destination, class_ids_to_move):
                # move folders
                os.rename ('%s/%d' %(path_experiment_source, class_id_source), '%s/%d' %(path_experiment_destination, new_id_destination))
                # copy results to dataframe, renaming '<old id>_<name>' to '<new id>_<name>'
                scores_name_source = [x for x in experiment_data_source.columns if x.startswith('%d_' %class_id_source)]
                scores_name_destination = ['%d_%s' %(new_id_destination, x[len('%d_' %class_id_source):]) for x in scores_name_source]
                for score_name_source, score_name_destination in zip(scores_name_source, scores_name_destination):
                    experiment_data_destination.loc[experiment_number_destination, score_name_destination] = experiment_data_source.loc[experiment_number_source, score_name_source]

        experiment_data_destination.to_csv ('%s/experiments_data.csv' %path_destination)
        experiment_data_destination.to_pickle ('%s/experiments_data.pk' %path_destination)

## remove_defaults_from_experiment_data

In [13]:
#export
def remove_defaults_from_experiment_data (experiment_data):
    """
    Blank out parameter values that are equal to their default value.

    For every row, the non-missing parameter columns (as listed by
    `experiment_utils.get_parameters_columns`) are passed as a dict to
    `get_default_parameters`. Each parameter whose stored value equals the
    returned default is set to None in that row.

    If blanking the defaults makes two rows have identical parameter columns,
    the change is discarded and the data is returned as it was received.

    The DataFrame passed in is never modified; the cleaned data is returned as
    a new DataFrame.

    Parameters
    ----------
    experiment_data : pandas.DataFrame
        Experiment table; rows are addressed as 0 .. n-1 through `.loc`.

    Returns
    -------
    experiment_data : pandas.DataFrame
        The cleaned copy when defaults were removed, otherwise the input.
    changed_df : bool
        True only if some default was removed and the change was kept.
    """
    from hpsearch.config.hpconfig import get_default_parameters

    # Work on a copy so the caller's DataFrame is left untouched, including on the
    # rollback path below.
    experiment_data_original = experiment_data
    experiment_data = experiment_data_original.copy()
    parameters_names = experiment_utils.get_parameters_columns (experiment_data)
    changed_df = False
    for experiment_number in range(experiment_data.shape[0]):
        # only parameters that actually have a value in this row are considered
        good_params = ~(experiment_data.loc[experiment_number, parameters_names].isna()).values
        parameters_names_i = np.array(parameters_names)[good_params]
        parameters_names_i = parameters_names_i.tolist()
        parameters = experiment_data.loc[experiment_number, parameters_names_i].to_dict()

        defaults = get_default_parameters(parameters)
        default_names = [default_name for default_name in defaults.keys() if default_name in parameters_names_i]

        for default_name in default_names:
            has_default = experiment_data.loc[experiment_number, default_name] == defaults[default_name]
            if has_default:
                print ('found experiment with default in experiment_number {}, parameter {}, values: {}'.format(experiment_number, default_name, experiment_data.loc[experiment_number, default_name]))
                changed_df = True
                experiment_data.loc[experiment_number, default_name] = None

    # check if there are repeated experiments
    if changed_df:
        if experiment_data[parameters_names].duplicated().any():
            print ('duplicated experiments: {}'.format(experiment_data[parameters_names].duplicated()))
            changed_df = False

    if not changed_df:
        # nothing was removed, or the removal was rolled back: hand back the input as received
        experiment_data = experiment_data_original

    return experiment_data, changed_df

## remove_experiments

In [14]:
#export
def remove_experiments (experiments=[], root_path=None, root_folder=None):
    """
    Remove experiments and renumber the remaining ones consecutively from 0.

    The rows are dropped from the pickled experiment table, the corresponding
    experiment folders are deleted, the remaining folders are moved so that
    their numbers are 0 .. n-1, the table index is reset accordingly, and the
    table is written back as both csv and pickle.

    Parameters
    ----------
    experiments : int or collection of int
        Experiment number(s) to remove. A single number, or a list / tuple /
        set / range / numpy array / pandas Index or Series of numbers.
        Repeated numbers are ignored.
    root_path : str or None
        Folder holding `experiments_data.pk` / `experiments_data.csv`. When
        None it is obtained from `get_path_experiments(folder=root_folder)`.
    root_folder : str or None
        Passed through to `get_path_experiments` and `get_path_experiment`.

    Raises
    ------
    KeyError
        Raised by `DataFrame.drop` when a requested experiment is not in the
        table; this happens before any folder is deleted.
    """
    from hpsearch.config.hpconfig import get_path_experiment, get_path_experiments

    # Accept any common collection, not only `list`; anything else is a single experiment.
    if isinstance(experiments, (list, tuple, set, frozenset, range, np.ndarray, pd.Index, pd.Series)):
        experiments = list(experiments)
    else:
        experiments = [experiments]
    # drop repeated numbers (keeping order) so that no folder is deleted twice
    experiments = list(dict.fromkeys(experiments))
    if root_path is None:
        root_path = get_path_experiments(folder = root_folder)

    # 1. remove experiments from csv file
    path_csv = f'{root_path}/experiments_data.csv'
    # built explicitly: str.replace('csv', 'pk') would also rewrite a 'csv' inside root_path
    path_pickle = f'{root_path}/experiments_data.pk'
    experiment_data = pd.read_pickle (path_pickle)
    experiment_data = experiment_data.drop (index=experiments)

    # 2. remove experiments folders
    for experiment in experiments:
        path_experiment = get_path_experiment (experiment, root_path=root_path, root_folder=root_folder)
        shutil.rmtree(path_experiment)

    # 3. move experiment folders
    # NOTE(review): assumes the remaining index is in increasing order, so each
    # target number is already free when its folder is moved — confirm.
    for new_number, original_number in enumerate(experiment_data.index):
        path_new_experiment = get_path_experiment (new_number, root_path=root_path, root_folder=root_folder)
        path_original_experiment = get_path_experiment (original_number, root_path=root_path, root_folder=root_folder)
        if path_new_experiment != path_original_experiment:
            shutil.move (path_original_experiment, path_new_experiment)

    # 4. move experiment indexes
    experiment_data.index = range(len(experiment_data.index))

    # 5. save experiment data
    experiment_data.to_csv (path_csv)
    experiment_data.to_pickle (path_pickle)

### Usage

In [15]:
#exports tests.utils.test_organize_experiments
def test_remove_experiments ():
    """Check that `remove_experiments` deletes experiments 3 and 7 and renumbers the rest."""
    em = generate_data ('remove_experiments')

    # cleanup runs in `finally` so that a failing assertion does not leave the
    # generated experiments behind
    try:
        path_experiments = get_path_experiments ()
        print (f'path_experiments: {path_experiments}\n')
        print (f'experiments content: {os.listdir(path_experiments)}\n')
        print (f'experiments inside: {os.listdir(f"{path_experiments}/experiments")}\n')

        experiments_data = pd.read_pickle (f'{path_experiments}/experiments_data.pk')
        # kept to compare rows after the removal (the name below is rebound, not mutated)
        old_experiments_data = experiments_data
        print (f'csv data index {experiments_data.index}\n')
        print ('csv content:')

        remove_experiments (experiments=[3,7])

        # seven folders must remain and, since they are renumbered 00000..00006,
        # the two highest numbers (00007 and 00008) must be gone
        experiment_folders = os.listdir(f"{path_experiments}/experiments")
        print (f'experiment folders after removal: {experiment_folders}\n')
        assert len(experiment_folders)==7 and ('00007' not in experiment_folders) and ('00008' not in experiment_folders)

        # the dataframe index must have been renumbered to 0..6
        experiments_data = pd.read_pickle (f'{path_experiments}/experiments_data.pk')
        print (f'csv data index after removal: {experiments_data.index}\n')
        assert (experiments_data.index==range(7)).all()

        # rows after a removed experiment shift down: new 3 is old 4, new 4 is old 5, new 6 is old 8
        assert (experiments_data.loc[3] == old_experiments_data.loc[4]).all() and (experiments_data.loc[6] == old_experiments_data.loc[8]).all()
        assert (experiments_data.loc[4] == old_experiments_data.loc[5]).all()

        print ('csv content:')
        print (experiments_data)
    finally:
        em.remove_previous_experiments ()

In [16]:
# Run the test through the notebook's TestRunner; the tag 'dummy' matches the
# target the runner was created with.
tst.run (test_remove_experiments, tag='dummy')

running test_remove_experiments
path_experiments: test_remove_experiments

experiments content: ['other_parameters.csv', 'git_hash.json', 'experiments_data.csv', 'managers', 'current_experiment_number.pkl', 'experiments', 'parameters.pk', 'parameters.txt', 'experiments_data.pk', 'summary.txt']

experiments inside: ['00000', '00002', '00004', '00007', '00006', '00008', '00003', '00005', '00001']

csv data index RangeIndex(start=0, stop=9, step=1)

csv content:
experiment folders after removal: ['00000', '00002', '00004', '00006', '00003', '00005', '00001']

csv data index after removal: RangeIndex(start=0, stop=7, step=1)

csv content:
   epochs  offset  rate  noise  0_validation_accuracy  0_test_accuracy  \
0     5.0     0.1  0.03    0.1               0.203053         0.404256   
1     5.0     0.3  0.03    0.1               0.483126         0.647555   
2     5.0     0.6  0.03    0.1               0.775755         0.842555   
3    15.0     0.3  0.03    0.1               0.812412        