In [1]:
# default_exp results_file_handling

In [2]:
#hide
%reload_ext autoreload
%autoreload 2

In [3]:
#export
import re
class ResultstableLocalizer():
    """Index the results tables of a results directory by condition-pair name.

    On construction the ``*.results.tsv`` files reported by ``ResultsFiles``
    for ``results_dir`` are collected and stored in ``condpairname2file``,
    keyed by the part of the file name that precedes ``.results.tsv``.
    """

    # Captures the last path component (after the final "/" or "\", or the
    # whole string when there is no separator) up to the literal
    # ".results.tsv" suffix. Raw string: the dots and separators are escaped
    # explicitly instead of relying on invalid string escapes.
    _CONDPAIR_PATTERN = re.compile(r"(?:^|[\\/])([^\\/]*)\.results\.tsv$")

    def __init__(self, results_dir):
        """Collect the results files below ``results_dir`` and index them.

        Args:
            results_dir: directory that is searched for ``*.results.tsv`` files.
        """
        self._resultsfilepaths = ResultsFiles(results_dir=results_dir).filepaths
        self.condpairname2file = {}
        self._load_condpairname2file()

    def _load_condpairname2file(self):
        """Fill ``condpairname2file`` with one entry per results file path."""
        for filepath in self._resultsfilepaths:
            condpairname = self._parse_condpairname_from_filepath(filepath)
            self.condpairname2file[condpairname] = filepath

    @staticmethod
    def _parse_condpairname_from_filepath(file):
        """Return the condition-pair name encoded in a results file path.

        The name is the file name without its ``.results.tsv`` suffix. The
        directory part of the path is ignored, so the results directory may
        have any name and may use ``/`` or ``\\`` as separator.

        Args:
            file: path of a results file ending in ``.results.tsv``.

        Returns:
            The file name with the ``.results.tsv`` suffix removed.

        Raises:
            ValueError: if ``file`` does not end in ``.results.tsv``.
        """
        matched = ResultstableLocalizer._CONDPAIR_PATTERN.search(file)
        if matched is None:
            raise ValueError(
                f"cannot derive a condition pair name from {file!r}: "
                "expected a path ending in '.results.tsv'"
            )
        return matched.group(1)


import glob
class ResultsFiles():
    """List the ``*.results.tsv`` files located directly inside a directory.

    The matching paths are determined once, at construction time, and stored
    in ``filepaths`` in sorted order.
    """

    def __init__(self, results_dir):
        """Search ``results_dir`` and store the matching paths in ``filepaths``.

        Args:
            results_dir: directory that is searched (non-recursively).
        """
        self._results_dir = results_dir
        self.filepaths = self.__get_result_filepaths()

    def __get_result_filepaths(self):
        """Return the sorted paths of all ``*.results.tsv`` files in the directory."""
        # Escape the directory so that characters such as "[", "*" or "?" in its
        # name are taken literally; only the file-name part is a wildcard.
        pattern = f'{glob.escape(str(self._results_dir))}/*.results.tsv'
        # glob returns matches in arbitrary order; sort for reproducible results.
        return sorted(glob.glob(pattern))
    


### Test results table localization

In [4]:
#hide
import pandas as pd
import os
import shutil
import random
import numpy as np


# Directory that the simulated results tables are written to and read back from in the tests of this notebook.
RESULTS_DIR_SIMULATED = "test_data/unit_tests/results_file_handling/toy_data/results"


class ResultsDirSimulator():
    """Populate a freshly created directory with simulated results tables.

    One ``<condpairname>.results.tsv`` file is written for every entry of
    ``resultswriterconfig_vec``. All of the work is done during construction.
    """

    def __init__(self, results_dir, resultswriterconfig_vec):
        self._results_dir = results_dir
        self._resultswriterconfig_vec = resultswriterconfig_vec
        self._create_toy_results_dirs()
        self._write_out_simulated_dataframes()

    def _create_toy_results_dirs(self):
        """Make sure the results directory exists and is empty."""
        self._create_or_replace_folder(self._results_dir)

    @staticmethod
    def _create_or_replace_folder(folder):
        """Create ``folder``; an already existing one is deleted with its contents first."""
        already_present = os.path.exists(folder)
        if already_present:
            shutil.rmtree(folder)
        os.makedirs(folder)

    def _write_out_simulated_dataframes(self):
        """Simulate one table per config entry and write it to the results directory."""
        for position, writer_config in enumerate(self._resultswriterconfig_vec):
            # The table name is made up from the position of the config in the vector.
            table_name = f"cond{position}_VS_pair{position}"
            # Record on the config which file names were produced with it.
            writer_config.filenames_w_config.append(table_name)
            table_simulator = ResultsTableSimulator(writer_config)
            self._save_dataframe(
                df=table_simulator.protein_df,
                results_dir=self._results_dir,
                name=table_name,
            )

    @staticmethod
    def _save_dataframe(df, results_dir, name):
        """Write ``df`` tab-separated, without index, to ``<results_dir>/<name>.results.tsv``."""
        target_path = f"{results_dir}/{name}.results.tsv"
        df.to_csv(target_path, sep="\t", index=None)


class ResultsTableSimulator():
    """Build a simulated protein results table from a writer config.

    The config is read for ``length`` (number of rows) and
    ``fc_for_all_proteins`` (fixed fold change, or ``None`` for random ones).
    The finished table is available as ``protein_df``.
    """

    def __init__(self, resultswriterconfig):
        self.protein_df = None
        self._resultswriterconfig = resultswriterconfig
        self._simulate_dataframes_as_specified()

    def _simulate_dataframes_as_specified(self):
        """Create the simulated table and store it in ``protein_df``."""
        self.protein_df = self._simulate_protein_df()

    def _simulate_protein_df(self):
        """Return a DataFrame with the columns 'protein', 'log2fc' and 'fdr'."""
        n_rows = self._resultswriterconfig.length
        # Values are generated in column order: names, fold changes, then fdr.
        columns = {
            'protein': [f'name{i}' for i in range(n_rows)],
            'log2fc': self._get_fcs(),
            # 10 to the power of minus a uniform draw from [0, 10)
            'fdr': [10**(-np.random.uniform(0, 10)) for _ in range(n_rows)],
        }
        return pd.DataFrame(data=columns)

    def _get_fcs(self):
        """Return one fold change per row: the configured constant, or uniform draws from [-2, 2)."""
        config = self._resultswriterconfig
        fixed_fc = config.fc_for_all_proteins
        if fixed_fc is None:
            return [np.random.uniform(low=-2, high=2) for _ in range(config.length)]
        return [fixed_fc] * config.length

class ResultsWriterConfig():
    """Settings for one simulated results table.

    Attributes set here:
        length: number of rows the simulated table gets.
        fc_for_all_proteins: fold change used for every row, or ``None``.
        filenames_w_config: list, initially empty, to which names of tables
            produced with this config are appended.
    """

    def __init__(self, length, fc_for_all_proteins = None):
        self.length, self.fc_for_all_proteins = length, fc_for_all_proteins
        # A new list per instance, so configs never share their name records.
        self.filenames_w_config = list()




In [5]:
#hide

def test_that_expected_numbers_of_results_files_are_found():
    """Simulate 50 results files and check that the localizer indexes 50 distinct names."""
    expected_count = 50
    simulate_results_files(num_results_files=expected_count)
    found_names = ResultstableLocalizer(RESULTS_DIR_SIMULATED).condpairname2file.keys()
    assert len(set(found_names)) == expected_count
    print('performed tests')

def simulate_results_files(num_results_files):
    """Write ``num_results_files`` simulated tables of 50 rows each into RESULTS_DIR_SIMULATED."""
    # One config instance is reused for every table, so it is the same object
    # at each position of the vector.
    shared_config = ResultsWriterConfig(length = 50)
    ResultsDirSimulator(RESULTS_DIR_SIMULATED, [shared_config] * num_results_files)

# Run the test right away when this cell is executed.
test_that_expected_numbers_of_results_files_are_found()

performed tests
