In [8]:
import sys
import itertools
import shutil

sys.path.append("..")
from FIA.FIA import *
from file_handling import *

In [9]:
start = "/mnt/d" #"D:" # /mnt/d
paths = {"start":   [f"{start}/runs/FIA/"],
         "sub":     [ "Com20_single", "Com20_val", "Com20_repl", '20220620_P0003_S6-S22_FIA-TOF',
                     '20220726_FIA-TOF_Ex0005_msAG24-msAg77', '20220817_P0008_S13-S96_FIA-TOF', '20221221_P0008_S97-S156_FIA-TOF', '20230307_P0017_S1-S15_FIA-TOF',
                     '20230505_P0017_S16-S135_FI-TOF', '20230515_P0008_Plasma_Randomized', '20230515_P0008_Serum_Randomized', '20230516_P0008_Gewebe_Randomized',
                     '20230629_P0018_S01-S12_FI-TOF', '20230822_P0031_S1-S66_FI-TOF', '20230927_P0031_Decay_DHA_FI-TOF', '20230927_P0031_S001-S066_FI-TOF_1to100_wdh',
                     '20231121_P0032_FI-TOF', '20240206_P0038_S01-S12_FI-TOF', '20240216_P0032_plate10_SXX-SXX_FI-TOF', '20240327_TestAfterRelocation_FI-TOF'],
         "subsub":  ["merged"] }
path_combs = [os.path.normpath(os.path.join(*path)) for path in itertools.product(*paths.values())]
outpath = os.path.normpath(f"{start}/runs/FIA/all_x_com8/merged")

In [13]:
def read_df(path, framework=pd):
    print(f"Loading: {path}")
    if path.endswith(".parquet"):
        df = framework.read_parquet( path )
    elif path.endswith(".tsv"):
        if framework == pl:
            df = framework.read_csv( path, separator="\t" )
        elif framework == pd:
            df = framework.read_csv( path, sep="\t", index_col="mz")
    elif path.endswith(".feather"):
        df = framework.read_feather( path )
    return df

def write_df(df, path, framework=pd):
    if framework == pl:
        if path.endswith(".parquet"):
            df.write_parquet( path )
        elif path.endswith(".tsv"):
            df.write_csv( path, separator="\t" )
    elif framework == pd:
        if path.endswith(".parquet"):
            df.to_parquet( path )
        elif path.endswith(".tsv"):
            df.to_csv( path, sep="\t" )
        elif path.endswith(".feather"):
            df.to_feather( path )

def concat_dfs(dfs, framework=pd):
    if framework == pl:
        dfs = framework.concat( dfs, how="align" )
    elif framework == pd:
        dfs = framework.concat( dfs, axis="columns")
    print(dfs.shape)
    return dfs

def combine_dc(path_combs, outpath, target_format="parquet", framework=pl, bins:int=2):
    if len(path_combs) == 1:
        if path_combs[0].endswith(target_format):
            shutil.copy(path_combs[0], os.path.join(outpath, f"data_matrix.{target_format}"))
        else:
            binned_df = read_df( path_combs[0], framework=framework)
            write_df(binned_df, os.path.join(outpath, f"data_matrix.{target_format}"), framework=framework)
    
    else:
        tmp_dir = os.path.join(outpath, "tmp")
        if not os.path.isdir(tmp_dir):
            os.mkdir(tmp_dir)

        tmp_paths = []
        for i, path in enumerate(tqdm(path_combs)):
            file = "" if os.path.isfile(path) else "data_matrix.tsv"
            path =  os.path.normpath(os.path.join(path, file))
            split = str(os.path.basename(path)).split(".")
            target_file = f'{".".join( split[:-1] )}_{i}.{target_format}'
            tmp_paths.append(os.path.join(tmp_dir, target_file))

        binned_dfs = []
        new_path_combs = []
        for i, path in enumerate(tqdm(path_combs)):
            check_previous_runs = [os.path.isfile(tmp_path) for tmp_path in tmp_paths[i:np.min([i + bins, len(tmp_paths) - 1])]]
            if True not in check_previous_runs:
                binned_df = read_df(path, framework=framework)
                binned_dfs.append( binned_df )
        
                if len(binned_dfs) >= bins:
                    binned_dfs = concat_dfs(binned_dfs, framework=framework)
                    write_df(binned_dfs, tmp_paths[i], framework=framework)
                    new_path_combs.append( tmp_paths[i] )
                    binned_dfs = []
                
        if binned_dfs and not os.path.isfile(tmp_paths[i]):
            binned_dfs = concat_dfs(binned_dfs, framework=framework)
            write_df(binned_dfs, tmp_paths[i], framework=framework)
            new_path_combs.append( tmp_paths[i] )
    
        print(new_path_combs)
        combine_dc(new_path_combs, outpath, target_format=target_format, framework=framework)

In [15]:
combine_dc([os.path.join(outpath, file) for file in os.listdir(outpath) if file.endswith(".tsv")], outpath, target_format="tsv", framework=pl, bins=2)

100%|██████████| 20/20 [00:00<00:00, 1044.74it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

Loading: /mnt/d/runs/FIA/all_x_com8/merged/data_matrix_12.tsv


 25%|██▌       | 5/20 [00:32<01:38,  6.59s/it]

Loading: /mnt/d/runs/FIA/all_x_com8/merged/data_matrix_13.tsv
(825000, 177)


 30%|███       | 6/20 [01:54<05:24, 23.18s/it]

Loading: /mnt/d/runs/FIA/all_x_com8/merged/data_matrix_14.tsv


 35%|███▌      | 7/20 [02:13<04:47, 22.14s/it]

Loading: /mnt/d/runs/FIA/all_x_com8/merged/data_matrix_15.tsv


In [7]:
for i, path in enumerate(tqdm(path_combs)):
    shutil.copy(os.path.join(path, "data_matrix.tsv"), os.path.join(outpath, f"data_matrix_{i}.tsv"))

100%|██████████| 20/20 [06:07<00:00, 18.37s/it]


In [None]:
binned_dfs = []
for path in path_combs:
    print(f"Path: {path}")
    binned_df = pl.read_csv(os.path.join(path, "data_matrix.tsv"), separator="\t")
    binned_dfs.append( binned_df )

Path: /mnt/d/runs/FIA/Com20_single/merged
Path: /mnt/d/runs/FIA/Com20_val/merged
Path: /mnt/d/runs/FIA/Com8_equal_conc_comb/merged
Path: /mnt/d/runs/FIA/Com8_grown_together/merged
Path: /mnt/d/runs/FIA/Com20_repl/merged
Path: /mnt/d/runs/FIA/20220620_P0003_S6-S22_FIA-TOF/merged
Path: /mnt/d/runs/FIA/20220726_FIA-TOF_Ex0005_msAG24-msAg77/merged
Path: /mnt/d/runs/FIA/20220817_P0008_S13-S96_FIA-TOF/merged
Path: /mnt/d/runs/FIA/20221221_P0008_S97-S156_FIA-TOF/merged
Path: /mnt/d/runs/FIA/20230307_P0017_S1-S15_FIA-TOF/merged
Path: /mnt/d/runs/FIA/20230505_P0017_S16-S135_FI-TOF/merged
Path: /mnt/d/runs/FIA/20230515_P0008_Plasma_Randomized/merged
Path: /mnt/d/runs/FIA/20230515_P0008_Serum_Randomized/merged
Path: /mnt/d/runs/FIA/20230516_P0008_Gewebe_Randomized/merged
Path: /mnt/d/runs/FIA/20230629_P0018_S01-S12_FI-TOF/merged
Path: /mnt/d/runs/FIA/20230822_P0031_S1-S66_FI-TOF/merged
