This notebook combines the MLP result tables (training, validation, and testing outputs) from every channel combination into one file per result type.

In [1]:
import pathlib

import numpy as np
import pandas as pd

In [2]:
# Cell type whose results are combined. The value is used as a sub-directory
# name when building both the input directories and the output directory.
cell_type = "PBMC"

In [3]:
# Root results directory that is scanned for per-model sub-directories.
# strict=True makes resolve() raise here if the directory does not exist;
# this must stay ahead of the mkdir(parents=True) call below, which would
# otherwise create the missing root and hide the problem.
input_data_dir = pathlib.Path("../../results/Multi_Class").resolve(strict=True)
# Destination for the combined tables, one sub-directory per cell type.
# Resolved (non-strict) so it can be compared against the resolved input
# directories when the output location is excluded from the inputs.
output_data_dir = pathlib.Path(
    f"../../results/Multi_Class/combined_channel_combinations/{cell_type}"
).resolve()
# Create the output directory (and any missing parents); a no-op if it
# already exists.
output_data_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Build one candidate path per sub-directory of the results root, pointing at
# that sub-directory's folder for the selected cell type. The combined-output
# location lives under the same root, so it is excluded to avoid reading the
# notebook's own output back in.
input_dirs = [
    model_dir / cell_type
    for model_dir in input_data_dir.iterdir()
    if model_dir.is_dir() and (model_dir / cell_type) != output_data_dir
]

In [5]:
# Result tables to collect. Each key is the stem of a parquet file that is
# read from every input directory; each value accumulates the corresponding
# dataframes, one per channel combination. The dict is rebuilt from scratch
# here so re-running this cell does not append duplicates.
dict_of_result_dfs = {
    "confusion_matrices": [],
    "PR_curves": [],
    "probabilities": [],
    "single_cell_predictions": [],
    "testing_single_cell_predictions": [],
    "training_metrics": [],
    "training_single_cell_predictions": [],
    "validation_single_cell_predictions": [],
}

# Leading text removed from a model directory name to obtain the
# channel-combination label.
model_dir_prefix = "MLP_"

for channel_combination_path in input_dirs:
    # Each input path is <model_dir>/<cell_type>, so the model directory is
    # the parent. Path.parent.name avoids splitting the string on "/", which
    # is not portable across path separators.
    model_dir_name = channel_combination_path.parent.name
    # Remove only the literal leading prefix. str.strip("MLP_") treats its
    # argument as a set of characters and strips them from BOTH ends, which
    # truncates any name ending in "M", "L", "P" or "_" (e.g. "...PM").
    if model_dir_name.startswith(model_dir_prefix):
        channel_combination_name = model_dir_name[len(model_dir_prefix):]
    else:
        channel_combination_name = model_dir_name
    for result_type in dict_of_result_dfs:
        result_df = pd.read_parquet(
            channel_combination_path / f"{result_type}.parquet"
        )
        # Tag every row with its origin so the tables remain separable after
        # they are concatenated.
        result_df["channel_combination"] = channel_combination_name
        dict_of_result_dfs[result_type].append(result_df)

In [6]:
# For every result type, stack the per-channel-combination tables into one
# dataframe and write it to the output directory under the same file name.
for result_type, result_dfs in dict_of_result_dfs.items():
    combined_df = pd.concat(result_dfs)
    output_path = output_data_dir / f"{result_type}.parquet"
    combined_df.to_parquet(output_path)