In [1]:
import pathlib

import pandas as pd
import umap

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

    # Get the current working directory
cwd = pathlib.Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd

else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

In [2]:
# paths to data
data_dict = {
    "sc": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/sc_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(f"{root_dir}/5.EDA/results/sc_umap.parquet").resolve(),
    },
    "sc_fs": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/sc_fs_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(
            f"{root_dir}/5.EDA/results/sc_fs_umap.parquet"
        ).resolve(),
    },
    "sc_agg": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/sc_agg_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(
            f"{root_dir}/5.EDA/results/sc_agg_umap.parquet"
        ).resolve(),
    },
    "organoid": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/organoid_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(
            f"{root_dir}/5.EDA/results/organoid_umap.parquet"
        ).resolve(),
    },
    "organoid_fs": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/organoid_fs_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(
            f"{root_dir}/5.EDA/results/organoid_fs_umap.parquet"
        ).resolve(),
    },
    "organoid_agg": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/organoid_agg_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(
            f"{root_dir}/5.EDA/results/organoid_agg_umap.parquet"
        ).resolve(),
    },
    "sc_consensus": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/sc_consensus_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(
            f"{root_dir}/5.EDA/results/sc_consensus_umap.parquet"
        ).resolve(),
    },
    "organoid_consensus": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/organoid_consensus_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(
            f"{root_dir}/5.EDA/results/organoid_consensus_umap.parquet"
        ).resolve(),
    },
}

data_dict["organoid"]["output"].parent.mkdir(parents=True, exist_ok=True)

In [3]:
metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Well",
    "single_cell_count",
    "parent_organoid",
    "Treatment",
    "Target",
    "Function",
    "Class",
    "Therapeutic Categories",
]

In [4]:
umap_object = umap.UMAP(
    n_neighbors=15, min_dist=0.1, metric="euclidean", random_state=0
)

for dataset, paths in data_dict.items():
    # Load the data
    df = pd.read_parquet(data_dict[dataset]["input"])

    metadata_df = df.copy()
    metadata_subset = []
    for col in metadata_columns:
        if col in df.columns:
            metadata_subset.append(col)

    metadata_df = df[metadata_subset]
    features_df = df.drop(columns=metadata_columns, errors="ignore")
    print(features_df.shape)
    # remove NaN values
    features_df = features_df.dropna(axis=0, how="any")
    print(f"Data shape after dropping NaN values: {features_df.shape}")
    # Extract features and apply UMAP

    umap_embedding = umap_object.fit_transform(features_df)

    # Create a DataFrame with UMAP results
    umap_df = pd.DataFrame(umap_embedding, columns=["UMAP1", "UMAP2"])
    umap_df = pd.concat([metadata_df.reset_index(drop=True), umap_df], axis=1)
    # Save the UMAP results
    umap_df.to_parquet(data_dict[dataset]["output"], index=False)

(11232, 1922)
Data shape after dropping NaN values: (2391, 1922)


  warn(


(11232, 314)
Data shape after dropping NaN values: (10703, 314)




(395, 314)
Data shape after dropping NaN values: (394, 314)




(1481, 640)
Data shape after dropping NaN values: (508, 640)




(1481, 167)
Data shape after dropping NaN values: (1427, 167)




(392, 167)
Data shape after dropping NaN values: (390, 167)




(171, 314)
Data shape after dropping NaN values: (171, 314)
(172, 167)
Data shape after dropping NaN values: (172, 167)




Individual umaps

In [5]:
patients = pd.read_csv(
    pathlib.Path(f"{root_dir}/data/patient_IDs.txt").resolve(strict=True),
    header=None,
    names=["patient"],
)["patient"].to_list()

In [6]:
file_dict = {}
for patient in patients:
    file_dict[patient] = {
        "fs": {
            "sc": {
                "input": pathlib.Path(
                    f"{root_dir}/data/{patient}/image_based_profiles/4.feature_selected_profiles/sc_fs.parquet"
                ).resolve(strict=True),
                "output": pathlib.Path(
                    f"{root_dir}/5.EDA/results/patient_results/{patient}_sc_fs_umap.parquet"
                ).resolve(),
            },
            "organoid": {
                "input": pathlib.Path(
                    f"{root_dir}/data/{patient}/image_based_profiles/4.feature_selected_profiles/organoid_fs.parquet"
                ).resolve(strict=True),
                "output": pathlib.Path(
                    f"{root_dir}/5.EDA/results/patient_results/{patient}_organoid_fs_umap.parquet"
                ).resolve(),
            },
        },
        "agg": {
            "sc_parent_organoid_level": {
                "input": pathlib.Path(
                    f"{root_dir}/data/{patient}/image_based_profiles/5.aggregated_profiles/sc_agg_parent_organoid_level.parquet"
                ).resolve(strict=True),
                "output": pathlib.Path(
                    f"{root_dir}/5.EDA/results/patient_results/{patient}_sc_agg_parent_organoid_level_umap.parquet"
                ).resolve(),
            },
            "sc_well_level": {
                "input": pathlib.Path(
                    f"{root_dir}/data/{patient}/image_based_profiles/5.aggregated_profiles/sc_agg_well_level.parquet"
                ).resolve(strict=True),
                "output": pathlib.Path(
                    f"{root_dir}/5.EDA/results/patient_results/{patient}_sc_agg_well_level_umap.parquet"
                ).resolve(),
            },
            "sc_consensus": {
                "input": pathlib.Path(
                    f"{root_dir}/data/{patient}/image_based_profiles/5.aggregated_profiles/sc_consensus.parquet"
                ).resolve(strict=True),
                "output": pathlib.Path(
                    f"{root_dir}/5.EDA/results/patient_results/{patient}_sc_consensus_umap.parquet"
                ).resolve(),
            },
            "organoid_well_level": {
                "input": pathlib.Path(
                    f"{root_dir}/data/{patient}/image_based_profiles/5.aggregated_profiles/organoid_agg_well_level.parquet"
                ).resolve(strict=True),
                "output": pathlib.Path(
                    f"{root_dir}/5.EDA/results/patient_results/{patient}_organoid_agg_well_level_umap.parquet"
                ).resolve(),
            },
            "organoid_consensus": {
                "input": pathlib.Path(
                    f"{root_dir}/data/{patient}/image_based_profiles/5.aggregated_profiles/organoid_consensus.parquet"
                ).resolve(strict=True),
                "output": pathlib.Path(
                    f"{root_dir}/5.EDA/results/patient_results/{patient}_organoid_consensus_umap.parquet"
                ).resolve(),
            },
        },
    }

In [7]:
for patient in file_dict.keys():
    for level in file_dict[patient].keys():
        for profile_type in file_dict[patient][level].keys():
            for dataset, paths in file_dict[patient][level][profile_type].items():
                print(f"Processing {patient} - {level} - {profile_type} - {dataset}")
                df = pd.read_parquet(file_dict[patient][level][profile_type]["input"])

                metadata_df = df.copy()
                metadata_subset = []
                for col in metadata_columns:
                    if col in df.columns:
                        metadata_subset.append(col)

                metadata_df = df[metadata_subset]
                features_df = df.drop(columns=metadata_columns, errors="ignore")
                print(features_df.shape)
                # remove NaN values
                features_df = features_df.dropna(axis=0, how="any")
                print(f"Data shape after dropping NaN values: {features_df.shape}")
                # Extract features and apply UMAP

                umap_embedding = umap_object.fit_transform(features_df)

                # Create a DataFrame with UMAP results
                umap_df = pd.DataFrame(umap_embedding, columns=["UMAP1", "UMAP2"])
                umap_df = pd.concat(
                    [metadata_df.reset_index(drop=True), umap_df], axis=1
                )
                # Save the UMAP results
                file_dict[patient][level][profile_type]["output"].parent.mkdir(
                    parents=True, exist_ok=True
                )
                umap_df.to_parquet(
                    file_dict[patient][level][profile_type]["output"], index=False
                )

Processing NF0014 - fs - sc - input
(1401, 200)
Data shape after dropping NaN values: (1336, 200)




Processing NF0014 - fs - sc - output
(1401, 200)
Data shape after dropping NaN values: (1336, 200)




Processing NF0014 - fs - organoid - input
(101, 143)
Data shape after dropping NaN values: (98, 143)
Processing NF0014 - fs - organoid - output
(101, 143)
Data shape after dropping NaN values: (98, 143)
Processing NF0014 - agg - sc_parent_organoid_level - input
(117, 200)
Data shape after dropping NaN values: (116, 200)




Processing NF0014 - agg - sc_parent_organoid_level - output
(117, 200)
Data shape after dropping NaN values: (116, 200)
Processing NF0014 - agg - sc_well_level - input
(49, 200)
Data shape after dropping NaN values: (49, 200)
Processing NF0014 - agg - sc_well_level - output
(49, 200)
Data shape after dropping NaN values: (49, 200)




Processing NF0014 - agg - sc_consensus - input
(17, 200)
Data shape after dropping NaN values: (17, 200)
Processing NF0014 - agg - sc_consensus - output
(17, 200)
Data shape after dropping NaN values: (17, 200)
Processing NF0014 - agg - organoid_well_level - input
(49, 143)
Data shape after dropping NaN values: (49, 143)
Processing NF0014 - agg - organoid_well_level - output
(49, 143)
Data shape after dropping NaN values: (49, 143)
Processing NF0014 - agg - organoid_consensus - input
(17, 143)
Data shape after dropping NaN values: (17, 143)
Processing NF0014 - agg - organoid_consensus - output




(17, 143)
Data shape after dropping NaN values: (17, 143)
Processing NF0016 - fs - sc - input
(376, 240)
Data shape after dropping NaN values: (362, 240)




Processing NF0016 - fs - sc - output
(376, 240)
Data shape after dropping NaN values: (362, 240)




Processing NF0016 - fs - organoid - input
(85, 139)
Data shape after dropping NaN values: (80, 139)
Processing NF0016 - fs - organoid - output
(85, 139)
Data shape after dropping NaN values: (80, 139)
Processing NF0016 - agg - sc_parent_organoid_level - input
(79, 240)
Data shape after dropping NaN values: (77, 240)




Processing NF0016 - agg - sc_parent_organoid_level - output
(79, 240)
Data shape after dropping NaN values: (77, 240)
Processing NF0016 - agg - sc_well_level - input
(43, 240)
Data shape after dropping NaN values: (42, 240)
Processing NF0016 - agg - sc_well_level - output
(43, 240)
Data shape after dropping NaN values: (42, 240)
Processing NF0016 - agg - sc_consensus - input
(16, 240)
Data shape after dropping NaN values: (16, 240)




Processing NF0016 - agg - sc_consensus - output
(16, 240)
Data shape after dropping NaN values: (16, 240)
Processing NF0016 - agg - organoid_well_level - input
(43, 139)
Data shape after dropping NaN values: (42, 139)
Processing NF0016 - agg - organoid_well_level - output
(43, 139)
Data shape after dropping NaN values: (42, 139)
Processing NF0016 - agg - organoid_consensus - input
(16, 139)
Data shape after dropping NaN values: (16, 139)
Processing NF0016 - agg - organoid_consensus - output
(16, 139)
Data shape after dropping NaN values: (16, 139)
Processing NF0018 - fs - sc - input
(624, 217)
Data shape after dropping NaN values: (598, 217)




Processing NF0018 - fs - sc - output
(624, 217)
Data shape after dropping NaN values: (598, 217)




Processing NF0018 - fs - organoid - input
(128, 148)
Data shape after dropping NaN values: (124, 148)
Processing NF0018 - fs - organoid - output
(128, 148)
Data shape after dropping NaN values: (124, 148)




Processing NF0018 - agg - sc_parent_organoid_level - input
(108, 217)
Data shape after dropping NaN values: (108, 217)
Processing NF0018 - agg - sc_parent_organoid_level - output
(108, 217)
Data shape after dropping NaN values: (108, 217)




Processing NF0018 - agg - sc_well_level - input
(44, 217)
Data shape after dropping NaN values: (44, 217)
Processing NF0018 - agg - sc_well_level - output
(44, 217)
Data shape after dropping NaN values: (44, 217)
Processing NF0018 - agg - sc_consensus - input
(16, 217)
Data shape after dropping NaN values: (16, 217)
Processing NF0018 - agg - sc_consensus - output
(16, 217)
Data shape after dropping NaN values: (16, 217)
Processing NF0018 - agg - organoid_well_level - input
(47, 148)
Data shape after dropping NaN values: (47, 148)




Processing NF0018 - agg - organoid_well_level - output
(47, 148)
Data shape after dropping NaN values: (47, 148)
Processing NF0018 - agg - organoid_consensus - input
(17, 148)
Data shape after dropping NaN values: (17, 148)
Processing NF0018 - agg - organoid_consensus - output
(17, 148)
Data shape after dropping NaN values: (17, 148)
Processing NF0021 - fs - sc - input
(2716, 234)
Data shape after dropping NaN values: (2643, 234)




Processing NF0021 - fs - sc - output
(2716, 234)
Data shape after dropping NaN values: (2643, 234)




Processing NF0021 - fs - organoid - input
(270, 147)
Data shape after dropping NaN values: (267, 147)




Processing NF0021 - fs - organoid - output
(270, 147)
Data shape after dropping NaN values: (267, 147)




Processing NF0021 - agg - sc_parent_organoid_level - input
(263, 234)
Data shape after dropping NaN values: (262, 234)




Processing NF0021 - agg - sc_parent_organoid_level - output
(263, 234)
Data shape after dropping NaN values: (262, 234)




Processing NF0021 - agg - sc_well_level - input
(50, 234)
Data shape after dropping NaN values: (50, 234)
Processing NF0021 - agg - sc_well_level - output
(50, 234)
Data shape after dropping NaN values: (50, 234)
Processing NF0021 - agg - sc_consensus - input
(17, 234)
Data shape after dropping NaN values: (17, 234)
Processing NF0021 - agg - sc_consensus - output
(17, 234)
Data shape after dropping NaN values: (17, 234)
Processing NF0021 - agg - organoid_well_level - input
(50, 147)
Data shape after dropping NaN values: (50, 147)




Processing NF0021 - agg - organoid_well_level - output
(50, 147)
Data shape after dropping NaN values: (50, 147)
Processing NF0021 - agg - organoid_consensus - input
(17, 147)
Data shape after dropping NaN values: (17, 147)
Processing NF0021 - agg - organoid_consensus - output
(17, 147)
Data shape after dropping NaN values: (17, 147)
Processing NF0030 - fs - sc - input
(1328, 231)
Data shape after dropping NaN values: (1284, 231)




Processing NF0030 - fs - sc - output
(1328, 231)
Data shape after dropping NaN values: (1284, 231)




Processing NF0030 - fs - organoid - input
(84, 135)
Data shape after dropping NaN values: (82, 135)
Processing NF0030 - fs - organoid - output
(84, 135)
Data shape after dropping NaN values: (82, 135)
Processing NF0030 - agg - sc_parent_organoid_level - input
(98, 231)
Data shape after dropping NaN values: (98, 231)




Processing NF0030 - agg - sc_parent_organoid_level - output
(98, 231)
Data shape after dropping NaN values: (98, 231)
Processing NF0030 - agg - sc_well_level - input
(49, 231)
Data shape after dropping NaN values: (49, 231)
Processing NF0030 - agg - sc_well_level - output
(49, 231)
Data shape after dropping NaN values: (49, 231)




Processing NF0030 - agg - sc_consensus - input
(17, 231)
Data shape after dropping NaN values: (17, 231)
Processing NF0030 - agg - sc_consensus - output
(17, 231)
Data shape after dropping NaN values: (17, 231)
Processing NF0030 - agg - organoid_well_level - input
(43, 135)
Data shape after dropping NaN values: (43, 135)
Processing NF0030 - agg - organoid_well_level - output
(43, 135)
Data shape after dropping NaN values: (43, 135)
Processing NF0030 - agg - organoid_consensus - input
(17, 135)
Data shape after dropping NaN values: (17, 135)
Processing NF0030 - agg - organoid_consensus - output
(17, 135)
Data shape after dropping NaN values: (17, 135)
Processing NF0040 - fs - sc - input




(2858, 212)
Data shape after dropping NaN values: (2772, 212)
Processing NF0040 - fs - sc - output
(2858, 212)
Data shape after dropping NaN values: (2772, 212)




Processing NF0040 - fs - organoid - input
(386, 138)
Data shape after dropping NaN values: (378, 138)




Processing NF0040 - fs - organoid - output
(386, 138)
Data shape after dropping NaN values: (378, 138)




Processing NF0040 - agg - sc_parent_organoid_level - input
(342, 212)
Data shape after dropping NaN values: (339, 212)




Processing NF0040 - agg - sc_parent_organoid_level - output
(342, 212)
Data shape after dropping NaN values: (339, 212)




Processing NF0040 - agg - sc_well_level - input
(60, 212)
Data shape after dropping NaN values: (60, 212)
Processing NF0040 - agg - sc_well_level - output
(60, 212)
Data shape after dropping NaN values: (60, 212)
Processing NF0040 - agg - sc_consensus - input
(22, 212)
Data shape after dropping NaN values: (22, 212)
Processing NF0040 - agg - sc_consensus - output
(22, 212)
Data shape after dropping NaN values: (22, 212)
Processing NF0040 - agg - organoid_well_level - input




(60, 138)
Data shape after dropping NaN values: (60, 138)
Processing NF0040 - agg - organoid_well_level - output
(60, 138)
Data shape after dropping NaN values: (60, 138)
Processing NF0040 - agg - organoid_consensus - input
(22, 138)
Data shape after dropping NaN values: (22, 138)
Processing NF0040 - agg - organoid_consensus - output
(22, 138)
Data shape after dropping NaN values: (22, 138)
Processing SARCO219 - fs - sc - input
(426, 214)
Data shape after dropping NaN values: (418, 214)




Processing SARCO219 - fs - sc - output
(426, 214)
Data shape after dropping NaN values: (418, 214)




Processing SARCO219 - fs - organoid - input
(198, 146)
Data shape after dropping NaN values: (187, 146)
Processing SARCO219 - fs - organoid - output
(198, 146)
Data shape after dropping NaN values: (187, 146)




Processing SARCO219 - agg - sc_parent_organoid_level - input
(133, 214)
Data shape after dropping NaN values: (132, 214)
Processing SARCO219 - agg - sc_parent_organoid_level - output
(133, 214)
Data shape after dropping NaN values: (132, 214)




Processing SARCO219 - agg - sc_well_level - input
(50, 214)
Data shape after dropping NaN values: (50, 214)
Processing SARCO219 - agg - sc_well_level - output
(50, 214)
Data shape after dropping NaN values: (50, 214)
Processing SARCO219 - agg - sc_consensus - input
(17, 214)
Data shape after dropping NaN values: (17, 214)
Processing SARCO219 - agg - sc_consensus - output
(17, 214)
Data shape after dropping NaN values: (17, 214)
Processing SARCO219 - agg - organoid_well_level - input
(50, 146)
Data shape after dropping NaN values: (50, 146)




Processing SARCO219 - agg - organoid_well_level - output
(50, 146)
Data shape after dropping NaN values: (50, 146)
Processing SARCO219 - agg - organoid_consensus - input
(17, 146)
Data shape after dropping NaN values: (17, 146)
Processing SARCO219 - agg - organoid_consensus - output
(17, 146)
Data shape after dropping NaN values: (17, 146)
Processing SARCO361 - fs - sc - input
(1503, 193)
Data shape after dropping NaN values: (1499, 193)




Processing SARCO361 - fs - sc - output
(1503, 193)
Data shape after dropping NaN values: (1499, 193)




Processing SARCO361 - fs - organoid - input
(229, 143)
Data shape after dropping NaN values: (225, 143)
Processing SARCO361 - fs - organoid - output




(229, 143)
Data shape after dropping NaN values: (225, 143)
Processing SARCO361 - agg - sc_parent_organoid_level - input
(178, 193)
Data shape after dropping NaN values: (178, 193)
Processing SARCO361 - agg - sc_parent_organoid_level - output
(178, 193)
Data shape after dropping NaN values: (178, 193)




Processing SARCO361 - agg - sc_well_level - input
(50, 193)
Data shape after dropping NaN values: (50, 193)
Processing SARCO361 - agg - sc_well_level - output
(50, 193)
Data shape after dropping NaN values: (50, 193)
Processing SARCO361 - agg - sc_consensus - input
(17, 193)
Data shape after dropping NaN values: (17, 193)
Processing SARCO361 - agg - sc_consensus - output
(17, 193)
Data shape after dropping NaN values: (17, 193)
Processing SARCO361 - agg - organoid_well_level - input
(50, 143)
Data shape after dropping NaN values: (50, 143)




Processing SARCO361 - agg - organoid_well_level - output
(50, 143)
Data shape after dropping NaN values: (50, 143)
Processing SARCO361 - agg - organoid_consensus - input
(17, 143)
Data shape after dropping NaN values: (17, 143)
Processing SARCO361 - agg - organoid_consensus - output
(17, 143)
Data shape after dropping NaN values: (17, 143)


