In [1]:
import pathlib

import numpy as np
import pandas as pd
import umap

In [2]:
# Where each profile table is read from and where its UMAP table is written.
# Layout: data_dict[patient][profile_type] -> {"input_path", "output_path"}.
profiles_dir = pathlib.Path("../../data/NF0014/image_based_profiles")
results_dir = pathlib.Path("../results/NF0014")

data_dict = {
    "NF0014": {
        "organoid_feature_selection": {
            "input_path": profiles_dir / "3.organoid_fs_profiles.parquet",
            "output_path": results_dir / "3.organoid_fs_profiles_umap.parquet",
        },
        # Profile types that are currently switched off; uncomment an entry
        # to have the UMAP loop process it as well.
        # "sc_feature_selection": {
        #     "input_path": profiles_dir / "3.sc_fs_profiles.parquet",
        #     "output_path": results_dir / "3.sc_fs_profiles_umap.parquet",
        # },
        # "sc_consensus_profiles": {
        #     "input_path": profiles_dir / "4.sc_consensus_profiles.parquet",
        #     "output_path": results_dir / "4.sc_consensus_profiles_umap.parquet",
        # },
        # "organoid_consensus_profiles": {
        #     "input_path": profiles_dir / "4.organoid_consensus_profiles.parquet",
        #     "output_path": results_dir / "4.organoid_consensus_profiles_umap.parquet",
        # },
    }
}

# Make sure the results folder exists before anything is written into it.
results_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Columns that describe a sample rather than measure it. Whichever of these
# are present in a profile table are carried into the UMAP output and kept out
# of the embedding input; names missing from a table are simply skipped.
# The order here is the column order of the saved metadata.
metadata_columns = [
    "patient", "object_id",
    "unit", "dose", "treatment",
    "image_set", "Well",
    "single_cell_count", "parent_organoid",
    "MOA",
]

In [4]:
# Embed every configured profile table into two UMAP dimensions and save the
# coordinates alongside the sample metadata.
# random_state is fixed so repeated runs use the same seed.
umap_object = umap.UMAP(
    n_neighbors=15, min_dist=0.1, metric="euclidean", random_state=0
)

for dataset, profiles in data_dict.items():
    # `profiles` maps profile type -> {"input_path", "output_path"}; a distinct
    # name avoids rebinding the outer loop variable inside the inner loop.
    for profile_type, paths in profiles.items():
        print(f"Processing {dataset} - {profile_type}")

        # Load the data
        raw_df = pd.read_parquet(paths["input_path"])
        print(raw_df.shape)

        # Remove feature columns that contain NaN so the embedding input is
        # complete. Metadata columns are exempt: a single missing label must
        # not make the whole column vanish from the saved table.
        is_metadata = raw_df.columns.isin(metadata_columns)
        has_nan = raw_df.isna().any(axis=0).to_numpy()
        df = raw_df.loc[:, is_metadata | ~has_nan]
        print(f"Data shape after dropping NaN values: {df.shape}")

        # Metadata columns actually present, in the order of metadata_columns.
        metadata_subset = [col for col in metadata_columns if col in df.columns]
        metadata_df = df[metadata_subset]
        features_df = df.drop(columns=metadata_subset)

        # Extract features and apply UMAP
        umap_embedding = umap_object.fit_transform(features_df)

        # Create a DataFrame with UMAP results; the metadata index is reset so
        # rows line up positionally with the embedding rows.
        umap_df = pd.DataFrame(umap_embedding, columns=["UMAP1", "UMAP2"])
        umap_df = pd.concat([metadata_df.reset_index(drop=True), umap_df], axis=1)

        # Save the UMAP results, creating the destination folder if needed so
        # newly enabled datasets do not depend on a hard-coded mkdir elsewhere.
        paths["output_path"].parent.mkdir(parents=True, exist_ok=True)
        umap_df.to_parquet(paths["output_path"], index=False)

Processing NF0014 - organoid_feature_selection
(102, 157)
Data shape after dropping NaN values: (102, 149)


  warn(


In [6]:
# Show the profile table left over from the last loop iteration, i.e. after
# the NaN column filter was applied.
# NOTE(review): the execution counts show this cell (In [6]) was run after the
# `umap_df` cell that appears below it (In [5]); re-run with
# Restart Kernel -> Run All so the counts are sequential.
df

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,MOA,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_EXTENT,...,Texture_Organoid_BF_Variance_256.1,Texture_Organoid_DNA_Contrast_256.1,Texture_Organoid_DNA_Variance_256.1,Texture_Organoid_ER_Contrast_256.1,Texture_Organoid_ER_Information.Measure.of.Correlation.1_256.1,Texture_Organoid_ER_Sum.Average_256.1,Texture_Organoid_ER_Variance_256.1,Texture_Organoid_Mito_Contrast_256.1,Texture_Organoid_Mito_Sum.Average_256.1,Texture_Organoid_Mito_Variance_256.1
0,NF0014,40,uM,10,Mirdametinib,G8-1,G8,MEK1/MEK2 Inhibitor,-1.718171,-1.664739,...,-1.334719,-1.068404,-1.112068,-0.635342,-2.238332,-1.169085,-0.882218,-0.972143,-1.376009,-0.987831
1,NF0014,24,uM,1,Fimepinostat,D5-1,D5,PI3K and HDAC inhibitor,-2.085979,-2.484067,...,-0.427715,-0.455481,-1.543011,0.074815,5.133408,-1.179782,-0.995430,1.584246,-1.498039,-1.093515
2,NF0014,85,uM,1,Mirdametinib,F8-1,F8,MEK1/MEK2 Inhibitor,0.130074,-0.663134,...,0.068257,-0.455344,-1.153488,0.648688,1.518773,0.786162,-0.364094,-0.886440,-0.923582,-1.193731
3,NF0014,18,nM,10,STAURO,G11-1,G11,Apoptosis,-2.223655,0.265362,...,-1.677969,-1.018121,-1.357958,-0.786237,-2.838483,-1.520782,-1.059963,-0.601197,-1.196517,-0.730073
4,NF0014,24,uM,1,Binimetinib,G7-1,G7,MEK1/MEK2 Inhibitor,-1.728123,-0.369563,...,-0.549648,-0.699480,-0.593020,-0.760946,-0.497278,-1.270120,-1.007393,-0.507220,-0.360881,0.078821
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,NF0014,74,uM,1,Binimetinib,F7-2,F7,MEK1/MEK2 Inhibitor,-0.743143,-1.137280,...,-0.258725,0.354604,0.253493,0.259622,-0.206571,-0.240520,-0.235618,0.689139,-0.056024,0.196205
98,NF0014,38,%,1,DMSO,E4-2,E4,Control,-1.271065,0.901799,...,-0.851513,-0.519411,-0.237923,-0.394669,-1.765124,-0.495324,-0.358874,-0.571201,-0.880099,-0.748066
99,NF0014,105,uM,10,Trametinib,F10-1,F10,MEK1/MEK2 Inhibitor,-1.102919,-2.704329,...,-1.040991,1.002653,-1.014439,3.904562,2.053498,-0.284304,0.063558,1.703842,-0.768823,-0.300368
100,NF0014,42,uM,1,Rapamycin,F5-1,F5,mTOR inhibitor,-1.380469,0.549786,...,-0.978262,-0.503257,-0.133139,-0.765001,-1.389958,-1.349894,-1.024981,-0.433182,-0.724996,-0.373666


In [5]:
# Show the UMAP table from the last loop iteration: the metadata columns that
# were present in the profile, followed by the UMAP1 / UMAP2 coordinates.
umap_df

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,MOA,UMAP1,UMAP2
0,NF0014,40,uM,10,Mirdametinib,G8-1,G8,MEK1/MEK2 Inhibitor,2.059253,3.060568
1,NF0014,24,uM,1,Fimepinostat,D5-1,D5,PI3K and HDAC inhibitor,0.709571,-0.508804
2,NF0014,85,uM,1,Mirdametinib,F8-1,F8,MEK1/MEK2 Inhibitor,-0.347319,3.271499
3,NF0014,18,nM,10,STAURO,G11-1,G11,Apoptosis,2.493157,1.619611
4,NF0014,24,uM,1,Binimetinib,G7-1,G7,MEK1/MEK2 Inhibitor,1.081486,3.374038
...,...,...,...,...,...,...,...,...,...,...
97,NF0014,74,uM,1,Binimetinib,F7-2,F7,MEK1/MEK2 Inhibitor,-0.157350,3.774975
98,NF0014,38,%,1,DMSO,E4-2,E4,Control,1.096117,5.022879
99,NF0014,105,uM,10,Trametinib,F10-1,F10,MEK1/MEK2 Inhibitor,-1.138737,4.063303
100,NF0014,42,uM,1,Rapamycin,F5-1,F5,mTOR inhibitor,2.043301,2.027698
