In [1]:
import pathlib

import pandas as pd
import umap

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

    # Get the current working directory
cwd = pathlib.Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd

else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

In [2]:
# paths to data
data_dict = {
    "sc": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/sc_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(f"{root_dir}/5.EDA/results/sc_umap.parquet").resolve(),
    },
    "sc_fs": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/sc_fs_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(
            f"{root_dir}/5.EDA/results/sc_fs_umap.parquet"
        ).resolve(),
    },
    "sc_agg": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/sc_agg_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(
            f"{root_dir}/5.EDA/results/sc_agg_umap.parquet"
        ).resolve(),
    },
    "organoid": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/organoid_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(
            f"{root_dir}/5.EDA/results/organoid_umap.parquet"
        ).resolve(),
    },
    "organoid_fs": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/organoid_fs_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(
            f"{root_dir}/5.EDA/results/organoid_fs_umap.parquet"
        ).resolve(),
    },
    "organoid_agg": {
        "input": pathlib.Path(
            f"{root_dir}/data/all_patient_profiles/organoid_agg_profiles.parquet"
        ).resolve(strict=True),
        "output": pathlib.Path(
            f"{root_dir}/5.EDA/results/organoid_agg_umap.parquet"
        ).resolve(),
    },
}

In [11]:
metadata_columns = [
    "patient",
    "object_id",
    "unit",
    "dose",
    "treatment",
    "image_set",
    "Well",
    "single_cell_count",
    "parent_organoid",
    "Treatment",
    "Pathway",
    "Function",
    "Class",
    "Therapeutic Categories",
]

In [5]:
umap_object = umap.UMAP(
    n_neighbors=15, min_dist=0.1, metric="euclidean", random_state=0
)

for dataset, paths in data_dict.items():
    # Load the data
    df = pd.read_parquet(data_dict[dataset]["input"])

    metadata_df = df.copy()
    metadata_subset = []
    for col in metadata_columns:
        if col in df.columns:
            metadata_subset.append(col)

    metadata_df = df[metadata_subset]
    features_df = df.drop(columns=metadata_columns, errors="ignore")
    print(features_df.shape)
    # remove NaN values
    features_df = features_df.dropna(axis=1)
    print(f"Data shape after dropping NaN values: {features_df.shape}")
    # Extract features and apply UMAP

    umap_embedding = umap_object.fit_transform(features_df)

    # Create a DataFrame with UMAP results
    umap_df = pd.DataFrame(umap_embedding, columns=["UMAP1", "UMAP2"])
    umap_df = pd.concat([metadata_df.reset_index(drop=True), umap_df], axis=1)
    # Save the UMAP results
    umap_df.to_parquet(data_dict[dataset]["output"], index=False)

(11232, 1922)
Data shape after dropping NaN values: (11232, 1568)


  warn(


(11232, 314)
Data shape after dropping NaN values: (11232, 271)




(64, 314)
Data shape after dropping NaN values: (64, 314)




(1481, 650)
Data shape after dropping NaN values: (1481, 8)




ValueError: could not convert string to float: 'Mirdametinib'

In [12]:
df = pd.read_parquet(data_dict[dataset]["input"])

metadata_df = df.copy()
metadata_subset = []
for col in metadata_columns:
    if col in df.columns:
        metadata_subset.append(col)

metadata_df = df[metadata_subset]
features_df = df.drop(columns=metadata_columns, errors="ignore")
print(features_df.shape)
# remove NaN values
# features_df = features_df.dropna(axis=1)
# print(f"Data shape after dropping NaN values: {features_df.shape}")
# Extract features and apply UMAP

(1481, 650)


In [15]:
df

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,single_cell_count,Colocalization_Organoid_AGP.BF_MEAN.CORRELATION.COEFF,Colocalization_Organoid_AGP.BF_MEDIAN.CORRELATION.COEFF,...,Treatment_x,Pathway_x,Function_x,Class_x,Therapeutic Categories_x,Treatment_y,Pathway_y,Function_y,Class_y,Therapeutic Categories_y
0,NF0014,40,uM,10,Mirdametinib,G8-1,G8,8.0,-0.777016,-0.777016,...,Mirdametinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor,Mirdametinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor
1,NF0014,24,uM,1,Fimepinostat,D5-1,D5,5.0,1.217722,1.217722,...,Fimepinostat,mTOR;NF-kB; MEK1/2,Inhibits PI3K,Small Molecule,Investigational,Fimepinostat,mTOR;NF-kB; MEK1/2,Inhibits PI3K,Small Molecule,Investigational
2,NF0014,85,uM,1,Mirdametinib,F8-1,F8,9.0,0.836476,0.836476,...,Mirdametinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor,Mirdametinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor
3,NF0014,18,nM,10,Staurosporine,G11-1,G11,1.0,0.519897,0.519897,...,Staurosporine,PKC,Inhibits PKC,Small Molecule,Experimental,Staurosporine,PKC,Inhibits PKC,Small Molecule,Experimental
4,NF0014,24,uM,1,Binimetinib,G7-1,G7,7.0,-0.169507,-0.169507,...,Binimetinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor,Binimetinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1476,SARCO361,15,uM,10,Binimetinib,C8-5,C8,2.0,0.527569,0.527569,...,Binimetinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor,Binimetinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor
1477,SARCO361,40,uM,1,Rapamycin,F5-1,F5,4.0,0.208243,0.208243,...,Rapamycin,mTOR,Inhibits mTOR,Small Molecule,Kinase Inhibitor,Rapamycin,mTOR,Inhibits mTOR,Small Molecule,Kinase Inhibitor
1478,SARCO361,3,nM,10,Staurosporine,C11-5,C11,,-3.999578,-3.999578,...,Staurosporine,PKC,Inhibits PKC,Small Molecule,Experimental,Staurosporine,PKC,Inhibits PKC,Small Molecule,Experimental
1479,SARCO361,7,uM,1,Imatinib,D6-7,D6,3.0,0.949983,0.949983,...,Imatinib,MEK1/2,Inhibits PI3K,Small Molecule,Kinase Inhibitor,Imatinib,MEK1/2,Inhibits PI3K,Small Molecule,Kinase Inhibitor


In [14]:
metadata_df

Unnamed: 0,patient,object_id,unit,dose,treatment,image_set,Well,single_cell_count
0,NF0014,40,uM,10,Mirdametinib,G8-1,G8,8.0
1,NF0014,24,uM,1,Fimepinostat,D5-1,D5,5.0
2,NF0014,85,uM,1,Mirdametinib,F8-1,F8,9.0
3,NF0014,18,nM,10,Staurosporine,G11-1,G11,1.0
4,NF0014,24,uM,1,Binimetinib,G7-1,G7,7.0
...,...,...,...,...,...,...,...,...
1476,SARCO361,15,uM,10,Binimetinib,C8-5,C8,2.0
1477,SARCO361,40,uM,1,Rapamycin,F5-1,F5,4.0
1478,SARCO361,3,nM,10,Staurosporine,C11-5,C11,
1479,SARCO361,7,uM,1,Imatinib,D6-7,D6,3.0


In [13]:
features_df

Unnamed: 0,Colocalization_Organoid_AGP.BF_MEAN.CORRELATION.COEFF,Colocalization_Organoid_AGP.BF_MEDIAN.CORRELATION.COEFF,Colocalization_Organoid_AGP.BF_MIN.CORRELATION.COEFF,Colocalization_Organoid_AGP.BF_MAX.CORRELATION.COEFF,Colocalization_Organoid_AGP.BF_MEAN.MANDERS.COEFF.M1,Colocalization_Organoid_AGP.BF_MEDIAN.MANDERS.COEFF.M1,Colocalization_Organoid_AGP.BF_MIN.MANDERS.COEFF.M1,Colocalization_Organoid_AGP.BF_MAX.MANDERS.COEFF.M1,Colocalization_Organoid_AGP.BF_MEAN.MANDERS.COEFF.M2,Colocalization_Organoid_AGP.BF_MEDIAN.MANDERS.COEFF.M2,...,Treatment_x,Pathway_x,Function_x,Class_x,Therapeutic Categories_x,Treatment_y,Pathway_y,Function_y,Class_y,Therapeutic Categories_y
0,-0.777016,-0.777016,-0.777016,-0.777016,0.250001,0.250001,0.250001,0.250001,-1.577356,-1.577356,...,Mirdametinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor,Mirdametinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor
1,1.217722,1.217722,1.217722,1.217722,0.250001,0.250001,0.250001,0.250001,-2.138977,-2.138977,...,Fimepinostat,mTOR;NF-kB; MEK1/2,Inhibits PI3K,Small Molecule,Investigational,Fimepinostat,mTOR;NF-kB; MEK1/2,Inhibits PI3K,Small Molecule,Investigational
2,0.836476,0.836476,0.836476,0.836476,0.250001,0.250001,0.250001,0.250001,-1.280199,-1.280199,...,Mirdametinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor,Mirdametinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor
3,0.519897,0.519897,0.519897,0.519897,0.250001,0.250001,0.250001,0.250001,-1.474639,-1.474639,...,Staurosporine,PKC,Inhibits PKC,Small Molecule,Experimental,Staurosporine,PKC,Inhibits PKC,Small Molecule,Experimental
4,-0.169507,-0.169507,-0.169507,-0.169507,0.250000,0.250000,0.250000,0.250000,-0.522808,-0.522808,...,Binimetinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor,Binimetinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1476,0.527569,0.527569,0.527569,0.527569,0.388212,0.388212,0.388212,0.388212,-0.675803,-0.675803,...,Binimetinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor,Binimetinib,MEK1/2,Inhibits MEK1/2,Small Molecule,Kinase Inhibitor
1477,0.208243,0.208243,0.208243,0.208243,0.430851,0.430851,0.430851,0.430851,-0.458107,-0.458107,...,Rapamycin,mTOR,Inhibits mTOR,Small Molecule,Kinase Inhibitor,Rapamycin,mTOR,Inhibits mTOR,Small Molecule,Kinase Inhibitor
1478,-3.999578,-3.999578,-3.999578,-3.999578,0.430851,0.430851,0.430851,0.430851,1.449376,1.449376,...,Staurosporine,PKC,Inhibits PKC,Small Molecule,Experimental,Staurosporine,PKC,Inhibits PKC,Small Molecule,Experimental
1479,0.949983,0.949983,0.949983,0.949983,0.430851,0.430851,0.430851,0.430851,0.737348,0.737348,...,Imatinib,MEK1/2,Inhibits PI3K,Small Molecule,Kinase Inhibitor,Imatinib,MEK1/2,Inhibits PI3K,Small Molecule,Kinase Inhibitor


In [None]:
print(features_df.columns.tolist())
# get thee dtype of the features_df
print(features_df.dtypes)