In [1]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from sklearn.decomposition import PCA

In [2]:
def fit_pca_to_the_first_timepoint(
    df: pd.DataFrame,
    timepoint_column: str = "Metadata_Time",
    metadata_columns: "list | None" = None,
    feature_columns: "list | None" = None,
    pca_model: "PCA | None" = None,
) -> pd.DataFrame:
    """
    Fit a PCA model on the earliest timepoint and project every timepoint with it.

    The model is fitted only on the rows whose ``timepoint_column`` equals the
    minimum value of that column, so that later timepoints are expressed in
    the coordinate system of the first one. Rows with a missing value in any
    feature column are excluded from both the fit and the returned embedding.
    The input dataframe is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing all feature, metadata, and timepoint columns.
    timepoint_column : str, optional
        The name of the column containing the timepoint information, by default "Metadata_Time"
    metadata_columns : list, optional
        The names of the columns to carry over into the output next to the
        embedding, by default None (no metadata is carried over).
    feature_columns : list, optional
        The names of the columns passed to the model, by default None
        (every column that is neither a metadata column nor the timepoint column).
    pca_model : sklearn.decomposition.PCA, optional
        The model to fit, by default None. If None, a new two-component PCA is
        created. Any object exposing ``fit`` and ``transform`` works. The model
        passed in is (re)fitted in place.

    Returns
    -------
    pd.DataFrame
        One row per complete input row (original order, fresh 0..n-1 index),
        holding the embedding columns ``PCA0 .. PCA{k-1}`` followed by the
        metadata columns.

    Raises
    ------
    ValueError
        If the first timepoint has no row with a complete set of features.
    """

    metadata_columns = [] if metadata_columns is None else list(metadata_columns)
    if feature_columns is None:
        excluded_columns = set(metadata_columns) | {timepoint_column}
        feature_columns = [
            column for column in df.columns if column not in excluded_columns
        ]
    else:
        feature_columns = list(feature_columns)
    if pca_model is None:
        pca_model = PCA(n_components=2)

    feature_df = df[feature_columns]

    # Positional boolean masks (plain arrays) are used instead of label-based
    # lookups so that a non-unique input index cannot duplicate or misalign rows.
    complete_rows = feature_df.notna().all(axis=1).to_numpy()
    first_time = df[timepoint_column].min()
    first_timepoint_rows = (df[timepoint_column] == first_time).to_numpy()
    fit_rows = complete_rows & first_timepoint_rows
    if not fit_rows.any():
        raise ValueError(
            f"No rows without missing feature values at the first timepoint "
            f"({timepoint_column} == {first_time!r}); cannot fit the PCA model."
        )

    # fit the model to the first timepoint only
    pca_model.fit(feature_df.loc[fit_rows])

    # apply the fitted model to every complete row, whatever its timepoint
    complete_feature_df = feature_df.loc[complete_rows].reset_index(drop=True)
    metadata_df = df.loc[complete_rows, metadata_columns].reset_index(drop=True)
    pca_embeddings = pca_model.transform(complete_feature_df)

    # name the embedding columns after the number of components actually returned
    component_names = [f"PCA{i}" for i in range(pca_embeddings.shape[1])]
    pca_df = pd.DataFrame(pca_embeddings, columns=component_names)

    # add the metadata to the dataframe
    return pd.concat([pca_df, metadata_df], axis=1)

In [3]:
# Location of the combined profile table, relative to the working directory.
CP_scDINO_profile_file_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs.parquet"
)
# strict=True makes a missing file fail here rather than inside the parquet reader.
CP_scDINO_profile_file_path = CP_scDINO_profile_file_path.resolve(strict=True)

# Load the table and preview the first rows.
df = pd.read_parquet(CP_scDINO_profile_file_path)
df.head()

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,1.0,...,0.455522,0.39468,1.649389,0.010468,0.426432,-0.339073,1.587193,-0.362701,0.812067,1.215035
1,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,15.0,...,1.055508,0.703437,-1.417191,0.261842,1.097767,-1.700814,0.564109,0.238833,0.718067,0.768177
2,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,16.0,...,-0.834625,0.309778,-0.677732,1.00843,-0.144541,-1.514545,-0.609423,0.447713,1.357439,1.000067
3,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,26.0,...,-1.519487,1.510775,-0.939313,-0.584098,2.944786,-0.227158,0.076995,-1.431423,-0.097869,0.970456
4,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,37.0,...,-1.171937,0.214552,2.372796,-0.591718,-0.306193,0.506419,-0.360647,1.294736,1.949604,0.107606


In [4]:
# Split the column labels into groups by substring match on the column name.
metadata_columns = [column for column in df.columns if "Metadata" in column]
scDINO_columns = [column for column in df.columns if "scDINO" in column]

# Index.drop operates on the column labels only, so the remaining groups are
# derived without materialising a copy of the data frame.
# CP: every column that is neither metadata nor scDINO.
CP_columns = df.columns.drop(metadata_columns + scDINO_columns)
# CP_scDINO: every non-metadata column.
CP_scDINO_columns = df.columns.drop(metadata_columns)

# Feature sets to embed, keyed by the name used in the output file names.
feature_set_dict = {
    "scDINO": scDINO_columns,
    "CP": CP_columns,
    "CP_scDINO": CP_scDINO_columns,
}

In [5]:
# Two-component PCA shared by every feature set (it is refitted for each one).
# random_state is pinned because the default solver selection may pick a
# randomized SVD for large inputs, which would otherwise make the saved
# embeddings differ between runs.
pca_model = PCA(n_components=2, random_state=0)

In [6]:
# For each feature set: fit on the first timepoint, embed all rows, and write
# the embedding (plus metadata) to its own parquet file.
for feature_set_name, feature_set in tqdm.tqdm(feature_set_dict.items()):
    pca_df = fit_pca_to_the_first_timepoint(
        df=df,
        timepoint_column="Metadata_Time",
        metadata_columns=metadata_columns,
        feature_columns=feature_set,
        pca_model=pca_model,
    )

    # one output file per feature set; create the results folder on first use
    pca_save_path = pathlib.Path(f"../results/pca/{feature_set_name}_pca.parquet")
    pca_save_path = pca_save_path.resolve()
    pca_save_path.parent.mkdir(parents=True, exist_ok=True)

    # the row index is a plain 0..n-1 range, so it is not written to disk
    pca_df.to_parquet(pca_save_path, index=False)

100%|██████████| 3/3 [00:52<00:00, 17.39s/it]
