This notebook explores the data to understand its distribution and shape, and performs PCA and UMAP analyses to better understand its structure.

In [1]:
import pathlib
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import seaborn as sns
import umap
from sklearn.decomposition import PCA

In [2]:
# Toggle: True reads the small testing subset, False reads the full profile.
data_subset = True

# Base locations for figures and derived data, created up front so that
# later cells can write into them.
figure_dir = pathlib.Path("../figures").resolve()
output_data_dir = pathlib.Path("../data").resolve()
for required_dir in (figure_dir, output_data_dir):
    required_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Input profiles: a small testing subset for quick iteration and the full
# feature-selected single-cell profile from the processing module.
input_file_path_subset = pathlib.Path(
    "../data/first_time/live_cell_pyroptosis_wave1_sc_first_time_norm_fs_subset_testing_data.parquet"
).resolve()
input_file_path = pathlib.Path(
    "../../4.processing_profiled_features/data/feature_selected_data/live_cell_pyroptosis_wave1_sc_first_time_norm_fs.parquet"
).resolve()

# Outputs for this analysis are redirected into "first_time" subdirectories.
output_data_dir = pathlib.Path("../data/first_time").resolve()
figure_dir = pathlib.Path("../figures/first_time").resolve()

# Only the parent directories exist at this point; create the redirected ones
# so that downstream parquet/figure writes do not fail on a fresh checkout.
output_data_dir.mkdir(parents=True, exist_ok=True)
figure_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Shared seed for every stochastic reducer in this cell, and the upper bound
# on the number of principal components reported in the scree analysis.
random_state = 0
max_pca_components = 100

# load either the testing subset or the full profile
if data_subset:
    data_df = pd.read_parquet(input_file_path_subset)
else:
    data_df = pd.read_parquet(input_file_path)

# separate the data into features and metadata
metadata_columns = data_df.columns[data_df.columns.str.contains("Metadata")]
metadata_columns_df = data_df[metadata_columns]
features_df = data_df.drop(metadata_columns, axis=1)
# missing feature values are replaced with 0 before dimensionality reduction
features_df = features_df.fillna(0)
print(
    f"Original data shape: {data_df.shape}, features shape: {features_df.shape}, and metadata shape: {metadata_columns_df.shape}"
)

# set the UMAP and PCA parameters
umap_reducer = umap.UMAP(
    n_neighbors=15, min_dist=0.1, n_components=2, random_state=random_state
)
# PCA cannot extract more components than min(n_samples, n_features), so the
# requested count is capped to keep the cell working on small or narrow data.
n_scree_components = min(max_pca_components, *features_df.shape)
# Seeded like the other reducers: for this many components scikit-learn may
# select a randomized SVD solver, whose output otherwise varies between runs.
scree_pca = PCA(n_components=n_scree_components, random_state=random_state)
pca_reducer = PCA(n_components=2, random_state=random_state)

# fit the UMAP model and re-attach the metadata row-by-row (positional index)
umap_embedding = umap_reducer.fit_transform(features_df)
umap_embedding_df = pd.DataFrame(umap_embedding, columns=["UMAP0", "UMAP1"])
umap_embedding_df = pd.concat(
    [metadata_columns_df.reset_index(drop=True), umap_embedding_df], axis=1
)

umap_embeddings_file_path = (output_data_dir / "umap_embeddings.parquet").resolve()
umap_embedding_df.to_parquet(umap_embeddings_file_path)
print(f"UMAP embedding shape: {umap_embedding_df.shape}")

# scree plot analysis: explained variance ratio per principal component
scree_pca.fit(features_df)
pca_variance = scree_pca.explained_variance_ratio_

scree_plot_file_path = (output_data_dir / "scree_plot.parquet").resolve()
scree_plot_df = pd.DataFrame(pca_variance, columns=["Explained Variance"])
# number the components from the fitted result rather than the requested
# maximum, so the column always matches the number of variance values
scree_plot_df["Principal Component"] = range(1, len(pca_variance) + 1)
scree_plot_df.to_parquet(scree_plot_file_path)

# perform 2-component PCA on the data and re-attach the metadata
pca_embedding = pca_reducer.fit_transform(features_df)
pca_embedding_df = pd.DataFrame(pca_embedding, columns=["PCA0", "PCA1"])
pca_embedding_df = pd.concat(
    [metadata_columns_df.reset_index(drop=True), pca_embedding_df], axis=1
)

pca_embeddings_file_path = (output_data_dir / "pca_embeddings.parquet").resolve()
pca_embedding_df.to_parquet(pca_embeddings_file_path)
print(f"PCA embedding shape: {pca_embedding_df.shape}")
# the last expression of the cell is what the notebook renders
pca_embedding_df.head()

Original data shape: (4800, 824), features shape: (4800, 790), and metadata shape: (4800, 34)


  warn(


UMAP embedding shape: (4800, 36)
PCA embedding shape: (4800, 36)


Unnamed: 0,Metadata_treatment,Metadata_Well,Metadata_number_of_singlecells,Metadata_FOV,Metadata_Plate,Metadata_treatment1,Metadata_treatment2,Metadata_treatment1_dose,Metadata_treatment1_unit,Metadata_treatment2_dose,...,Metadata_Image_PathName_CL488,Metadata_Image_PathName_CL561,Metadata_Image_PathName_DNA,Metadata_Image_PathName_GSDM,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_treatment_serum,Metadata_Time,PCA0,PCA1
0,H2O2 100 nM,M10,743,1,20241025T225218,H2O2,,100,nM,,...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,532.767754,23.653551,H2O2 100 nM NuSerum,9,-16.91917,-7.710462
1,H2O2 100 nM,M10,743,1,20241025T225218,H2O2,,100,nM,,...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,603.320416,26.482987,H2O2 100 nM NuSerum,9,-8.894768,-4.441131
2,H2O2 100 nM,M10,743,1,20241025T225218,H2O2,,100,nM,,...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,1216.385397,24.80254,H2O2 100 nM NuSerum,9,-11.016948,-3.638722
3,H2O2 100 nM,M10,743,1,20241025T225218,H2O2,,100,nM,,...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,182.040942,26.639713,H2O2 100 nM NuSerum,9,-12.297045,-6.859914
4,H2O2 100 nM,M10,743,1,20241025T225218,H2O2,,100,nM,,...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,/gpfs/alpine1/scratch/mlippincott@xsede.org/py...,1918.977283,28.768102,H2O2 100 nM NuSerum,9,-9.081488,-2.338611
