# Preprocessing Notebook

This notebook runs the data processing steps using the default settings and saves the resulting dataframes for use in the accompanying notebooks.

In [1]:
import cellPLATO as cp

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Gate for the save cell further down: when True, the processed dataframes are
# written as CSV files under cp.SAVED_DATA_PATH.
OVERWRITE_DATAFRAMES = True

Finished running cellPLATO initialization and loaded config.
Initializing:  2022_12_29_11AM_HSPConly-on-EL08-AMD3100
Hypthesis testing using:  st.ttest_ind
Plots will be exported to:  D://Michael_Shannon/SHIRA_COLLAB/2022-10-24_HSPConly_OUTPUT/2022_12_29_11AM_HSPConly-on-EL08-AMD3100\2022-12-29_11-04-14-401340\plots/
Using unique embedding per dataset shortname:  2022_12_29_11AM_HSPConly-on-EL08-AMD3100
Exporting static Superplots


NameError: name 'DRAW_SUPERPLOTS_grays' is not defined

In [None]:
# Get the experiment list from the experiments listed in the config
exp_list = cp.populate_experiment_list()
# Show the list so it can be checked before it is passed to the processing steps
display(exp_list)

In [None]:
# Load, process and combine the dataframes (including segmentation and migration calculations)
# for the experiments in exp_list; the result is kept as comb_df.
comb_df = cp.combine_dataframes(exp_list)

In [None]:
# Run cellPLATO's measurement pipeline on comb_df; the result replaces comb_df.
# NOTE(review): this cell feeds comb_df back into itself, so re-running it on its own
# passes the already-processed frame in again. Re-run from the combine cell instead.
comb_df = cp.measurement_pipeline(comb_df)

In [None]:
# apply_filters returns a dataframe plus the filter counts. The returned dataframe is
# bound back to the name comb_df, so every later cell in the step-by-step path works on
# whatever apply_filters returned, not on the pre-filter frame.
# Per the original note, apply_filters returns the filtered dataframe and also adds an
# "included" column to its input - not verifiable from this notebook, TODO confirm.
# NOTE(review): the all-in-one cell at the end of this notebook stores the same return
# value as filt_df and leaves comb_df untouched - confirm which naming is intended.
comb_df, filt_counts = cp.apply_filters(comb_df)


In [None]:
# Process a time-averaged DataFrame from comb_df (at this point, the frame returned
# by apply_filters in the previous cell) and display it for inspection.
tavg_df = cp.time_average(comb_df)
display(tavg_df)

In [None]:
# Make summary calculations from the time-averaged dataframe.
# Both frames come from the same function; only the avg_per_rep flag differs.

# Per condition:
avg_df = cp.average_per_condition(tavg_df)

# Per replicate:
# NOTE(review): repavg_df is not displayed, used or saved by any later cell in this notebook.
repavg_df = cp.average_per_condition(tavg_df, avg_per_rep=True)

In [None]:
# Display the per-condition averages computed in the previous cell
avg_df


In [None]:
# Dimension reduction pipeline: run on comb_df with the factors configured in cp.DR_FACTORS
dr_df = cp.dr_pipeline(
    comb_df,
    dr_factors=cp.DR_FACTORS,
)

In [None]:
# Clustering cell behavior: cp.hdbscan_clustering on dr_df, with cp.CLUSTER_BY as the
# cluster_by setting and plotting switched off (plot=False)
lab_dr_df = cp.hdbscan_clustering(
    dr_df,
    cluster_by=cp.CLUSTER_BY,
    plot=False,
)

In [None]:
# Run the trajectory clustering pipeline on the labelled dataframe.
# traj_factor follows cp.CLUSTER_BY (the same setting the clustering cell passes as
# cluster_by, and the same value the all-in-one cell passes as traj_factor) instead of a
# hard-coded 'umap', so changing CLUSTER_BY in the config keeps both steps in agreement.
# The call returns the updated dataframe plus the trajectory and cluster lists.
lab_dr_df, traj_list, cluster_lst = cp.trajectory_clustering_pipeline(
    lab_dr_df,
    traj_factor=cp.CLUSTER_BY,
    dist_metric='hausdorff',
    filename_out='std_dr_df_traj',
)

In [None]:
# Run cellPLATO's cluster-switching pipeline on the labelled dataframe.
# The return value (if any) is not kept.
cp.cluster_switching_pipeline(lab_dr_df)

In [None]:
if OVERWRITE_DATAFRAMES:
    # Save dataframes to shared data folder.
    # Create the folder first so to_csv cannot fail on a missing directory.
    os.makedirs(cp.SAVED_DATA_PATH, exist_ok=True)

    # File name -> dataframe; one entry per CSV that the accompanying notebooks load.
    frames_to_save = {
        'tavg_df.csv': tavg_df,
        'comb_df.csv': comb_df,
        'dr_df.csv': dr_df,
        'lab_dr_df.csv': lab_dr_df,
    }
    for csv_name, frame in frames_to_save.items():
        # os.path.join adds a separator only when SAVED_DATA_PATH does not already end
        # with one, so the file always lands inside the folder (plain `+` would glue the
        # file name onto the last path component if the trailing slash were missing).
        frame.to_csv(os.path.join(cp.SAVED_DATA_PATH, csv_name))

### Alternatively, run all the pipelines in a single cell to generate and save all outputs

In [None]:
# Minimal pipelines: every stage of the workflow in one cell.
# Stages run in order: load/combine -> measure -> filter -> dimension reduction ->
# comparative visualization -> cluster analysis -> trajectory clustering -> cluster switching.
comb_df = cp.combine_dataframes(cp.populate_experiment_list())
comb_df = cp.measurement_pipeline(comb_df)
filt_df, filt_counts = cp.apply_filters(comb_df)
dr_df = cp.dr_pipeline(
    filt_df,
    dr_factors=cp.DR_FACTORS,
)
cp.comparative_visualization_pipeline(dr_df)

lab_dr_df = cp.cluster_analysis_pipeline(dr_df, cp.CLUSTER_BY)
lab_dr_df, traj_list, cluster_lst = cp.trajectory_clustering_pipeline(
    lab_dr_df,
    traj_factor=cp.CLUSTER_BY,
    dist_metric='hausdorff',
    filename_out='std_dr_df_traj',
)
cp.cluster_switching_pipeline(lab_dr_df)

# 