# CellPLATO | Cell Plasticity Analysis Tool (Trackmate version)

Step 1: Fill in the config file!

Also, make sure your data is set up in the following two-tiered format:

        Master
            ├── Condition 1
            │   ├── Replicate 1
            |   |       ├── tracks.h5
            │   ├── Replicate 2
            |   |       ├── tracks.h5            
            │   └── Replicate 3
            |           └── tracks.h5            
            │  
            └── Condition 2
                ├── Replicate 1
                |       ├── tracks.h5
                ├── Replicate 2
                |       ├── tracks.h5            
                └── Replicate 3
                        └── tracks.h5    

<div class="alert alert-block alert-danger">
Set your kernel to 'cellPLATO' before continuing
</div>

<div class="alert alert-block alert-success">
<h2>1. Start by importing packages for cellPLATO</h2>
</div>

This includes cellPLATO itself, and all of the modules you will need

* Import these packages, checking that you have them
* We're also importing a lot of the modules in cellPLATO; if this cell runs successfully, you are good to go!

In [None]:
import cellPLATO as cp

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import imageio

import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
import matplotlib.cm as cm
import plotly.graph_objects as go
import plotly.express as px
import re
import glob
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
import requests
import zipfile
import ipywidgets as widgets
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import itertools
from matplotlib.gridspec import GridSpec
import requests


# Notebook-level switch: the cells that write dataframes to CSV re-assign this flag and check it before saving.
OVERWRITE_DATAFRAMES = True



# Import your experiment list

Check that the list generated in the next cell contains your conditions and replicates

In [None]:
# Get the experiment list from the experiments listed in the config 
exp_list = cp.populate_experiment_list()
display(exp_list)
# Show the configured output location; cp.SAVED_DATA_PATH is the prefix used for the CSV checkpoints written later in this notebook.
print(cp.SAVED_DATA_PATH)

### Format the trackmate dataframe

This part was inspired by Guillaume Jacquemet's trackmate processing modules

In [None]:
# Load the TrackMate spots exports. The regex presumably selects files whose name contains "spots" and ends in ".csv" - TODO confirm how load_and_populate applies it.
merged_spots_df, spots_metadata = cp.load_and_populate(r'.*spots.*\.csv')

# Same for the tracks exports.
# NOTE(review): merged_tracks_df / tracks_metadata are not referenced by any later cell visible in this notebook.
merged_tracks_df, tracks_metadata = cp.load_and_populate(r'.*tracks.*\.csv')

### Change the trackmate dataframe to the cellPLATO format

In [None]:
# Convert the merged TrackMate spots table into the cellPLATO-format dataframe (comb_df) used by the rest of the notebook.
comb_df = cp.trackmate_to_cellPLATO(merged_spots_df)

## Time window decision: before proceeding, let's figure out what time window to use for the cellPLATO migration features

In [None]:
# Check your current time window settings
analysis = cp.analyze_time_window_settings(comb_df)

In [None]:
# if you want to change the time window settings, you can do so in the config.py file at this line:
# MigrationTimeWindow_minutes = 12.0 

<div class="alert alert-block alert-success">
<h2>2. Measurements of morphology and migration</h2>
</div>

### This cell does migration and morphology measurements for all of the cells at each timepoint 

In [None]:
# Run the migration/morphology measurements; comb_df is replaced by the pipeline output.
# new_factors is presumably the list of factor names added by the pipeline - TODO confirm.
comb_df, new_factors = cp.measurement_pipeline(comb_df, mixed=cp.MIXED_SCALING, factors_to_timeaverage = cp.ALL_FACTORS) 
display(new_factors)

# Returns a filtered dataframe, while also adding included column to comb_df
# NOTE(review): no filter_dict is passed here and the first return value overwrites comb_df -
# confirm whether comb_df afterwards holds only the rows that passed the config filters.
comb_df, filt_counts = cp.apply_filters(comb_df)

# Process a time-averaged DataFrame
tavg_df = cp.time_average_trackmate(comb_df)

### Interpolate or split tracks with missing frames

As the TrackMate data had missing frames due to missed detections, one can either fill them by interpolation or split the tracks.
Here I chose interpolation. I believe this is an option in TrackMate, and it is certainly an option in other tracking methodologies. If it is not done upstream, you can do it here.

In [None]:
# Fix gaps with chosen method
# NOTE(review): the measurements and tavg_df were computed in the previous cell, i.e. before this gap fix.
# The NaN-handling notes later in this notebook say the calculations should be redone on the
# gap-fixed dataframe - confirm the intended order.
comb_df = cp.fix_track_gaps(
    comb_df, 
    method='fill',  # 'fill', 'split', or 'auto'
    verbose=True
)

In [None]:
# Checkpoint the per-timepoint and time-averaged tables as CSV files under cp.SAVED_DATA_PATH.
OVERWRITE_DATAFRAMES = True

if OVERWRITE_DATAFRAMES:
    for _table, _filename in ((comb_df, 'comb_df.csv'), (tavg_df, 'tavg_df.csv')):
        _table.to_csv(cp.SAVED_DATA_PATH + _filename, index=False)

In [None]:
# Reload both checkpoints (comb_df and tavg_df) from the CSV files written by the previous cell.
comb_df = pd.read_csv(cp.SAVED_DATA_PATH + 'comb_df.csv')
tavg_df = pd.read_csv(cp.SAVED_DATA_PATH + 'tavg_df.csv')

### Define trackmate factors

In [None]:
# TrackMate morphological and intensity features.
# One name per line, grouped by name prefix; the order is the same as before.
TRACKMATE_FEATURES = [
    # Spot basics
    'RADIUS',
    'VISIBILITY',
    # Channel 1 intensity
    'MEAN_INTENSITY_CH1',
    'MEDIAN_INTENSITY_CH1',
    'MIN_INTENSITY_CH1',
    'MAX_INTENSITY_CH1',
    'TOTAL_INTENSITY_CH1',
    'STD_INTENSITY_CH1',
    # Channel 2 intensity
    'MEAN_INTENSITY_CH2',
    'MEDIAN_INTENSITY_CH2',
    'MIN_INTENSITY_CH2',
    'MAX_INTENSITY_CH2',
    'TOTAL_INTENSITY_CH2',
    'STD_INTENSITY_CH2',
    # Contrast / signal-to-noise per channel
    'CONTRAST_CH1',
    'SNR_CH1',
    'CONTRAST_CH2',
    'SNR_CH2',
    # Ellipse columns
    'ELLIPSE_X0',
    'ELLIPSE_Y0',
    'ELLIPSE_MAJOR',
    'ELLIPSE_MINOR',
    'ELLIPSE_THETA',
    'ELLIPSE_ASPECTRATIO',
    # Shape descriptors
    'AREA',
    'PERIMETER',
    'CIRCULARITY',
    'SOLIDITY',
    'SHAPE_INDEX',
]


### Define cellPLATO migration factors

In [None]:
# Migration parameters calculated by cellPLATO.
# One name per line; the order is the same as before.
MIGRATION_FEATURES = [
    'euclidean_dist',
    'segment_length',
    'cumulative_length',
    'speed',
    'orientedness',
    'directedness',
    'turn_angle',
    'endpoint_dir_ratio',
    'dir_autocorr',
    'outreach_ratio',
    'MSD',
    'max_dist',
    'glob_turn_deg',
    'arrest_coefficient',
    # Ripley columns
    'rip_p',
    'rip_K',
    'rip_L',
]

### Put them all together!

In [None]:
# Candidate factors for dimensionality reduction: TrackMate features first, then the migration features.
DR_FACTORS = [*TRACKMATE_FEATURES, *MIGRATION_FEATURES]

### Make a plot of any factor

In [None]:
# Plot one factor from the time-averaged table; change `factor` to plot a different one.
f=cp.plots_of_differences_sns(tavg_df,factor='SHAPE_INDEX')
f.show()

### Make a timeplot of any factor

In [None]:
# Use filt_df or comb_df depending on what you want to see
# NOTE: filt_df is only created by the optional filtering cell further down, so run that cell first if you want to plot it here.
f=cp.multi_condition_timeplot(comb_df, factor='CONTRAST_CH1')
f.show()

### Optional: do filtering on the data (on top of what has been stated in the config file)

In [None]:
# User-defined filters in dict {factor:(min, max)}
# Entries that are commented out inside the dict are example filters that are currently disabled.

data_filters = {
#   "speed": (10, 100),
  "AREA": (1, 10000),
#    "frame": (0, 450), # Warning: range will change if self-normalized
  # "ntpts": (2,1800)
}

# Returns a filtered dataframe, while also adding included column to comb_df
# (unlike the earlier call, the result is stored in filt_df, so comb_df is not re-bound here)
filt_df, filt_counts = cp.apply_filters(comb_df,how='any', filter_dict=data_filters)

# Visualize the filtering using the filtered table and filt_counts.
fig = cp.visualize_filtering(filt_df, filt_counts)


# Plot all metrics

This cell makes comparative plots for every single metric and saves them in your output folder

* Plots of difference
* Timeplots of difference
* Marginal xy plots
* Simple bar plots
* Superplots - useful for comparing between replicates

<div class="alert alert-block alert-danger">
Check that you are happy with your extra filtering before continuing.<br>
Run the next cell on the filtered dataframe or the unfiltered dataframe once you are ready.
</div>

In [None]:
# Outputs plots of all metrics for all factors
# cp.comparative_visualization_pipeline(comb_df, num_factors=DR_FACTORS) 

<div class="alert alert-block alert-success">
<h2>3. Definition of single timepoint behavioural clusters using UMAP and HDBSCAN</h2>
</div>

Here, you should pay attention to which factors you choose for dimensionality reduction. The following section provides some ways to aid that decision, to be combined with biological knowledge as to which factors are important

### Perform correlation analysis to understand which factors correlate to one another

This helps to avoid picking factors that are very similar to one another 

In [None]:
# Correlation heatmap over cp.ALL_FACTORS (the package-level factor list, not the DR_FACTORS list defined in this notebook).
df_in = comb_df
cp.correlation_matrix_heatmap(df_in, factors = cp.ALL_FACTORS)

### Optional: use variance thresholder for further insight

This works using scikit-learn's variance measurement

Importantly, you have to centre scale the data prior to doing this, in order to match the dimension reduction and cluster analysis you do later.

Several scaling methods are available. To use the combined log2 and minmax scaling (called 'choice'), which was used in the paper, you can automatically work out which features should be log2-transformed and then minmax-scaled, and which should simply be minmax-scaled.

Printouts detail why a decision was made, as well as histograms to show the scale, skew and spread of the data.

In [None]:
# First analyze your factors to optimize scaling
factor_analysis = cp.analyze_factors_for_choice_scaling(
    df=comb_df, 
    factors_list=DR_FACTORS,
    show_distributions=True
)



Then, you can apply the correct scaling to the metrics, and measure their variance contribution. Scikitlearn uses:

Variance = Σ(xᵢ - μ)² / (n - 1)

In [None]:
# Then apply variance threshold with optimized scaling
# The transform / no-transform lists come from the factor analysis in the previous code cell.
# chosen_dr_factors is modified in the next cell and reused later by contribution_to_clusters.
chosen_dr_factors = cp.variance_threshold(
    df_in=comb_df,
    threshold_value=0.03, 
    dr_factors=DR_FACTORS,
    scaling_method='choice',
    factors_to_transform=factor_analysis['suggested_to_transform'],
    factors_not_to_transform=factor_analysis['suggested_not_to_transform']
)

In [None]:
# I will remove rip_p and rip_K, only using Ripley's L as it is the most useful!
# list.remove() raises ValueError when the item is absent (for example if the variance
# threshold above already discarded one of them), so filter the list instead.
chosen_dr_factors = [
    factor for factor in chosen_dr_factors if factor not in ('rip_p', 'rip_K')
]

## NOTE: there are other scaling options available! This combo of log2 and minmax was what I used in the paper. Others include standardscaler, the most popular. Search 'center scaling' to find out more about them.

### Handling NaNs. 

There are NaNs in the data. Some are normal, at the start and end of tracks due to time windows not being large enough. But there were many more on top of this.
I found from the following analysis that there are sometimes missing consecutive frames in tracks, due to the way trackmate works. Run the following function to show that.

In [None]:
# Analyze individual tracks to see where NaNs come from
track_analysis = cp.analyze_individual_tracks_for_nans(comb_df, track_id_col='uniq_id', n_tracks_to_analyze=5)

In [None]:
# Investigate the NaN problem
investigation_results = cp.investigate_nan_causes(comb_df, verbose=True)

In [None]:
# Look at a single track in detail: the track that owns the first row of comb_df, in frame order.
_first_track_id = comb_df['uniq_id'].iloc[0]
single_track = comb_df[comb_df['uniq_id'] == _first_track_id].sort_values('frame')

_first_frame = single_track['frame'].min()
_last_frame = single_track['frame'].max()
# Every integer frame from the first to the last frame of the track, inclusive.
_expected_frames = set(range(int(_first_frame), int(_last_frame) + 1))

print("Track frames:", single_track['frame'].values[:20])
print("Track length:", len(single_track))
print("Frame range:", _first_frame, "to", _last_frame)
# A gap exists when the number of rows differs from the number of expected frames.
print("Has gaps:", len(_expected_frames) != len(single_track))

To fix gaps in your data, you can either interpolate positions across the gaps in each track ('fill') or 'split' tracks at the gaps so that the pieces become new IDs. If you do this, you need to go back to the beginning and redo the calculations on the new dataframe.

### Fix gaps with chosen method
comb_df_fixed = cp.fix_track_gaps(
    comb_df, 
    method='fill',  # 'fill', 'split', or 'auto'
    verbose=True
)



### Once there are no gaps in the data due to missed detections, there will still be a few gaps at the start and end of tracks due to the use of time windows.
This is a trade-off of using time windows: it gives the data much less bias, but loses a bit of data (3 frames at the start and end, in this case).
You can either drop those rows entirely or remove the cellPLATO migration factors, calculated over time windows.

In [None]:
# Handle NaNs in the DR factors using method='drop_rows' (see the list of methods below).
df_clean, factors_clean, report = cp.handle_nan_for_dr(
    comb_df, 
    DR_FACTORS, 
    method='drop_rows'
)

# Available methods:
# 'auto': Drop factors with >nan_threshold% NaN, then drop remaining NaN rows
# 'drop_rows': Drop all rows containing any NaN values in DR factors
# 'drop_factors': Drop all factors containing any NaN values
# 'analyze_only': Just analyze and report, don't modify data
# Other options:
# nan_threshold : float. For 'auto' method: percentage threshold for dropping factors (default 30%)
# verbose : bool. Print detailed information


In [None]:
# Start from the automatic scaling suggestions (replace these two lists to customise them).
factors_to_log_transform = factor_analysis['suggested_to_transform']
factors_minmax_only = factor_analysis['suggested_not_to_transform']

# Restrict both lists to the factors that survived NaN handling, keeping the suggested order.
factors_to_log_transform_clean = list(
    filter(lambda name: name in factors_clean, factors_to_log_transform)
)
factors_minmax_only_clean = list(
    filter(lambda name: name in factors_clean, factors_minmax_only)
)

print(f"Clean factors to log-transform: {len(factors_to_log_transform_clean)}")
print(f"Clean factors for minmax only: {len(factors_minmax_only_clean)}")

In [None]:
# Remove rip_p and rip_K from the log-transform list.
# The factor is spelled 'rip_K' (capital K) in MIGRATION_FEATURES; the previous lowercase
# 'rip_k' never matched, so rip_K was silently kept.
# NOTE(review): factors_minmax_only_clean is not filtered here, and the UMAP cell still passes
# the full DR_FACTORS list - confirm whether the Ripley factors should be dropped there too.
factors_to_log_transform_clean = [f for f in factors_to_log_transform_clean if f not in ['rip_p', 'rip_K']]

In [None]:
# Reset indices 
df_clean = df_clean.reset_index(drop=True)

In [None]:
# save the dataframe
# NOTE(review): this path is relative to the current working directory, unlike the other
# checkpoints in this notebook, which are prefixed with cp.SAVED_DATA_PATH. The load cell
# below uses the same relative path, so change both together if this is moved.
df_clean.to_csv('df_clean.csv', index=False)

In [None]:
# load the dataframe
df_clean = pd.read_csv('df_clean.csv')

## Perform UMAP and cluster analysis

Well separated clusters depend mostly on 1. the input factors and 2. the umap_nn setting

You can change both, depending on the nature of your data, in order to achieve a reasonable level of separation of clusters

In [None]:
###### User alterable parameters ######
tsne_perp=150 # NOTE(review): not passed to the call below, so it has no effect in this cell
umap_nn = 30 #umap nearest neighbours
min_dist = 0.0 #umap minimum distance (usually keep this at 0 or very low)
n_components = 3 # number of umap dimensions to calculate
#######################################

# NOTE(review): dr_factors is the full DR_FACTORS list, not factors_clean or chosen_dr_factors
# computed in earlier cells - confirm this is intended.
dr_df = cp.dr_pipeline_multiUMAPandTSNE(
    df_clean, 
    dr_factors=DR_FACTORS,
    n_components=n_components,
    umap_nn=umap_nn,
    min_dist=min_dist,
    scalingmethod='choice',  # Use choice scaling
    factors_to_transform=factors_to_log_transform_clean,      # Your custom list
    factors_not_to_transform=factors_minmax_only_clean,       # Your custom list
    do_tsne=False
)

In [None]:
# 3D scatter of the three UMAP columns, colored by condition.
cp.plot_3D_scatter(dr_df, 'UMAP1', 'UMAP2', 'UMAP3', colorby='condition', ticks=False, identifier='dr_df' + '_byCONDITION_',dotsize = 8, alpha=0.2, markerscale = 10) #color = label or condition  

In [None]:
# Show the rows that contain a NaN in any column, then drop those rows from dr_df.
display(dr_df[dr_df.isnull().any(axis=1)])
dr_df=dr_df.dropna()

### Second, identify clusters and exemplar cells using HDBSCAN

In [None]:
##### User adjustable parameters #####
min_cluster_size = 300
min_samples = 200
cluster_by = 'UMAPNDIM' # UMAPNDIM = default, clusters on UMAPs. NDIM = alternate, clusters on all dimensions
metric = 'euclidean' # See https://hdbscan.readthedocs.io/en/latest/api.html#hdbscan.HDBSCAN for options
#######################################

# Cluster dr_df; the call returns the labelled dataframe and a dataframe of exemplar cells.
lab_dr_df, exemplar_df=cp.hdbscan_clustering(dr_df, min_cluster_size=min_cluster_size, min_samples=min_samples, cluster_by=cluster_by,  metric=metric)

# Store a name on the dataframe object so it can be reused in the plot identifier below.
lab_dr_df.name='lab_dr_df'
name = lab_dr_df.name

# Checkpoint both tables (lab_dr_df is written again by the next cell).
lab_dr_df.to_csv(cp.SAVED_DATA_PATH + 'lab_dr_df.csv', index=False)
exemplar_df.to_csv(cp.SAVED_DATA_PATH + 'exemplar_df.csv', index=False)

cp.plot_3D_scatter(lab_dr_df, 'UMAP1', 'UMAP2', 'UMAP3', colorby='label', ticks=False, identifier=name + '_byCLUSTERID___',dotsize = 8, alpha=0.2, markerscale = 10) #color = label or condition   


In [None]:
# save the lab_dr_df to a csv file
lab_dr_df.to_csv(cp.SAVED_DATA_PATH + 'lab_dr_df.csv', index=False)

In [None]:
#checkpoint - load the lab_dr_df from the csv file
lab_dr_df = pd.read_csv(cp.SAVED_DATA_PATH + 'lab_dr_df.csv')

## Then plot the 'fingerprint' plot of percentage in each cluster per condition

In [None]:
# Build the cluster purity table from the labelled dataframe, show it, then draw the
# "fingerprint" plot from the same dataframe and table.
cluster_purity_df = cp.purity_pointsinclusterspercondition(lab_dr_df) 
display(cluster_purity_df)
f = cp.purityplot_percentcluspercondition(lab_dr_df, cluster_purity_df) 

### Optional: explore the clusters with interactive 3D plot

In [None]:
cp.interactive_plot_3D_UMAP(df=lab_dr_df,colorby = 'label', symbolby = 'Condition_shortlabel', what = ' AllTimeUMAPwithclusters') # TavgUMAPwithclusters

### Optional: all other conditions colored grey, chosen condition in color

In [None]:
# NOTE: `df` is a notebook-wide name that many later cells re-assign; here it aliases lab_dr_df.
df=lab_dr_df

condlist = df['Condition_shortlabel'].unique().tolist() #get unique list of conditions from df
print(condlist) # show the condition list
# chosen_condition = '' #specify a chosen condition from the list
chosen_condition = condlist[0] # or choose the first one
print(chosen_condition)

cp.interactive_plot_3D_UMAP_chosen_condition(df, chosen_condition, opacity_grey=0.3, marker_size_all=5,) #change opacity and marker size to suit the data

### Optional: make UMAP plots colored by metric contributors - the more intense the color, the higher the contribution of the metric to a cluster

In [None]:
# First one colors per metric
cp.plot_UMAP_subplots_coloredbymetricsorconditions(df_in=lab_dr_df, x= 'UMAP1', y= 'UMAP2', z = 'UMAP3', n_cols = 5, ticks=False, metrics = DR_FACTORS, scalingmethod='choice',
                                                   identifier='inferno', colormap='inferno', coloredbycondition = False, samplethedf = False)
#second one colors per condition
# cp.plot_UMAP_subplots_coloredbymetricsorconditions(df_in=tptlabel_dr_df, x= 'UMAP1', y= 'UMAP2', z = 'UMAP3', n_cols = 5, ticks=False, metrics = cp.ALL_FACTORS, scalingmethod='choice',
#                                                    identifier='inferno', colormap='inferno', coloredbycondition = True, samplethedf = False)

### Perform UMAP then HDBSCAN on the tavg_df

### at the moment, just do this step as it is needed for compatibility later on

In [None]:
# Placeholder 'tavg_label' column: random integers in [0, 5). It exists only for compatibility
# with later steps and carries no biological meaning.
# A fixed seed makes the placeholder, and every table saved from it, reproducible between runs.
_placeholder_rng = np.random.default_rng(seed=0)
lab_dr_df['tavg_label'] = _placeholder_rng.integers(0, 5, size=lab_dr_df.shape[0])
# Same object as lab_dr_df (no copy is made); later cells refer to it by this name.
lab_tavg_lab_dr_df = lab_dr_df

In [None]:
OVERWRITE_DATAFRAMES = True

# Note the file name: the table is written as 'lab_tavg_dr_df.csv', which is the file the next cell reads back.
if OVERWRITE_DATAFRAMES:
    lab_tavg_lab_dr_df.to_csv(cp.SAVED_DATA_PATH + 'lab_tavg_dr_df.csv', index=False)


In [None]:
# load that df
lab_tavg_lab_dr_df = pd.read_csv(cp.SAVED_DATA_PATH + 'lab_tavg_dr_df.csv')

## Quantify the plasticity 

This part counts how many times cells switch between those clusters we defined over their lifetime

The function now returns these cluster change metrics:

cum_n_changes, cum_n_labels (cumulative)
twind_n_changes, twind_n_labels (time-windowed)

In [None]:
# Count cluster changes per cell (adds the cumulative and time-windowed metrics described above),
# then checkpoint the result; later cells reload this CSV.
tptlabel_dr_df = cp.count_cluster_changes_with_tavg(lab_tavg_lab_dr_df)
tptlabel_dr_df.to_csv(cp.SAVED_DATA_PATH + 'tptlabel_dr_df.csv', index=False)

### Plots of plasticity

In [None]:
tptlabel_dr_df = pd.read_csv(cp.SAVED_DATA_PATH + 'tptlabel_dr_df.csv')

In [None]:
df=tptlabel_dr_df
# Raw string: '\_' is not a valid escape sequence, so the non-raw literal triggers an
# invalid-escape warning on recent Python versions. The value passed (a backslash followed
# by "_allcells") is unchanged.
cp.plot_plasticity_changes(df, identifier=r'\_allcells', maxy=4) #problem with NaNs in the data

In [None]:
df=tptlabel_dr_df
# NOTE(review): this identifier has no leading backslash, unlike the '\_allcells' identifier
# used by the neighbouring plasticity plots - confirm which form is intended.
cp.plot_plasticity_countplots(df, identifier='_allcells')

In [None]:
df=tptlabel_dr_df
# Raw string avoids the invalid '\_' escape-sequence warning; the value passed is unchanged.
cp.plot_cumulative_plasticity_changes_main(df, identifier=r'\_allcells', miny=None, maxy=None, t_window_multiplier = cp.T_WINDOW_MULTIPLIER, plotallcells = False)

# Disambiguate the clusters

### First, choose a number of exemplar cells to pick out from the exemplar cell list to display

In [None]:
# Choose a number of exemplars to display for each cluster
n=2
# Randomly keep at most n exemplar rows per cluster label. No seed is set, so the selection
# changes between runs. This overwrites exemplar_df; the full table was saved to
# 'exemplar_df.csv' by the clustering cell.
exemplar_df = exemplar_df.groupby('label').apply(lambda x: x.sample(min(n,len(x)))).reset_index(drop=True)

In [None]:
###################### No need to run these commented lines if you have already chosen the factors to use using this method above ########################

# # Get factor analysis results
# factor_analysis = cp.analyze_factors_for_choice_scaling(
#     df=comb_df, 
#     factors_list=DR_FACTORS,
#     show_distributions=True
# )

# # Use the suggestions
# factors_to_log_transform = factor_analysis['suggested_to_transform']
# factors_minmax_only = factor_analysis['suggested_not_to_transform']

# # Clean the factors
# factors_to_log_transform_clean = [f for f in factors_to_log_transform if f in factors_clean]
# factors_minmax_only_clean = [f for f in factors_minmax_only if f in factors_clean]

# Use in contribution_to_clusters
# NOTE(review): dr_factors is chosen_dr_factors here, whereas the UMAP cell was run with the
# full DR_FACTORS list - confirm the two are meant to differ.
top_dictionary, contributions_df_singletpoints, scaled_df = cp.contribution_to_clusters(
    df_in=tptlabel_dr_df,  
    howmanyfactors=3, 
    dr_factors=chosen_dr_factors,
    scalingmethod='choice',
    factors_to_log_transform_clean=factors_to_log_transform_clean,
    factors_minmax_only_clean=factors_minmax_only_clean
)

In [None]:
# NOTE: `df` is whatever an earlier cell last assigned (tptlabel_dr_df when the notebook is run top to bottom).
cp.plot_cluster_averages(top_dictionary, df, scaled_df)

In [None]:


# Table version of the cluster averages; uses the same `df` and scaled_df as the plot in the previous cell.
result_df = cp.create_cluster_averages_table(top_dictionary, df, scaled_df)


In [None]:
# you can't do this without the contours (segmentations), which you don't have here,
# so the call is left commented out, like the equivalent call in the trajectory section.
# It also needs `size`, which is not assigned until the trajectory-disambiguation cell
# (size=300), so running it at this point raised a NameError.

# size = 300
# cp.disambiguate_timepoint(df, exemplar_df, scaled_df, top_dictionary=top_dictionary, XYRange=size,boxoff=True, trajectory = False) 

In [None]:

tptlabel_dr_df = pd.read_csv(cp.SAVED_DATA_PATH + 'tptlabel_dr_df.csv')
exemplar_df = pd.read_csv(cp.SAVED_DATA_PATH + 'exemplar_df.csv')

## Then, to visualize single cells with many timepoints, select cells with lots of timepoints

Filter cells by trajectory length: Only keeps cells that have more than numberofdesiredtimepoints timepoints
Sample cells per cluster: Tries to get numberofcellspercluster cells from each cluster
Create exemplar datasets: Outputs both a filtered exemplar dataframe and full trajectory tracks for those cells

In [None]:
#### User inputs ####
whole_df = tptlabel_dr_df
# Track-length cut-off: the mean of the 'ntpts' column, truncated to an integer.
numberofdesiredtimepoints = int(whole_df['ntpts'].mean())
# numberofdesiredtimepoints = 200
numberofcellspercluster = 40
num_clusters_whole_dataset = len(whole_df['label'].unique())

# 70% of (cells per cluster x number of clusters), truncated to an integer.
_max_sampled_cells = numberofcellspercluster * num_clusters_whole_dataset
override = int(_max_sampled_cells * 0.7)
#####################

# Keyword arguments collected in one place so the settings are easy to read.
_filter_settings = {
    'whole_df': whole_df,
    'exemplar_df': exemplar_df,
    'numberofdesiredtimepoints': numberofdesiredtimepoints,
    'numberofcellspercluster': numberofcellspercluster,
    'override': override,
    'verbose': False,  # This eliminates most printouts
}
exemplar_df_filt, exemplar_cell_tracks_df = cp.filter_exemplars(**_filter_settings)

In [None]:
df=exemplar_cell_tracks_df
# cp.plot_cumulative_plasticity_changes_test2 is deprecated; use the small multiples version below.
# Raw string avoids the invalid '\_' escape-sequence warning; the value passed is unchanged.
cp.plot_cumulative_plasticity_changes_main(df, identifier=r'\_exemplars_only_3_df__', miny=None, maxy=None, t_window_multiplier = 1, plotallcells = False)

### Plot any factor as small multiples from the exemplars

In [None]:
df = exemplar_cell_tracks_df
whichcolumntoplot = 'label'

cp.plot_small_multiples(df, whichcolumntoplot)

<div class="alert alert-block alert-success">
<h2>4. Trajectory measurement: Damerau-Levenshtein</h2>
</div>

In [None]:
tptlabel_dr_df = pd.read_csv(cp.SAVED_DATA_PATH + 'tptlabel_dr_df.csv')

In [None]:
# count the number of points per unique id (uniq_id)
# NOTE(review): this counts rows in dr_df (the UMAP output from earlier), not in the
# tptlabel_dr_df loaded by the previous cell - confirm which table is intended.
temp_df = dr_df.copy()
numpoints_df = temp_df.groupby('uniq_id').size().reset_index(name='numpoints')


In [None]:
# histogram of numpoints
plt.hist(numpoints_df['numpoints'], bins=100)

### First filter the tptlabel_dr_df to include only a subset of data of similar timescale

In [None]:
# Bounds on the 'ntpts' column for the trajectory analysis.
# Series.between keeps both end values by default, so rows with ntpts == low or == high are retained.
low = 5
high = 60

tptlabel_dr_df_filt = tptlabel_dr_df[tptlabel_dr_df['ntpts'].between(low, high)]

### Verify that the filtered data reflects the total data

In [None]:
factorchoice = 'speed'

In [None]:
# Makes timeplots of the unfiltered and filtered data

f=cp.multi_condition_timeplot(tptlabel_dr_df, factorchoice)
f.show()
f=cp.multi_condition_timeplot(tptlabel_dr_df_filt, factorchoice)
f.show()

In [None]:
# Plot of differences for tavg_df, the time-averaged table built earlier in the notebook
# (i.e. before the ntpts filter above); compare it with the plot in the next cell.
f = cp.plots_of_differences_sns(tavg_df, factorchoice)
f.show()

In [None]:
# NOTE(review): this time-averages the unfiltered tptlabel_dr_df, although this section is meant
# to compare filtered with unfiltered data - confirm whether tptlabel_dr_df_filt was intended.
# It also calls cp.time_average, whereas the measurement section used cp.time_average_trackmate.
tavg_trajectory_df = cp.time_average(tptlabel_dr_df)
f = cp.plots_of_differences_sns(tavg_trajectory_df, factorchoice)
f.show()

### Perform Damerau-Levenshtein analysis

In [None]:
# Compute the edit-distance matrix on the filtered table.
# Other values listed for distancemetric in the trailing comment: fastdtw, mongeelkan.
df = tptlabel_dr_df_filt
distance_matrix_dameraulev = cp.calculate_edit_distances(df,distancemetric = 'dameraulev', print_interval=10000) #fastdtw # dameraulev # mongeelkan
print(distance_matrix_dameraulev.shape)

In [None]:
# Save the distance matrix
# np.save(cp.SAVED_DATA_PATH + 'distance_matrix_dameraulev.npy', distance_matrix_dameraulev)

### Perform a UMAP/HDBSCAN parameter sweep, and select plots

In [None]:
# Sweep: run cluster_sequences for every combination of the parameter grids below.

df = tptlabel_dr_df_filt

_n_neighbors_grid = [8, 10, 12]
_min_samples_grid = [5,8,10, 15, 30, 40]
_min_cluster_size_grid = [5,8,10, 15, 30, 40]

# product() varies the last grid fastest, i.e. the same visiting order as three nested loops.
for n_neighbors, min_samples, min_cluster_size in itertools.product(
    _n_neighbors_grid, _min_samples_grid, _min_cluster_size_grid
):
    print(f'min_samples = {min_samples}')
    print(f'min_cluster_size = {min_cluster_size}')
    print(f'n_neighbors = {n_neighbors}')
    # The result variable is overwritten on every iteration; only the last combination remains bound.
    tptlabel_dr_df_filt_clusteredtrajectories = cp.cluster_sequences(
        df,
        distance_matrix_dameraulev,
        do_umap=True,
        eps=0.1,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        n_neighbors=n_neighbors,
    )

In [None]:
'''Chosen UMAP and HDBSCAN parameters'''

# NOTE: min_cluster_size = 6 and n_neighbors = 5 are not among the values tried by the sweep cell above.
min_samples = 8
min_cluster_size = 6
n_neighbors = 5

df = tptlabel_dr_df_filt

print(f'min_samples = {min_samples}')
print(f'min_cluster_size = {min_cluster_size}')
print(f'n_neighbors = {n_neighbors}')
# This replaces whatever the sweep left in tptlabel_dr_df_filt_clusteredtrajectories.
tptlabel_dr_df_filt_clusteredtrajectories = cp.cluster_sequences(df, distance_matrix_dameraulev,
 do_umap=True, eps=0.1, min_samples=min_samples, min_cluster_size=min_cluster_size, n_neighbors=n_neighbors)

Get the fingerprint plot of trajectories

In [None]:
# Same purity table and fingerprint plot as for the single-timepoint clusters, but grouped by
# 'trajectory_id' instead of the default cluster label. This overwrites cluster_purity_df and f.
df = tptlabel_dr_df_filt_clusteredtrajectories

cluster_purity_df = cp.purity_pointsinclusterspercondition(df, cluster_label='trajectory_id') 
f = cp.purityplot_percentcluspercondition(df, cluster_purity_df, cluster_label='trajectory_id', dotsize = 30) 

 ### Disambiguate the trajectory clustered cells:
 1) Make an exemplar_df_trajectories containing example rows
 2) Get the full tracks from those rows and make exemplar_df_trajectories_fulltrack
 3) Disambiguate with exemplar_df_trajectories
 4) Plot multiples with exemplar_df_trajectories_fulltrack

In [None]:
df = tptlabel_dr_df_filt_clusteredtrajectories
exemplar_df_trajectories, exemplar_df_trajectories_fulltrack  = cp.make_exemplar_df_basedon_trajectories(df, cells_per_traj=6)

In [None]:
# full_tracks_df = pd.read_csv(cp.SAVED_DATA_PATH + 'full_tracks_df.csv')
# NOTE(review): FONT_SIZE and df are assigned here but not passed to the call below;
# confirm whether plot_trajectories reads a font size from elsewhere.
FONT_SIZE = 10
df = exemplar_df_trajectories_fulltrack
cp.plot_trajectories(df=exemplar_df_trajectories_fulltrack, global_y=True, global_x=True)

In [None]:
size=300 #


df= tptlabel_dr_df_filt_clusteredtrajectories 
exemp_df=exemplar_df_trajectories 

# NOTE(review): contributions are computed on the unfiltered tptlabel_dr_df while the plots below
# use df (the filtered, trajectory-clustered table), and no scalingmethod / factor lists are passed,
# unlike the earlier contribution_to_clusters call - confirm both are intended.
top_dictionary, contributions_df_singletpoints, scaled_df=cp.contribution_to_clusters(df_in=tptlabel_dr_df,  howmanyfactors=2, dr_factors= chosen_dr_factors) #BEFORE disambiguate_tavg(), then: lab_tavg_dr_df BEFORE disambiguate_timepoint(), then: #tptlabel_dr_df 
cp.plot_cluster_averages(top_dictionary, df, scaled_df)
result_df = cp.create_cluster_averages_table(top_dictionary, df, scaled_df)
# same as before - this can't be run (below, commented out) without the contours (segmentations), which you don't have here
# cp.disambiguate_timepoint(df, exemp_df, scaled_df, top_dictionary=top_dictionary, XYRange=size,boxoff=True, trajectory = True) 

# Percent fingerprint plot for cluster IDs per TRAJECTORY

In [None]:
# tptlabel_dr_df_filt_clusteredtrajectories = pd.read_csv(cp.SAVED_DATA_PATH + 'tptlabel_dr_df_filt_clusteredtrajectories_FINAL_10-12-2023.csv')

df = tptlabel_dr_df_filt_clusteredtrajectories
cp.fingerprintplot_clusters_per_trajectory(df)

# Plasticity of cells per trajectory

In [None]:
# Optional: replace the in-memory table with a previously saved, dated checkpoint.
# Nothing in this notebook writes this file, so it is loaded only when it exists;
# otherwise the table produced by the clustering cells above is kept.
_legacy_checkpoint = cp.SAVED_DATA_PATH + 'tptlabel_dr_df_filt_clusteredtrajectories_FINAL_10-17-2023.csv'
if os.path.exists(_legacy_checkpoint):
    tptlabel_dr_df_filt_clusteredtrajectories = pd.read_csv(_legacy_checkpoint)
else:
    print(f'Checkpoint not found, keeping the in-memory dataframe: {_legacy_checkpoint}')

In [None]:
df=tptlabel_dr_df_filt_clusteredtrajectories
cp.plasticity_per_trajectory(df)

In [None]:
df=tptlabel_dr_df_filt_clusteredtrajectories
# Raw string avoids the invalid '\_' escape-sequence warning; the value passed is unchanged.
cp.plot_plasticity_changes_trajectories(df, identifier=r'\_allcells', maxy=9 , t_window_multiplier = 1) #problem with NaNs in the data

# Animations of trajectories

In [None]:
df = tptlabel_dr_df_filt_clusteredtrajectories
cp.make_trajectory_animations(df, exemplar_df_trajectories, number_of_trajectories=2, colormode='cluster') # singlecluster, cluster, trajectory

Select a number of example cells from each trajectory ID to map back on to the data and display as stacks of PNGs

In [None]:
number_of_trajectories = 10 # Maximum number of example cells to draw for each trajectory_id

df = tptlabel_dr_df_filt_clusteredtrajectories

trajectory_ids = df['trajectory_id'].unique()

uniq_id_choices_list = []

for trajectory_id_choice in trajectory_ids:
    # for each trajectory_id, get the distinct uniq_ids that belong to it
    uniq_id_choices = np.unique(df[df['trajectory_id'] == trajectory_id_choice]['uniq_id'].values)
    # Draw without replacement so no cell is picked twice (np.random.choice samples with
    # replacement by default), and cap the draw at the number of cells actually available.
    n_to_draw = min(number_of_trajectories, len(uniq_id_choices))
    uniq_id_choices = np.random.choice(uniq_id_choices, n_to_draw, replace=False)
    # append each choice to a list
    uniq_id_choices_list.append(uniq_id_choices)
# flatten the list
chosen_uniq_ids = [item for sublist in uniq_id_choices_list for item in sublist]

print(chosen_uniq_ids)

In [None]:
df = tptlabel_dr_df_filt_clusteredtrajectories
cp.make_png_behaviour_trajectories(df,chosen_uniq_ids,XYRange = 300, follow_cell = False, invert=False)

In [None]:
df = tptlabel_dr_df_filt_clusteredtrajectories
cp.make_raw_cell_pngstacks(df,chosen_uniq_ids,XYRange = 220, follow_cell=False, invert=False)