# Main Focus of the Notebook: Filter Tracks, only keep tracks which are possible CME tracks 

In [1]:
from os import path
import pandas as pd
from IPython.display import display
from matplotlib import pyplot as plt
from skimage import io
import numpy as np 

plt.rcParams["font.family"] = ""

In [2]:
track_df = pd.read_pickle('/Users/apple/Desktop/Akamatsu_Lab/Lap_track/self/files/track_df_updated.pkl')

In [3]:
track_df.head()

Unnamed: 0,frame,index,amplitude,mu_x,mu_y,mu_z,sigma_x,sigma_y,sigma_z,frame_y,...,track_id,c2_amp,c2_peak,c2_peak_x,c2_peak_y,c2_peak_z,c3_mean_amp,c2_peak_mean,c2_voxel_sum,c3_voxel_sum
0,0,0,206.593249,11.0,12.0,0.0,2.0,2.0,4.0,0,...,0,122.947826,152.0,9.0,14.0,2.0,163.682353,125.5,11295,13913
1,0,1,191.0,36.0,88.0,1.0,2.0,1.0,2.0,0,...,1,116.233333,143.0,34.0,87.0,2.0,138.233333,114.826667,20068,19928
2,0,2,231.007679,40.0,141.0,1.0,2.0,2.0,2.0,0,...,2,122.33,142.0,41.0,142.0,0.0,153.94,122.853333,15316,22805
3,0,3,185.152538,40.0,58.0,1.0,2.0,1.0,3.0,0,...,3,117.626667,142.0,42.0,59.0,0.0,141.28,118.016667,14557,20358
4,0,4,202.0,46.0,73.0,1.0,2.0,2.0,3.0,0,...,4,115.112,141.0,45.0,74.0,3.0,147.768,115.502857,23194,21963


## Finding lengths of each of the track

In [4]:
lengths_df = track_df['track_id'].value_counts().reset_index()
lengths_df.columns = ['track_id', 'length']
lengths_df

Unnamed: 0,track_id,length
0,3344,63
1,212,60
2,286,51
3,7239,51
4,12615,49
...,...,...
18918,8179,1
18919,8180,1
18920,8182,1
18921,8189,1


## Finding channel 2 peak (c2_peak) and channel 3 peak (amplitude) for each track

In [5]:
#Group by 'track_id' and find the maximum value of 'c2_peak_max'
max_amp_per_track = track_df.groupby('track_id')[['c2_peak','amplitude']].max().reset_index()
filtered_tracks_1 = max_amp_per_track[max_amp_per_track['c2_peak'] > 170]
filtered_tracks_1

Unnamed: 0,track_id,c2_peak,amplitude
9,9,172.0,235.809569
17,17,201.0,226.502247
22,22,172.0,249.375311
27,27,181.0,254.000000
32,32,174.0,261.000000
...,...,...,...
18909,18909,188.0,268.000000
18911,18911,197.0,227.000000
18913,18913,200.0,228.000000
18916,18916,197.0,253.000000


## Merging the above dataframe with lengths dataframe

In [6]:
filtered_tracks_with_lengths = pd.merge(filtered_tracks_1,lengths_df, on='track_id')
filtered_tracks_with_lengths.columns = ['track_id', 'c2_peak', 'amplitude', 'length']
filtered_tracks_with_lengths.head()

Unnamed: 0,track_id,c2_peak,amplitude,length
0,9,172.0,235.809569,1
1,17,201.0,226.502247,8
2,22,172.0,249.375311,3
3,27,181.0,254.0,3
4,32,174.0,261.0,5


## Finding the starting frame of each track 

In [7]:
# Group by 'track_id' and find the minimum frame for each track
starting_frame_per_track = track_df.groupby('track_id')['frame'].min().reset_index()

starting_frame_per_track.columns = ['track_id', 'start_frame']
# Display the resulting DataFrame
starting_frame_per_track.head()

Unnamed: 0,track_id,start_frame
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


## Finding the ending frame of each dataframe

In [8]:
ending_frame_per_track = track_df.groupby('track_id')['frame'].max().reset_index()

ending_frame_per_track.columns = ['track_id', 'end_frame']
# Display the resulting DataFrame
ending_frame_per_track.head()

Unnamed: 0,track_id,end_frame
0,0,0
1,1,0
2,2,0
3,3,0
4,4,1


## Merging with the starting dataframe 
### Now we have peaks, length and starting frame 

In [9]:
filtered_tracks_with_start = pd.merge(filtered_tracks_with_lengths, starting_frame_per_track, on='track_id', how = 'inner')
filtered_tracks_with_start.head()

Unnamed: 0,track_id,c2_peak,amplitude,length,start_frame
0,9,172.0,235.809569,1,0
1,17,201.0,226.502247,8,0
2,22,172.0,249.375311,3,0
3,27,181.0,254.0,3,0
4,32,174.0,261.0,5,0


# Merging with the ending dataframe 
### Now we have peaks, length, starting frame, ending frame

In [10]:
filtered_tracks_with_start_end = pd.merge(filtered_tracks_with_start,ending_frame_per_track, on='track_id',how='inner')
filtered_tracks_with_start_end.head()

Unnamed: 0,track_id,c2_peak,amplitude,length,start_frame,end_frame
0,9,172.0,235.809569,1,0,0
1,17,201.0,226.502247,8,0,7
2,22,172.0,249.375311,3,0,2
3,27,181.0,254.0,3,0,2
4,32,174.0,261.0,5,0,4


## Created a temporary dataframe

In [11]:
temp = track_df[['frame', 'track_id', 'c2_peak']]
temp.head()

Unnamed: 0,frame,track_id,c2_peak
0,0,0,152.0
1,0,1,143.0
2,0,2,142.0
3,0,3,142.0
4,0,4,141.0


## Merged with the temporary dataframe to get the frame number at which c2_peak is achieved for each track

In [12]:
# Merge based on two columns ('column1' and 'column2')
final_df = pd.merge(filtered_tracks_with_start_end, temp, on=['track_id', 'c2_peak'], how='inner')

final_df.head()

Unnamed: 0,track_id,c2_peak,amplitude,length,start_frame,end_frame,frame
0,9,172.0,235.809569,1,0,0,0
1,17,201.0,226.502247,8,0,7,7
2,22,172.0,249.375311,3,0,2,0
3,27,181.0,254.0,3,0,2,0
4,32,174.0,261.0,5,0,4,2


## Now finding the time in terms of frames after which the peak of dynamin occurs 

In [13]:
final_df['peak_occur_c2'] = final_df['frame'] - final_df['start_frame']
final_df

Unnamed: 0,track_id,c2_peak,amplitude,length,start_frame,end_frame,frame,peak_occur_c2
0,9,172.0,235.809569,1,0,0,0,0
1,17,201.0,226.502247,8,0,7,7,7
2,22,172.0,249.375311,3,0,2,0,0
3,27,181.0,254.000000,3,0,2,0,0
4,32,174.0,261.000000,5,0,4,2,2
...,...,...,...,...,...,...,...,...
5215,18909,188.0,268.000000,1,129,129,129,0
5216,18911,197.0,227.000000,1,129,129,129,0
5217,18913,200.0,228.000000,1,129,129,129,0
5218,18916,197.0,253.000000,1,129,129,129,0


## Creating a temporary dataframe 

In [14]:
temp = track_df[['frame', 'track_id', 'amplitude']]
temp.head()

Unnamed: 0,frame,track_id,amplitude
0,0,0,206.593249
1,0,1,191.0
2,0,2,231.007679
3,0,3,185.152538
4,0,4,202.0


## Now finding the frame at which the peak for channel 3 occurs which is frame_y below 

In [15]:
final_df_2 = pd.merge(final_df, temp, on=['track_id', 'amplitude'], how='inner')

In [16]:
final_df_2

Unnamed: 0,track_id,c2_peak,amplitude,length,start_frame,end_frame,frame_x,peak_occur_c2,frame_y
0,9,172.0,235.809569,1,0,0,0,0,0
1,17,201.0,226.502247,8,0,7,7,7,7
2,22,172.0,249.375311,3,0,2,0,0,1
3,27,181.0,254.000000,3,0,2,0,0,2
4,32,174.0,261.000000,5,0,4,2,2,0
...,...,...,...,...,...,...,...,...,...
5231,18909,188.0,268.000000,1,129,129,129,0,129
5232,18911,197.0,227.000000,1,129,129,129,0,129
5233,18913,200.0,228.000000,1,129,129,129,0,129
5234,18916,197.0,253.000000,1,129,129,129,0,129


## Now finding the time in terms of frames after which the peak of clathrin occurs 

In [17]:
final_df_2['peak_occur_c3'] = final_df_2['frame_y'] - final_df_2['start_frame']

In [18]:
final_df_2.head()

Unnamed: 0,track_id,c2_peak,amplitude,length,start_frame,end_frame,frame_x,peak_occur_c2,frame_y,peak_occur_c3
0,9,172.0,235.809569,1,0,0,0,0,0,0
1,17,201.0,226.502247,8,0,7,7,7,7,7
2,22,172.0,249.375311,3,0,2,0,0,1,1
3,27,181.0,254.0,3,0,2,0,0,2,2
4,32,174.0,261.0,5,0,4,2,2,0,0


In [19]:
final_df_2.columns = ['track_id', 'c2_peak', 'amplitude','track_length', 
                      'start_frame', 'end_frame', 'peak_frame_c2', 'peak_start_c2', 'peak_frame_c3',
                     'peak_start_c3']

In [20]:
final_df_2.head()

Unnamed: 0,track_id,c2_peak,amplitude,track_length,start_frame,end_frame,peak_frame_c2,peak_start_c2,peak_frame_c3,peak_start_c3
0,9,172.0,235.809569,1,0,0,0,0,0,0
1,17,201.0,226.502247,8,0,7,7,7,7,7
2,22,172.0,249.375311,3,0,2,0,0,1,1
3,27,181.0,254.0,3,0,2,0,0,2,2
4,32,174.0,261.0,5,0,4,2,2,0,0


# FILTERS ARE APPLIED BELOW 

# Filter 1
## Drop all tracks which have dynamin peak within the first 3 frames 

In [21]:
filtered_final_df = final_df_2[(final_df_2['peak_start_c2'] > 3)]
filtered_final_df = filtered_final_df[(filtered_final_df['peak_start_c3'] > 3)]
filtered_final_df

Unnamed: 0,track_id,c2_peak,amplitude,track_length,start_frame,end_frame,peak_frame_c2,peak_start_c2,peak_frame_c3,peak_start_c3
1,17,201.0,226.502247,8,0,7,7,7,7,7
5,36,202.0,299.000000,34,0,33,17,17,25,25
17,73,224.0,345.835302,8,0,7,6,6,6,6
25,91,326.0,300.333333,21,0,20,19,19,17,17
32,105,173.0,308.000000,12,0,11,11,11,6,6
...,...,...,...,...,...,...,...,...,...,...
5028,17848,181.0,197.000000,7,121,127,126,5,127,6
5058,18006,185.0,221.635139,8,122,129,129,7,126,4
5082,18107,177.0,191.000000,7,123,129,129,6,127,4
5091,18143,187.0,226.485961,7,123,129,127,4,128,5


# Filter 2 
## Drop all tracks which have dynamin peak in last frame

In [22]:
filtered_final_df = filtered_final_df[filtered_final_df['end_frame'] != filtered_final_df['peak_frame_c2']]

In [23]:
filtered_final_df

Unnamed: 0,track_id,c2_peak,amplitude,track_length,start_frame,end_frame,peak_frame_c2,peak_start_c2,peak_frame_c3,peak_start_c3
5,36,202.0,299.000000,34,0,33,17,17,25,25
17,73,224.0,345.835302,8,0,7,6,6,6,6
25,91,326.0,300.333333,21,0,20,19,19,17,17
48,144,210.0,335.333333,18,0,17,13,13,17,17
49,145,181.0,258.333333,20,0,19,14,14,10,10
...,...,...,...,...,...,...,...,...,...,...
5001,17739,188.0,234.000000,10,120,129,126,6,124,4
5008,17791,188.0,202.000000,7,120,126,124,4,126,6
5028,17848,181.0,197.000000,7,121,127,126,5,127,6
5091,18143,187.0,226.485961,7,123,129,127,4,128,5


# Filter 3
## Drop all tracks which have clathrin peak in last frame

In [24]:
filtered_final_df = filtered_final_df[filtered_final_df['end_frame'] != filtered_final_df['peak_frame_c3']]

In [25]:
filtered_final_df

Unnamed: 0,track_id,c2_peak,amplitude,track_length,start_frame,end_frame,peak_frame_c2,peak_start_c2,peak_frame_c3,peak_start_c3
5,36,202.0,299.000000,34,0,33,17,17,25,25
17,73,224.0,345.835302,8,0,7,6,6,6,6
25,91,326.0,300.333333,21,0,20,19,19,17,17
49,145,181.0,258.333333,20,0,19,14,14,10,10
58,155,231.0,335.000000,12,0,11,4,4,7,7
...,...,...,...,...,...,...,...,...,...,...
4979,17656,203.0,213.000000,7,119,125,124,5,124,5
4987,17686,171.0,208.000000,9,119,127,126,7,125,6
5001,17739,188.0,234.000000,10,120,129,126,6,124,4
5091,18143,187.0,226.485961,7,123,129,127,4,128,5


In [26]:
filtered_final_df.columns = ['track_id', 'c2_peak', 'c3_peak', 'track_length', 'start_frame', 
                         'end_frame', 'peak_frame_c2', 'peak_start_c2', 'peak_frame_c3', 'peak_start_c3']

In [27]:
filtered_final_df

Unnamed: 0,track_id,c2_peak,c3_peak,track_length,start_frame,end_frame,peak_frame_c2,peak_start_c2,peak_frame_c3,peak_start_c3
5,36,202.0,299.000000,34,0,33,17,17,25,25
17,73,224.0,345.835302,8,0,7,6,6,6,6
25,91,326.0,300.333333,21,0,20,19,19,17,17
49,145,181.0,258.333333,20,0,19,14,14,10,10
58,155,231.0,335.000000,12,0,11,4,4,7,7
...,...,...,...,...,...,...,...,...,...,...
4979,17656,203.0,213.000000,7,119,125,124,5,124,5
4987,17686,171.0,208.000000,9,119,127,126,7,125,6
5001,17739,188.0,234.000000,10,120,129,126,6,124,4
5091,18143,187.0,226.485961,7,123,129,127,4,128,5


In [28]:
file_path = '/Users/apple/Desktop/Akamatsu_Lab/Lap_track/self/files/filtered_tracks.pkl'
filtered_final_df.to_pickle(file_path)

## Column Name meanings 
1. **start_frame**: the frame number at which the track starts 
2. **end_frame**: the frame number at which the track ends
3. **peak_frame_c2 or peak_frame_c3**: this is the frame number at which the peak occurs within a track. Note that this number is always between the start_frame and end_frame. 
4. **peak_start_c2 or peak_start_c3**: this is the (peak_frame_c2 - start_frame) or (peak_frame_c3 - start_frame). Basically this tells us that after a track started then after how many frames was the peak hit. It is used for filtering tracks  